├── CNAME ├── stop_server.sh ├── run_tests.sh ├── google_cloud_run_services ├── Makefile ├── docker │ ├── pangolin │ │ ├── sha256_grch37.txt │ │ ├── sha256_grch38.txt │ │ ├── requirements.txt │ │ └── Dockerfile │ └── spliceai │ │ ├── sha256_grch37.txt │ │ ├── sha256_grch38.txt │ │ ├── requirements.txt │ │ └── Dockerfile ├── database_admin.sh ├── README.md ├── create_pangolin_db.py ├── connect_to_db.sh ├── test_score_consistency.py ├── build_and_deploy.py └── server.py ├── icon.png ├── restart_server.sh ├── chm13v2-hg38.over.chain.gz ├── hg19ToHg38.over.chain.gz ├── hg38-chm13v2.over.chain.gz ├── hg38ToHg19.over.chain.gz ├── t2t-chm13-v1.0.hg38.over.chain.gz ├── t2t-chm13-v1.1.grch38.over.chain.gz ├── annotations ├── list_current_homo_sapiens_ensembl_dbs.sh ├── upload_annotations_to_server.sh ├── update_json_annotation_files.sh ├── convert_primate_ai_to_indexed_table.sh ├── update_pangolin_db_files.sh ├── README.md ├── update_SpliceAI_annotation_txt_files.sh ├── convert_SpliceAI_annotation_input_format_to_bed.py ├── combine_PrimateAI_scores_and_gene_threshold_tables.py ├── combine_score_tables.py ├── convert_gtf_to_SpliceAI_annotation_input_format.py └── generate_transcript_annotation_json.py ├── test_data ├── spliceai_scores.raw.snv.hg38_subset.vcf.gz ├── run_spliceai_on_test_vcf.sh ├── spliceai_scores.masked.snv.hg38_subset.vcf.gz ├── spliceai_scores.raw.indel.hg38_subset.vcf.gz ├── spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi ├── spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi ├── spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi └── test.vcf ├── temporarily_disable_liftover_rate_limit.py ├── start_server.sh ├── requirements.txt ├── start_local_server.sh ├── .github └── ISSUE_TEMPLATE │ └── issue-or-feature-request.md ├── LICENSE ├── .gitignore ├── test_spliceai.py ├── README.md └── server.py /CNAME: -------------------------------------------------------------------------------- 1 | spliceailookup.broadinstitute.org -------------------------------------------------------------------------------- /stop_server.sh: -------------------------------------------------------------------------------- 1 | pkill -9 gunicorn 2 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | python3.6 -m unittest test_spliceai 4 | -------------------------------------------------------------------------------- /google_cloud_run_services/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python3 build_and_deploy.py 3 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/icon.png -------------------------------------------------------------------------------- /restart_server.sh: -------------------------------------------------------------------------------- 1 | #redis-cli flushall 2 | 3 | kill -HUP $(pgrep gunicorn | head -n 1) 4 | -------------------------------------------------------------------------------- /chm13v2-hg38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/chm13v2-hg38.over.chain.gz -------------------------------------------------------------------------------- /hg19ToHg38.over.chain.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg19ToHg38.over.chain.gz -------------------------------------------------------------------------------- /hg38-chm13v2.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg38-chm13v2.over.chain.gz -------------------------------------------------------------------------------- /hg38ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg38ToHg19.over.chain.gz -------------------------------------------------------------------------------- /t2t-chm13-v1.0.hg38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/t2t-chm13-v1.0.hg38.over.chain.gz -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/sha256_grch37.txt: -------------------------------------------------------------------------------- 1 | sha256:bcc5a434b184f9dc986528b2aca9744e7c953d7f9fa340172eec8e3130f1e1b6 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/sha256_grch38.txt: -------------------------------------------------------------------------------- 1 | sha256:6e248671d1e83c5ea312a1a4bbc973838c6793a1922e238b9fbb060f7570247f 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/sha256_grch37.txt: -------------------------------------------------------------------------------- 1 | sha256:4905bed9a69fd3a967b7ee820026543b9e4d3f82707bb601e670e6cf83e60d4c 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/sha256_grch38.txt: -------------------------------------------------------------------------------- 1 | sha256:25275794ffc49033bcc6247441b14887e65a1a08e395aed97aea595eaef78fce 2 | -------------------------------------------------------------------------------- /t2t-chm13-v1.1.grch38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/t2t-chm13-v1.1.grch38.over.chain.gz -------------------------------------------------------------------------------- /annotations/list_current_homo_sapiens_ensembl_dbs.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | mysql -h useastdb.ensembl.org -u anonymous -e "show databases;" | grep -i homo_sapiens_core 3 | -------------------------------------------------------------------------------- /annotations/upload_annotations_to_server.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | gcloud compute scp gencode.v44* weisburd@spliceai-lookup:/home/weisburd/SpliceAI-lookup/annotations/ 4 | -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/run_spliceai_on_test_vcf.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | spliceai -R ~/p1/ref/GRCh38/hg38.fa -I test.vcf -O results.vcf -A ../annotations/gencode.v43.annotation.txt.gz 4 | 5 | -------------------------------------------------------------------------------- /test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask_cors 3 | flask-talisman==1.1.0 4 | gunicorn 5 | pandas==2.2.2 6 | biopython==1.83 7 | pyfastx==2.1.0 8 | 9 | # sql 10 | psycopg2==2.9.9 11 | -------------------------------------------------------------------------------- /temporarily_disable_liftover_rate_limit.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import time 3 | 4 | r = redis.Redis(host='localhost') 5 | 6 | while True: 7 | for key in r.keys("request *liftover*"): 8 | print("Deleting key: ", key.decode("UTF-8")) 9 | r.delete(key) 10 | time.sleep(1) 11 | -------------------------------------------------------------------------------- /start_server.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | #redis-cli flushall # clear all keys from redis 4 | while true 5 | do 6 | 7 | gunicorn -w 8 -t 1800 -b 0.0.0.0:80 -b 0.0.0.0:443 \ 8 | --keyfile=../spliceailookup-api.broadinstitute.org.key \ 9 | --certfile=../spliceailookup-api.broadinstitute.org.crt \ 10 | server:app 11 | 12 | done 13 | 
-------------------------------------------------------------------------------- /annotations/update_json_annotation_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | gencode_version=v44 3 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.gtf.gz; do 4 | log_path=process_$(echo $p | sed s/.gtf.gz//).log 5 | time python3 generate_transcript_annotation_json.py $p | tee -a ${log_path} 6 | done 7 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask_cors 3 | flask-talisman==1.1.0 4 | gunicorn 5 | 6 | # pangolin dependencies: 7 | gffutils==0.13 8 | biopython==1.83 9 | pyfastx==2.1.0 10 | PyVCF3>=1.0.3 11 | 12 | # sql 13 | psycopg2==2.9.9 14 | 15 | # Pangolin dependencies 16 | numpy==1.26.4 17 | pandas==2.2.2 18 | torch==2.2.1 19 | 20 | -------------------------------------------------------------------------------- /annotations/convert_primate_ai_to_indexed_table.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | python3 -c 'pd.read_table("PrimateAI_3D.hg19.txt.gz").sort_values(["chr", "pos"], ascending=[True, True]).to_csv("PrimateAI_3D.hg19.sorted.txt.gz", sep="\t", header=True, index=False)' 4 | gunzip -c PrimateAI_3D.hg19.sorted.txt.gz | bgzip > PrimateAI_3D.hg19.txt.gz 5 | tabix -S 1 -s 1 -b 2 -e 2 PrimateAI_3D.hg19.txt.gz 6 | tabix -S 1 -s 1 -b 2 -e 2 PrimateAI_3D.hg38.txt.gz 7 | 8 | -------------------------------------------------------------------------------- /annotations/update_pangolin_db_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | gencode_version=v44 4 | gzcat gencode.${gencode_version}lift37.basic.annotation.gtf.gz | sed 's/chr//g' | bgzip > gencode.${gencode_version}lift37.basic.annotation.without_chr_prefix.gtf.gz 5 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.without_chr_prefix.gtf.gz; do 6 | set -x 7 | python3 ~/code/Pangolin/scripts/create_db.py $p & 8 | set +x 9 | done 10 | 11 | wait 12 | -------------------------------------------------------------------------------- /google_cloud_run_services/database_admin.sh: -------------------------------------------------------------------------------- 1 | 2 | gcloud --project spliceai-lookup-412920 sql instances list 3 | 4 | # view settings 5 | gcloud --project spliceai-lookup-412920 sql instances describe spliceai-lookup-db 6 | 7 | # adjust DB settings 8 | # https://cloud.google.com/sql/docs/postgres/flags#gcloud 9 | gcloud --project spliceai-lookup-412920 sql instances patch spliceai-lookup-db --database-flags=max_connections=50 10 | 11 | # restart DB 12 | gcloud --project spliceai-lookup-412920 sql instances restart spliceai-lookup-db 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.16.1 2 | keras>=3.0.0 3 | spliceai @ git+https://github.com/bw2/SpliceAI # if the latest version isn't being installed, clone the repo and install from the local directory by running python3 setup.py install 4 | flask 5 | flask-cors 6 | flask-talisman 7 | gunicorn 8 | intervaltree 9 | markdown2 10 | pandas 11 | 
pysam 12 | redis 13 | # pangolin dependencies: 14 | gffutils 15 | biopython 16 | pyfastx 17 | PyVCF3>=1.0.3 18 | pangolin @ git+https://github.com/bw2/Pangolin # if the latest version isn't being installed, clone the repo and install from the local directory by running python3 setup.py install 19 | -------------------------------------------------------------------------------- /start_local_server.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | NUM_THREADS=1 # set this to the number of cores on your machine (or a bit less) 4 | HOST=127.0.0.1 # set this to 0.0.0.0 instead of 127.0.0.1 to allow access from other computers 5 | PORT=8080 # set this to a port number that is not already in use 6 | TIMEOUT=1800 # kill the server thread if it takes more than this many seconds to compute a response 7 | 8 | # clear the redis cache to avoid reusing outdated or incorrectly formatted SpliceAI responses 9 | redis-cli flushall 10 | 11 | # start the gunicorn server 12 | gunicorn -w ${NUM_THREADS} -t ${TIMEOUT} -b ${HOST}:${PORT} server:app 13 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | To generate or update the transcript annotation files needed for running SpliceAI and Pangolin: 2 | 3 | 1. Download the latest "basic" gene annotations in GTF format from Gencode for both [GRCh38](https://www.gencodegenes.org/human/) and GRCh37. 4 | 2. Update the Gencode version string at the top of these bash scripts, and then run them: 5 | - [update_json_annotation_files.sh](update_json_annotation_files.sh) 6 | - [update_SpliceAI_annotation_txt_files.sh](update_SpliceAI_annotation_txt_files.sh) 7 | - [update_pangolin_db_files.sh](update_pangolin_db_files.sh) 8 | 3. Update the GENCODE_VERSION string in [../server.py](../server.py) 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-or-feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue or Feature Request 3 | about: Issue or Feature Request 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | REQUIRED: If the issue is about a specific variant or results page (or you have an example), please copy-paste the variant here and/or provide a link to the results page where you see the issue. 11 | 12 | --- 13 | OPTIONAL: If you'd like to also share a screenshot: 14 | 15 | MacOS: Press Command+Shift+4, select a region of the page, then paste the image here. 16 | Windows: Press Windows Logo Key + PrtScn or Fn + Windows logo key + Space Bar. Then, in File Explorer, open the Pictures > Screenshots folder and drag the image here. 
17 | -------------------------------------------------------------------------------- /annotations/update_SpliceAI_annotation_txt_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # make sure annotation-utils is installed since it's a dependency of convert_gtf_to_SpliceAI_annotation_input_format.py 4 | python3 -m pip install git+https://github.com/bw2/annotation-utils 5 | 6 | gencode_version=v44 7 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.gtf.gz; do 8 | log_path=process_$(echo ${p} | sed s/.gtf.gz//).log 9 | time python3 generate_transcript_annotation_json.py ${p} | tee -a ${log_path} 10 | json_path=$(echo ${p} | sed 's/.gtf.gz/.transcript_annotations.json/') 11 | time python3 convert_gtf_to_SpliceAI_annotation_input_format.py -a ${json_path} ${p} | tee ${log_path} 12 | done 13 | -------------------------------------------------------------------------------- /google_cloud_run_services/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the [Google Cloud Run](https://cloud.google.com/run) implementation of SpliceAI and Pangolin web service APIs used by [spliceai-lookup.broadinstitute.org](https://spliceai-lookup.broadinstitute.org) (NOTE: `Cloud Run` is different from Google's `Cloud Functions` service). 2 | 3 | The `build_and_deploy.py` script includes the following commands for building docker images, updating gencode annotations, updating the SpliceAI-lookup Google Cloud Run services, and running tests: 4 | 5 | * **build** the docker images for the SpliceAI and Pangolin services 6 | * **update_annotations** download Gencode annotations and reprocess them into the formats used by SpliceAI and Pangolin 7 | * **deploy** the services to Google Cloud Run 8 | * **test** run the service locally using a `docker run` command 9 | * **test2** run the service locally using the heavier-weight `gcloud beta code dev` command which uses kubectl 10 | * **run** open an interactive shell inside the container 11 | To perform any of these operations, run `python3 build_and_deploy.py `. 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 bw2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-bullseye 2 | 3 | RUN apt update && apt-get install --no-install-recommends -y \ 4 | ca-certificates \ 5 | wget \ 6 | bzip2 \ 7 | unzip \ 8 | git \ 9 | libcurl4-openssl-dev \ 10 | libbz2-dev \ 11 | liblzma-dev \ 12 | zlib1g-dev 13 | 14 | RUN python3 -m pip install tensorflow==2.16.1 15 | 16 | RUN apt update && apt-get install --no-install-recommends -y build-essential libpq-dev 17 | 18 | COPY docker/spliceai/requirements.txt / 19 | RUN python3 -m pip install --upgrade -r /requirements.txt 20 | 21 | ARG RANDOM=2 22 | RUN python3 -m pip install https://github.com/bw2/SpliceAI/archive/refs/heads/master.zip 23 | 24 | ARG CONCURRENCY="2" 25 | ARG GENOME_VERSION="unknown" 26 | 27 | COPY docker/ref/GRCh${GENOME_VERSION} / 28 | COPY docker/spliceai/annotations/GRCh${GENOME_VERSION} / 29 | COPY server.py / 30 | 31 | ENV TF_CPP_MIN_LOG_LEVEL=3 32 | 33 | ENV PORT=8080 34 | ENV TOOL=spliceai 35 | ENV GENOME_VERSION=${GENOME_VERSION} 36 | ENV CONCURRENCY=${CONCURRENCY} 37 | ENV RUNNING_ON_GOOGLE_CLOUD_RUN=1 38 | 39 | CMD exec gunicorn --preload --bind :$PORT --workers ${CONCURRENCY} --threads 1 --timeout 0 server:app 40 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-bullseye 2 | 3 | RUN apt update && apt-get install --no-install-recommends -y \ 4 | ca-certificates \ 5 | wget \ 6 | bzip2 \ 7 | unzip \ 8 | git \ 9 | libcurl4-openssl-dev \ 10 | libbz2-dev \ 11 | liblzma-dev \ 12 | zlib1g-dev 13 | 14 | RUN python3 -m pip install torch==2.2.1 -f https://download.pytorch.org/whl/torch_stable.html 15 | 16 | RUN apt update && apt-get install --no-install-recommends -y build-essential libpq-dev 17 | 18 | COPY docker/pangolin/requirements.txt / 19 | RUN python3 -m pip install --upgrade -r /requirements.txt 20 | 21 | RUN git clone https://github.com/bw2/Pangolin.git \ 22 | && cd Pangolin \ 23 | && python3 -m pip install . 24 | 25 | ARG CONCURRENCY="2" 26 | ARG GENOME_VERSION="unknown" 27 | 28 | COPY docker/ref/GRCh${GENOME_VERSION} / 29 | COPY docker/pangolin/annotations/GRCh${GENOME_VERSION} / 30 | COPY server.py / 31 | 32 | ENV PORT=8080 33 | ENV TOOL=pangolin 34 | ENV GENOME_VERSION=${GENOME_VERSION} 35 | ENV CONCURRENCY=${CONCURRENCY} 36 | ENV RUNNING_ON_GOOGLE_CLOUD_RUN=1 37 | 38 | CMD exec gunicorn --preload --bind :$PORT --workers ${CONCURRENCY} --threads 1 --timeout 0 server:app 39 | -------------------------------------------------------------------------------- /test_data/test.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=20191004 3 | ##reference=GRCh38/hg38 4 | ##contig= 5 | ##contig= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | ##contig= 16 | ##contig= 17 | ##contig= 18 | ##contig= 19 | ##contig= 20 | ##contig= 21 | ##contig= 22 | ##contig= 23 | ##contig= 24 | ##contig= 25 | ##contig= 26 | ##contig= 27 | ##contig= 28 | ##INFO= 29 | #CHROM POS ID REF ALT QUAL FILTER INFO 30 | 1 69091 . A C . . SpliceAI=C|OR4F5|0.01|0.00|0.00|0.00|42|25|24|2 31 | 11 108301737 . CA TG . . . 
32 | -------------------------------------------------------------------------------- /google_cloud_run_services/create_pangolin_db.py: -------------------------------------------------------------------------------- 1 | ## This script was copied from the Pangolin repo (https://github.com/tkzeng/Pangolin) 2 | 3 | import argparse 4 | import gffutils 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("annotation_file", help="GTF file containing gene annotations. For example, from https://www.gencodegenes.org/") 8 | parser.add_argument("--filter", default="Ensembl_canonical", help="Only keep GTF features with the specified tags. Format: tag1,tag2,... or None to keep all features. Default: Ensembl_canonical") 9 | args = parser.parse_args() 10 | 11 | gtf = args.annotation_file 12 | if gtf.endswith(".gtf"): 13 | prefix = gtf[:-4] 14 | elif gtf.endswith(".gtf.gz"): 15 | prefix = gtf[:-7] 16 | else: 17 | exit("ERROR, annotation_file should be a GTF file.") 18 | 19 | def filter(feat): 20 | if feat.featuretype not in ["gene","transcript","exon"]: 21 | return False 22 | elif args.filter != "None" and feat.featuretype in ["transcript","exon"]: 23 | present = False 24 | for tag in args.filter.split(','): 25 | if "tag" in feat.attributes and tag in feat["tag"]: 26 | present = True 27 | if not present: 28 | return False 29 | return feat 30 | 31 | db = gffutils.create_db(gtf, prefix+".db", force=True, 32 | disable_infer_genes=True, disable_infer_transcripts=True, 33 | transform=filter) 34 | 35 | print("Database created: %s.db" % prefix) 36 | -------------------------------------------------------------------------------- /google_cloud_run_services/connect_to_db.sh: -------------------------------------------------------------------------------- 1 | #set -ex 2 | 3 | PGPASSWORD=$(cat .pgpass) psql -h 34.173.33.168 -d spliceai-lookup-db -U postgres -d spliceai-lookup-db 4 | 5 | 6 | # useful queries: 7 | 8 | # count variant consequences (counted once per variant) 9 | # select variant_consequence, count(*) as c from (select variant_consequence, variant from log where length(variant_consequence) > 1 group by variant_consequence, variant) log group by variant_consequence order by c desc; 10 | 11 | # count queries per ip per day 12 | # select ip, logtime::timestamp::date, MAX(event_name), count(*) as c from log group by ip, logtime::timestamp::date ORDER BY logtime::timestamp::date desc, c desc 13 | 14 | 15 | # check intergenic variants 16 | # select distinct variant, genome from log where variant_consequence = 'intergenic' and genome='38'; 17 | 18 | 19 | # compute % of queried variants that are splice-region 20 | # select c as splice_region_variants, d as total_variants, c::float/d::float as percent from ( select count(*) as c from ( select variant_consequence, variant from log where length(variant_consequence) > 1 group by variant_consequence, variant ) temp1 where variant_consequence = 'splice_region_variant' or variant_consequence = 'splice_donor_variant' or variant_consequence = 'splice_acceptor_variant' or variant_consequence = 'splice_polypyrimidine_tract_variant' or variant_consequence = 'splice_donor_region_variant' ) temp2 full outer join ( select count(*) as d from ( select variant from log where length(variant_consequence) > 1 group by variant_consequence, variant ) temp3 ) temp4 on 1=1; 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.vcf* 2 | *.txt* 
3 | *.tsv* 4 | *.json* 5 | 6 | service.backup_copy.yaml 7 | 8 | .pgpass 9 | .idea 10 | *.iml 11 | *.db 12 | *.txt.gz 13 | 14 | # Data 15 | *.fa.gz 16 | *.gz.fxi 17 | *.fa 18 | *.fai 19 | *.bed 20 | *.bed.gz 21 | *.gtf.gz 22 | *.tbi 23 | *.crt 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | target/ 100 | 101 | # Jupyter Notebook 102 | .ipynb_checkpoints 103 | 104 | # IPython 105 | profile_default/ 106 | ipython_config.py 107 | 108 | # pyenv 109 | .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 116 | #Pipfile.lock 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | -------------------------------------------------------------------------------- /annotations/convert_SpliceAI_annotation_input_format_to_bed.py: -------------------------------------------------------------------------------- 1 | """ 2 | The original SpliceAI annotation format is: 3 | 4 | #NAME CHROM STRAND TX_START TX_END EXON_START EXON_END 5 | OR4F5 1 + 69090 70008 69090, 70008, 6 | OR4F16 1 - 685715 686654 685715, 686654, 7 | ... 8 | 9 | Convert it to BED format so that it can be viewed in IGV. 
10 | """ 11 | 12 | 13 | import argparse 14 | import gzip 15 | import os 16 | import re 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("-n", type=int, help="Number of lines to process (for testing)") 21 | parser.add_argument("-o", "--output-prefix", help="Output prefix for genePred file") 22 | parser.add_argument("spliceai_annotation_table") 23 | args = parser.parse_args() 24 | 25 | if not args.output_prefix: 26 | args.output_prefix = re.sub("(.tsv|.txt)(.gz)?$", "", os.path.basename(args.spliceai_annotation_table)) 27 | 28 | output_path = f"{args.output_prefix}.bed" 29 | 30 | line_count = 0 31 | fopen = gzip.open if args.spliceai_annotation_table.endswith("gz") else open 32 | with fopen(args.spliceai_annotation_table, "rt") as f: 33 | with open(output_path, "wt") as out: 34 | header = f.readline().strip().split("\t") 35 | if header != ["#NAME", "CHROM", "STRAND", "TX_START", "TX_END", "EXON_START", "EXON_END"]: 36 | raise ValueError(f"Unexpected header: {header}") 37 | 38 | for i, line in enumerate(f): 39 | line_count += 1 40 | fields = line.strip().split("\t") 41 | if len(fields) != 7: 42 | raise ValueError(f"Expected 7 fields, got {len(fields)}: {fields}") 43 | 44 | name, chrom, strand, tx_start, tx_end, exon_starts, exon_ends = fields 45 | 46 | 47 | exon_starts = exon_starts.strip(",").split(",") 48 | exon_ends = exon_ends.strip(",").split(",") 49 | if exon_starts.count(",") != exon_ends.count(","): 50 | raise ValueError(f"Mismatch in the number of exon starts and ends: {fields}") 51 | 52 | exon_sizes = [str(int(end) - int(start)) for start, end in zip(exon_starts, exon_ends)] 53 | exon_starts = [str(int(start) - int(tx_start)) for start in exon_starts] 54 | 55 | score = item_rgb = "." 56 | out.write("\t".join([ 57 | chrom, tx_start, tx_end, name, score, strand, 58 | tx_start, tx_end, item_rgb, 59 | str(len(exon_sizes)), ",".join(exon_sizes), ",".join(exon_starts), 60 | ]) + "\n") 61 | 62 | if args.n is not None and i > args.n: 63 | break 64 | 65 | os.system(f"bgzip -f {output_path}") 66 | os.system(f"tabix -f {output_path}.gz") 67 | 68 | print(f"Wrote {line_count:,d} lines to {output_path}.gz") 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /annotations/combine_PrimateAI_scores_and_gene_threshold_tables.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import os 4 | import pandas as pd 5 | import subprocess 6 | import tqdm 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-g", "--gene-thresholds-csv", default="PrimateAI_3D.per_gene_percentile_thresholds.csv.gz", 11 | help="Input CSV file") 12 | parser.add_argument("-p", "--show-progress-bar", action="store_true", help="Show progress bar") 13 | 14 | parser.add_argument("scores_table", help="Table with precomputed PrimateAI_3D scores") 15 | args = parser.parse_args() 16 | 17 | if not os.path.isfile(args.gene_thresholds_csv): 18 | parser.error(f"Gene thresholds CSV file not found: {args.gene_thresholds_csv}") 19 | 20 | if not args.scores_table.endswith(".txt.gz"): 21 | parser.error("Scores table must have a .txt.gz extension") 22 | 23 | if not os.path.isfile(args.scores_table): 24 | parser.error(f"Scores file not found: {args.scores_table}") 25 | 26 | return args 27 | 28 | def main(): 29 | 30 | args = parse_args() 31 | 32 | # Load gene thresholds 33 | print(f"Parsing {args.gene_thresholds_csv}") 34 
| gene_thresholds_df = pd.read_csv(args.gene_thresholds_csv) 35 | transcript_to_percentile_threshold_map = dict( 36 | zip(gene_thresholds_df['Transcript'], gene_thresholds_df['PAI3D_Gene_Percentile_Threshold'])) 37 | 38 | print(f"Loaded {len(gene_thresholds_df):,d} gene thresholds") 39 | print(f"Parsing {args.scores_table}") 40 | with gzip.open(args.scores_table, "rt") as f, open(f"{args.scores_table}.unfinished", "wt") as out_f: 41 | header = next(f).strip().split("\t") 42 | transcript_id_index = 4 43 | percentile_index = 9 44 | assert header[transcript_id_index] == "gene_name" 45 | assert header[percentile_index] == "percentile_PAI3D" 46 | 47 | if args.show_progress_bar: 48 | f = tqdm.tqdm(f, unit=" lines", unit_scale=True) 49 | 50 | output_columns = [ 51 | "chrom", "pos", "ref", "alt", "PAI3D_percentile", "PAI3D_gene_threshold", 52 | ] 53 | out_f.write("\t".join(output_columns) + "\n") 54 | 55 | for i, line in enumerate(f): 56 | fields = line.strip().split("\t") 57 | output_row = fields[:4] 58 | transcript_id = fields[transcript_id_index] 59 | 60 | if transcript_id not in transcript_to_percentile_threshold_map: 61 | raise ValueError(f"Transcript ID {transcript_id} from {args.score_table} not found in {args.gene_thresholds_csv}") 62 | 63 | percentile = fields[percentile_index] 64 | output_row += [percentile, f"{float(transcript_to_percentile_threshold_map[transcript_id]):0.3f}"] 65 | out_f.write("\t".join(output_row) + "\n") 66 | 67 | output_table_path = args.scores_table.replace(".txt.gz", "") + ".with_gene_thresholds.txt.gz" 68 | subprocess.check_output(f"bgzip {args.scores_table}.unfinished", shell=True) 69 | subprocess.check_output(f"mv {args.scores_table}.unfinished.gz {output_table_path}", shell=True) 70 | subprocess.check_output(f"tabix -f -S 1 -s 1 -b 2 -e 2 {output_table_path}", shell=True) 71 | #subprocess.check_output(f"gsutil -m cp {output_table_path} {output_table_path}.tbi gs://spliceai-lookup-reference-data/", shell=True) 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /test_spliceai.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from server import SPLICEAI_ANNOTATOR, SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK, VariantRecord, parse_variant, process_variant 3 | 4 | 5 | class Test(unittest.TestCase): 6 | 7 | def test_parse_variant(self): 8 | self.assertEqual(parse_variant("chr3:12345 A>G"), ("3", 12345, "A", "G")) 9 | self.assertEqual(parse_variant("3:12345:A:G"), ("3", 12345, "A", "G")) 10 | self.assertEqual(parse_variant("chrX:12345:A:G"), ("X", 12345, "A", "G")) 11 | self.assertEqual(parse_variant("chrY:12345:A:G"), ("Y", 12345, "A", "G")) 12 | with self.assertRaises(ValueError): 13 | parse_variant("Z:12345:A:G") 14 | 15 | def test_spliceai_results(self): 16 | # from test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz 17 | # 1 69091 . A AA . . SpliceAI=AA|OR4F5|0.00|0.00|0.03|0.00|-15|42|2|24 18 | # 1 69124 . GATT G . . 
SpliceAI=G|OR4F5|0.00|0.02|0.00|0.06|18|9|27|-31 19 | 20 | variant = "1 69091 A AA" 21 | for distance in SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_DISTANCE - 1: 22 | result = process_variant(variant, "38", distance, SPLICEAI_DEFAULT_MASK) 23 | self.assertEqual(result['variant'], variant) 24 | self.assertEqual(result['chrom'], "1") 25 | self.assertEqual(result['pos'], 69091) 26 | self.assertEqual(result['ref'], "A") 27 | self.assertEqual(result['alt'], "AA") 28 | self.assertEqual(result['genome_version'], "38") 29 | self.assertEqual(result['source'], "lookup" if distance == SPLICEAI_DEFAULT_DISTANCE else "computed") 30 | self.assertListEqual(result['scores'], ["OR4F5|0.00|0.00|0.03|0.00|-15|42|2|24"]) 31 | 32 | variant = "1:69539:T:G" 33 | for masked in 0, 1: 34 | for distance in SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_DISTANCE - 1: 35 | result = process_variant(variant, "38", distance, masked) 36 | self.assertEqual(result['variant'], variant) 37 | self.assertEqual(result['chrom'], "1") 38 | self.assertEqual(result['pos'], 69539) 39 | self.assertEqual(result['ref'], "T") 40 | self.assertEqual(result['alt'], "G") 41 | self.assertEqual(result['genome_version'], "38") 42 | self.assertEqual(result['source'], "lookup" if distance == SPLICEAI_DEFAULT_DISTANCE else "computed") 43 | self.assertListEqual(result['scores'], ["OR4F5|0.00|0.01|0.11|0.29|20|-2|49|-2"] if not masked else ["OR4F5|0.00|0.00|0.11|0.00|20|-2|49|-2"]) 44 | 45 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179531962-C-A")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 46 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179532167-A-G")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 47 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179529170-GACAGTTAAGAATGTACCTTTGACAGGTACA-G")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 48 | 49 | -------------------------------------------------------------------------------- /annotations/combine_score_tables.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import os 4 | import tqdm 5 | 6 | chrom_to_index = {f"chr{i}": i for i in range(1, 23)} 7 | chrom_to_index["chrX"] = 23 8 | chrom_to_index["chrY"] = 24 9 | index_to_chrom = {i: v for v, i in chrom_to_index.items()} 10 | base_to_index = {c: i for i, c in enumerate("ACGT")} 11 | index_to_base = {i: c for c, i in base_to_index.items()} 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-r", "--genome-version", choices=["hg38", "hg19"], required=True) 15 | args = parser.parse_args() 16 | 17 | hg38_or_hg19 = args.genome_version 18 | 19 | ## Process hg38 tables 20 | all_keys = set() 21 | table1_lookup = {} 22 | table1_path = os.path.expanduser(f"~/code/SpliceAI-lookup/annotations/PrimateAI_3D.{hg38_or_hg19}.with_gene_thresholds.txt.gz") 23 | 24 | chrom_column = "chr" if hg38_or_hg19 == "hg38" else "chrom" 25 | ref_column = "non_flipped_ref" if hg38_or_hg19 == "hg38" else "ref" 26 | alt_column = "non_flipped_alt" if hg38_or_hg19 == "hg38" else "alt" 27 | percentile_column = "percentile_PAI3D" if hg38_or_hg19 == "hg38" else "PAI3D_percentile" 28 | gene_threshold_column = "PAI3D_Gene_Percentile_Threshold" if hg38_or_hg19 == "hg38" else "PAI3D_gene_threshold" 29 | 30 | print(f"Reading table #1 from {table1_path}") 31 | with gzip.open(table1_path, "rt") as f: 32 | header = next(f).strip().split("\t") 33 | header_indices = {c: 
i for i, c in enumerate(header)} 34 | counter = 0 35 | for line in tqdm.tqdm(f, unit=" lines", unit_scale=True, total=70_667_467): 36 | fields = line.rstrip().split("\t") 37 | chrom = fields[header_indices[chrom_column]] 38 | if "_" in chrom: 39 | # skip supercontigs 40 | continue 41 | 42 | key = ( 43 | chrom_to_index[chrom], 44 | int(fields[header_indices["pos"]]), 45 | base_to_index[fields[header_indices[ref_column]]], 46 | base_to_index[fields[header_indices[alt_column]]] 47 | ) 48 | all_keys.add(key) 49 | table1_lookup[key] = [ 50 | float(fields[header_indices[percentile_column]]), 51 | float(fields[header_indices[gene_threshold_column]]), 52 | ] 53 | counter += 1 54 | 55 | print(f"Parsed {counter:,d} records from table #1") 56 | 57 | table2_lookup = {} 58 | if hg38_or_hg19 == "hg38": 59 | table2_path = os.path.expanduser("~/code/SpliceAI-lookup/annotations/promoterAI_tss500.tsv.gz") 60 | else: 61 | table2_path = os.path.expanduser("~/code/SpliceAI-lookup/annotations/promoterAI_tss500_hg19.tsv.gz") 62 | 63 | print(f"Reading table #2 from {table2_path}") 64 | with gzip.open(table2_path, "rt") as f: 65 | header = next(f).strip().split("\t") 66 | header_indices = {c: i for i, c in enumerate(header)} 67 | counter = 0 68 | for line in tqdm.tqdm(f, unit=" lines", unit_scale=True, total=261_666_406): 69 | fields = line.rstrip().split("\t") 70 | chrom = fields[header_indices[chrom_column]] 71 | if "_" in chrom: 72 | # skip supercontigs 73 | continue 74 | 75 | key = ( 76 | chrom_to_index[chrom], 77 | int(fields[header_indices["pos"]]), 78 | base_to_index[fields[header_indices["ref"]]], 79 | base_to_index[fields[header_indices["alt"]]] 80 | ) 81 | all_keys.add(key) 82 | table2_lookup[key] = float(fields[header_indices["promoterAI"]]) 83 | counter += 1 84 | 85 | print(f"Parsed {counter:,d} records from table #2") 86 | output_path = f"PrimateAI_and_PromoterAI_scores.{hg38_or_hg19}.tsv" 87 | print(f"Writing output to {output_path}") 88 | with open(output_path, "wt") as f: 89 | f.write("\t".join([ 90 | "chrom", 91 | "pos", 92 | "ref", 93 | "alt", 94 | "PAI3D_percentile", 95 | "PAI3D_gene_threshold", 96 | "PromoterAI_score", 97 | ]) + "\n") 98 | 99 | for key in tqdm.tqdm(sorted(all_keys), unit=" records", unit_scale=True, total=len(all_keys)): 100 | chrom_index, pos, ref_index, alt_index = key 101 | percentile_PAI3D, PAI3D_gene_threshold = table1_lookup.get(key, (None, None)) 102 | promoterAI_score = table2_lookup.get(key, None) 103 | 104 | f.write("\t".join([ 105 | index_to_chrom[chrom_index], 106 | str(pos), 107 | index_to_base[ref_index], 108 | index_to_base[alt_index], 109 | f"{percentile_PAI3D:.3f}" if percentile_PAI3D is not None else "", 110 | f"{PAI3D_gene_threshold:.2f}" if PAI3D_gene_threshold is not None else "", 111 | f"{promoterAI_score:.3f}" if promoterAI_score is not None else "", 112 | ]) + "\n") 113 | 114 | os.system(f"bgzip {output_path}") 115 | os.system(f"tabix -f -S 1 -s 1 -b 2 -e 2 {output_path}.gz") 116 | 117 | #%% 118 | -------------------------------------------------------------------------------- /annotations/convert_gtf_to_SpliceAI_annotation_input_format.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import argparse 4 | import collections 5 | import gzip 6 | import json 7 | import os 8 | import pandas as pd 9 | import re 10 | 11 | from annotation_utils.gtf_utils import parse_gtf 12 | 13 | 14 | def main(): 15 | p = argparse.ArgumentParser(description="""This script takes a Gencode .gtf.gz file 16 | and outputs an 
annotation file which can be passed to SpliceAI instead of 17 | the default SpliceAI annotations which are still on Gencode v24.""") 18 | 19 | p.add_argument("-a", "--annotation-json-path", required=True, help="Path of the transcript annotations JSON file " 20 | "created by generate_transcript_annotation_json.py") 21 | p.add_argument("--gtf-id-field", default="transcript_id", choices=["transcript_id", "gene_id"]) 22 | p.add_argument("gtf_gz_path", help="Path of gene annotations file in GTF format") 23 | args = p.parse_args() 24 | 25 | for path in args.annotation_json_path, args.gtf_gz_path: 26 | if not os.path.exists(path): 27 | p.error(f"File not found: {path}") 28 | 29 | fopen = gzip.open if args.gtf_gz_path.endswith("gz") else open 30 | with fopen(args.annotation_json_path, "rt") as f: 31 | transcript_annotations = json.load(f) 32 | 33 | print(f"Parsing {args.gtf_gz_path}") 34 | gtf_id_to_exons = collections.defaultdict(set) 35 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), "exon"): 36 | key = (record[args.gtf_id_field], record["strand"], record["chrom"]) 37 | exon_tuple = (record["start"], record["end"]) 38 | if exon_tuple in gtf_id_to_exons[key]: 39 | raise ValueError(f"Duplicate exon: {exon_tuple} in transcript {key}") 40 | gtf_id_to_exons[key].add(exon_tuple) 41 | 42 | output_records = [] 43 | # SpliceAI predictions (prior to 'masking') depend only on transcript chrom/start/end/strand. Often, transcripts 44 | # within the same gene have the same chrom/start/end/strand and differ only in their internal exon structure. 45 | # We can discard these redundant transcripts (while making sure to keep all MANE Select and canonical transcripts). 46 | output_records_transcript_keys = set() 47 | maybe_output_records = [] 48 | for (gtf_id, strand, chrom), exon_set in gtf_id_to_exons.items(): 49 | tx_start_0based = min([start_1based - 1 for start_1based, _ in exon_set]) 50 | tx_end_1based = max([end_1based for _, end_1based in exon_set]) 51 | gtf_id_without_version = gtf_id.split(".")[0] 52 | if gtf_id_without_version not in transcript_annotations: 53 | print(f"WARNING: transcript {gtf_id_without_version} not found in {args.annotation_json_path}") 54 | continue 55 | transcript_annotation = transcript_annotations[gtf_id_without_version] 56 | if transcript_annotation['t_priority'] == "N": 57 | output_list = maybe_output_records 58 | else: 59 | output_list = output_records 60 | transcript_key = (chrom, strand, str(tx_start_0based), str(tx_end_1based)) 61 | output_records_transcript_keys.add(transcript_key) 62 | 63 | # if it's a MANE Select, MANE Plus Clinical or canonical transcript 64 | exon_list = sorted(list(exon_set)) 65 | exon_starts_0based = [start_1based - 1 for start_1based, _ in exon_list] 66 | exon_ends_1based = [end_1based for _, end_1based in exon_list] 67 | 68 | # reformat the records into a list which can be turned into a pandas DataFrame 69 | output_list.append({ 70 | "#NAME": gtf_id, 71 | "CHROM": chrom, 72 | "STRAND": strand, 73 | "TX_START": str(tx_start_0based), 74 | "TX_END": str(tx_end_1based), 75 | "EXON_START": ",".join([str(s) for s in exon_starts_0based]) + ",", 76 | "EXON_END": ",".join([str(s) for s in exon_ends_1based]) + ",", 77 | }) 78 | 79 | transcripts_kept_counter1 = len(output_records) 80 | transcripts_kept_counter2 = 0 81 | for output_record in maybe_output_records: 82 | transcript_key = (output_record["CHROM"], output_record["STRAND"], output_record["TX_START"], output_record["TX_END"]) 83 | if transcript_key not in output_records_transcript_keys: 
84 | # if this transcript has a chrom/start/end/strand that hasn't been seen before, add it to the output 85 | output_records_transcript_keys.add(transcript_key) 86 | output_records.append(output_record) 87 | transcripts_kept_counter2 += 1 88 | 89 | assert transcripts_kept_counter1 + transcripts_kept_counter2 == len(output_records) 90 | print(f"Kept {transcripts_kept_counter1:,d} transcripts which were MANE Select, MANE Plus Clinical or canonical.") 91 | print(f"Kept {transcripts_kept_counter2:,d} additional transcripts with unique transcript start/stop coords.") 92 | print(f"Discarded {len(maybe_output_records) - transcripts_kept_counter2:,d} out of {len(gtf_id_to_exons):,d} " 93 | f"({(len(maybe_output_records) - transcripts_kept_counter2) / len(gtf_id_to_exons):.1%}) transcripts " 94 | f"because they were redundant.") 95 | 96 | output_df = pd.DataFrame(output_records) 97 | output_df = output_df[["#NAME", "CHROM", "STRAND", "TX_START", "TX_END", "EXON_START", "EXON_END"]] 98 | output_path = re.sub(".gtf.gz$", "", os.path.basename(args.gtf_gz_path)) + ".txt.gz" 99 | output_df.to_csv(output_path, index=False, sep="\t") 100 | 101 | print(f"Wrote {len(output_df):,d} records to {os.path.abspath(output_path)}") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains: 2 | - client-side code for [spliceailookup.broadinstitute.org](https://spliceailookup.broadinstitute.org/) - contained within the [index.html](index.html) file and hosted via GitHub Pages. 3 | - server-side code for SpliceAI and Pangolin REST APIs - contained within the [google_cloud_run_services/](google_cloud_run_services/) subdirectory and hosted on Google Cloud Run. 4 | 5 | --- 6 | 7 | #### SpliceAI, Pangolin APIs 8 | 9 | 10 | NOTE: These APIs are intended for interactive use only, and do not support more than several requests per user per minute. More frequent queries will trigger a "rate limit" error in the response. To process large batches of variants, please set up and query your own local instance of the API server. This is easy to do using the publicly available docker images (see below for details). Alternatively, you can intall and run the underlying SpliceAI and/or Pangolin models directly on your local infrastructure. Their source code is available @ [https://github.com/bw2/SpliceAI](https://github.com/bw2/SpliceAI) and [https://github.com/bw2/Pangolin](https://github.com/bw2/Pangolin).
11 |
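For example, once a local instance is running (see the "Running Your Own Local API Server" section below), a batch of variants can be scored with a short script along these lines — a minimal sketch rather than part of the repo's tooling; the `localhost:8080` URL and the `hg`/`variant`/`distance`/`mask` parameters follow the documentation below, while the variant list and timeout are placeholders:

```python
# Minimal sketch: score a batch of variants against a local SpliceAI API instance
# started from one of the docker images described below (listening on localhost:8080).
import json
import requests

variants = ["chr8-140300616-T-G", "chr1-69091-A-C"]  # chrom-pos-ref-alt format

for variant in variants:
    response = requests.get(
        "http://localhost:8080/spliceai/",
        params={"hg": "38", "variant": variant, "distance": 50, "mask": 0},
        timeout=300,  # the first request can be slow while the model loads
    )
    response.raise_for_status()
    print(variant, json.dumps(response.json(), indent=2))
```

The same request pattern works against the public endpoints listed below, but those only support a few requests per user per minute.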
12 | 13 | The SpliceAI and Pangolin APIs have different base URLs for different genome versions: 14 | 15 | `https://spliceai-37-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=37&variant=` - SpliceAI for variants on GRCh37
16 | `https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=` - SpliceAI for variants on GRCh38
17 | `https://pangolin-37-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=37&variant=` - Pangolin for variants on GRCh37
18 | `https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=` - Pangolin for variants on GRCh38
19 | 20 | To query the API, append your variant of interest in `chrom-pos-ref-alt` format to the appropriate base URL above. 21 | 22 | For example, to get SpliceAI scores for `chr8-140300616-T-G`:
23 | 24 | *[https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=chr8-140300616-T-G](https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=chr8-140300616-T-G)* 25 | 26 | or to get Pangolin scores while also setting the `distance` and `mask` parameters:
27 | 28 | *[https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=chr8-140300616-T-G&distance=1000&mask=1](https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=chr8-140300616-T-G&distance=1000&mask=1)* 29 | 30 | #### API parameters 31 | 32 | Parameter descriptions: 33 | 34 | - **variant** (required) a variant in the format "chrom-pos-ref-alt" 35 | - **hg** (required) can be 37 or 38 36 | - **distance** (optional) distance parameter of SpliceAI model (default: 50) 37 | - **mask** (optional) can be 0 which means raw scores or 1 which means masked scores (default: 0). 38 | Splicing changes corresponding to strengthening annotated splice sites and weakening unannotated splice sites are typically much less pathogenic than weakening annotated splice sites and 39 | strengthening unannotated splice sites. When this parameter is = 1 (masked), the delta scores of such splicing changes are set to 0. SpliceAI developers recommend using raw (0) for alternative splicing analysis and masked (1) for variant interpretation. 40 | 41 | 42 | --- 43 | #### Running Your Own Local API Server 44 | 45 | If you have [docker](https://docs.docker.com/engine/install/) installed, you can easily start your own SpliceAI-lookup API server by running one of these commands (depending on which model and genome version you want to query): 46 | 47 | ``` 48 | docker run -p 8080:8080 docker.io/weisburd/spliceai-38:latest 49 | docker run -p 8080:8080 docker.io/weisburd/spliceai-37:latest 50 | docker run -p 8080:8080 docker.io/weisburd/pangolin-38:latest 51 | docker run -p 8080:8080 docker.io/weisburd/pangolin-37:latest 52 | ``` 53 | When it starts, it will print: 54 | ``` 55 | * Serving Flask app 'server' 56 | * Debug mode: on 57 | ``` 58 | 59 | Let's say you ran the `spliceai-38` instance. You should then be able to query it by, for example, opening http://localhost:8080/spliceai/?hg=38&variant=chr8-140300616-T-G in your browser. 60 | The docker container will initially print: 61 | ``` 62 | ERROR: Unable to connect to SQL database... 63 | WARNING:absl:No training configuration found... 64 | WARNING:tensorflow:... 65 | ``` 66 | but these messages can be ignored, and subsequent queries will run faster. 67 | 68 | 69 | If you would like to run your own API instance on Google Cloud instead of locally, see the [build_and_deploy.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/build_and_deploy.py#L224-L238) script which we use to deploy and update the SpliceAI-lookup API on [Google Cloud Run](https://cloud.google.com/run?hl=en). Submit a GitHub issue if you have any questions. 70 | 71 | --- 72 | #### Code Overview For Developers 73 | 74 | The [spliceailookup.broadinstitute.org](https://spliceailookup.broadinstitute.org) front-end is contained within [index.html](index.html). It uses ES6 javascript with [Semantic UI](https://semantic-ui.com) and [jQuery](https://en.wikipedia.org/wiki/JQuery). Also, it uses a [custom version of igv.js](https://github.com/bw2/igv.js) that includes new track types for visualizing the SpliceAI & Pangolin scores. The new server-side code is in the [google_cloud_run_services/](google_cloud_run_services/) subdirectory and includes Dockerfiles for building API server images, as well as the [build_and_deploy.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/build_and_deploy.py#L224-L238) script for deploying SpliceAI and Pangolin API services to [Google Cloud Run](https://cloud.google.com/run?hl=en). 
75 | The API server logic is in [google_cloud_run_services/server.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/server.py) and uses the [Flask](https://flask.palletsprojects.com/en/3.0.x) library. 76 | 77 | 78 | -------------------------------------------------------------------------------- /google_cloud_run_services/test_score_consistency.py: -------------------------------------------------------------------------------- 1 | """Use the cache to check if any scores have changed since the last time the scores were computed.""" 2 | 3 | import collections 4 | import configargparse 5 | import json 6 | import os 7 | import pandas as pd 8 | import psycopg2 9 | import re 10 | import requests 11 | import tqdm 12 | import time 13 | 14 | from contextlib import contextmanager 15 | 16 | @contextmanager 17 | def get_db_connection(): 18 | """Get a database connection""" 19 | #conn = DATABASE_CONNECTION_POOL.getconn() 20 | conn = psycopg2.connect( 21 | dbname="spliceai-lookup-db", 22 | user="postgres", 23 | password=os.environ.get("DB_PASSWORD"), 24 | host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 25 | port="5432", 26 | connect_timeout=5, 27 | ) 28 | 29 | try: 30 | yield conn 31 | finally: 32 | conn.close() 33 | 34 | @contextmanager 35 | def get_db_cursor(conn): 36 | """Get a database cursor""" 37 | cursor = conn.cursor() 38 | try: 39 | yield cursor 40 | conn.commit() 41 | finally: 42 | cursor.close() 43 | 44 | def run_sql(conn, sql_query, *params): 45 | with get_db_cursor(conn) as cursor: 46 | cursor.execute(sql_query, *params) 47 | try: 48 | results = cursor.fetchall() 49 | except: 50 | results = [] 51 | return results 52 | 53 | p = configargparse.ArgParser(default_config_files=["~/.spliceai_lookup_db_config"]) 54 | p.add_argument("--ip", required=True) 55 | p.add_argument("--user", required=True) 56 | p.add_argument("--password", required=True) 57 | p.add_argument("--db", default="spliceai-lookup-db") 58 | p.add_argument("-n", type=int, help="number of rows to query", default=1000) 59 | p.add_argument("-p", "--show-progress-bar", action="store_true") 60 | args, _ = p.parse_known_args() 61 | 62 | myip = requests.get("http://checkip.dyndns.com").text 63 | myip_match = re.search(r'Address: (\d+\.\d+\.\d+\.\d+)', myip) 64 | if myip_match: 65 | myip_match = myip_match.group(1) 66 | 67 | days_ago = 30 68 | conn = psycopg2.connect(f"dbname='{args.db}' user='{args.user}' host='{args.ip}' password='{args.password}'") 69 | #query = f"SELECT key, value, accessed FROM cache WHERE accessed < now() - INTERVAL '{days_ago} days' ORDER BY accessed ASC" 70 | #query = f"SELECT key, value, accessed FROM cache WHERE key LIKE 'pangolin%hg38%' AND accessed > now() - INTERVAL '{days_ago} days' ORDER BY accessed ASC" 71 | query = f"SELECT key, value, accessed FROM cache WHERE key LIKE 'pangolin%hg38%' ORDER BY accessed ASC" 72 | df = pd.read_sql_query(query, conn) 73 | print(f"Retrieved {len(df):,d} records from cache that were last accessed less than {days_ago} days ago.") 74 | if args.n: 75 | keep_every_kth_record = len(df)//args.n 76 | if keep_every_kth_record > 1: 77 | df = df[df.index % keep_every_kth_record == 0] 78 | print(f"Kept {len(df):,d} records after applying -n {args.n} arg") 79 | 80 | counter = collections.Counter() 81 | iterator = zip(df.key, df.value, df.accessed) 82 | if args.show_progress_bar: 83 | iterator = tqdm.tqdm(iterator, total=len(df), unit=" variants", unit_scale=True) 84 | 85 | for i, (cache_key, cache_value, last_accessed) 
in enumerate(iterator): 86 | print(f"{i+1:3,d}: Processing", cache_key, "which was last accessed on", last_accessed) 87 | data = json.loads(cache_value) 88 | 89 | if not data.get("scores"): 90 | print("ERROR: No scores found in cached value. Skipping...") 91 | continue 92 | 93 | tool = data["source"].split(":")[0] 94 | hg = data["genomeVersion"] 95 | distance = data["distance"] 96 | cache_key = cache_key.replace("__basic", "").replace("__comprehensive", "") 97 | assert cache_key[-2:] in ("m1", "m0") 98 | mask = cache_key[-1] 99 | variant = data["variant"] 100 | 101 | # get json response 102 | # time requests 103 | start_time = time.time() 104 | url = f"https://{tool}-{hg}-xwkwwwxdwq-uc.a.run.app/{tool}/?hg={hg}&distance={distance}&mask={mask}&variant={variant}&raw={variant}" 105 | # print(url) 106 | try: 107 | response_json = requests.get(f"{url}&force=1").json() 108 | except Exception as e: 109 | print(f"ERROR: {e} when retrieving {url} Skipping...") 110 | continue 111 | 112 | if not response_json.get("scores"): 113 | print(f"ERROR: {url} response doesn't contain scores: {response_json}. Skipping...") 114 | continue 115 | 116 | if myip_match: 117 | print(f"Deleting logs for ip {myip_match}") 118 | run_sql(conn, f"DELETE FROM log WHERE ip='{myip_match}'") 119 | 120 | 121 | elapsed_time = time.time() - start_time 122 | response_json["scores"] = list(sorted(response_json["scores"], key=lambda s: s.get("t_id"))) 123 | response_scores = response_json["scores"][0] 124 | 125 | data["scores"] = list(sorted(data["scores"], key=lambda s: s.get("t_id"))) 126 | cached_scores = data["scores"][0] 127 | 128 | if not response_scores.get("t_id") or response_scores.get("t_id") != cached_scores.get("t_id"): 129 | print("Transcript ids don't match:", response_scores.get("t_id"), "vs", cached_scores.get("t_id"), 130 | ". Skipping...") 131 | continue 132 | 133 | if not response_scores.get("g_id") or response_scores.get("g_id") != cached_scores.get("g_id"): 134 | print("Gene ids don't match:", response_scores.get("g_id"), "vs", cached_scores.get("g_id"), 135 | ". Skipping...") 136 | continue 137 | 138 | counter[f" {tool}"] += 1 139 | counter[f" hg{hg}"] += 1 140 | counter[f" m{mask}"] += 1 141 | 142 | missing_keys = set() 143 | mismatched_values = set() 144 | values_to_print = set() 145 | for k, v1 in cached_scores.items(): 146 | if k in ("t_refseq_ids", "t_id", "g_id", "g_name"): 147 | # differences in gene ids are not important 148 | continue 149 | 150 | if k not in response_scores: 151 | missing_keys.add(k) 152 | continue 153 | 154 | v2 = response_scores[k] 155 | try: 156 | diff = float(v1) - float(v2) 157 | except: 158 | diff = "?" 159 | 160 | values_to_print.add((k, v1, v2, diff)) 161 | if v1 != v2: 162 | mismatched_values.add((k, v1, v2, diff)) 163 | continue 164 | 165 | if missing_keys: 166 | print(f"ERROR: {cache_key} which was last accessed on {last_accessed} is missing keys: {missing_keys}. 
Response: {json.dumps(response_json, indent=1)}") 167 | 168 | if mismatched_values: 169 | counter["ERROR: mismatched_values"] += 1 170 | print(f"ERROR: {cache_key} which was last accessed on {last_accessed} has mismatched values for keys: " 171 | f"{', '.join(sorted([t[0] for t in mismatched_values]))} " 172 | f"with max delta_score_diff=" 173 | f"{max([abs(t[3]) for t in mismatched_values if t[0].startswith('DS')] or [None])} " 174 | f"and max raw_score_diff=" 175 | f"{max([abs(t[3]) for t in mismatched_values if t[0].startswith('S')] or [None])} ") 176 | 177 | for k, v1, v2, diff in sorted(values_to_print): 178 | print(f" {k}: {v1} vs {v2} diff: {diff}") 179 | 180 | #print(f" Cache: {json.dumps(data, indent=1)}") 181 | #print(f" Response: {json.dumps(response_json, indent=1)}") 182 | 183 | df = pd.read_sql_query(f"SELECT * FROM log WHERE variant='{variant}'", conn) 184 | print(f" Log:") 185 | print(df.to_string(index=False)) 186 | 187 | print(f"{i+1:3,d}: Done with", cache_key, f"elapsed_time={elapsed_time:.1f}s") 188 | 189 | conn.close() 190 | 191 | print(f"Done") 192 | 193 | print("Stats:") 194 | for key, value in sorted(counter.items()): 195 | print(f"{value:10,d} {key}") 196 | 197 | #%% 198 | -------------------------------------------------------------------------------- /annotations/generate_transcript_annotation_json.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | """This script creates a json file with annotation fields for each transcript id, including 4 | whether it's a "MANE select" transcript, canonical transcript, etc. 5 | """ 6 | 7 | import argparse 8 | import gzip 9 | import json 10 | import os 11 | import re 12 | 13 | from annotation_utils.get_ensembl_db_info import get_gene_id_to_canonical_transcript_id, \ 14 | get_ensembl_ENST_to_RefSeq_ids 15 | from annotation_utils.get_MANE_table import get_MANE_ensembl_transcript_table 16 | from annotation_utils.gtf_utils import parse_gtf 17 | 18 | 19 | # to get the latest database name, run: 20 | # mysql -h useastdb.ensembl.org -u anonymous -e "show databases;" | grep homo_sapiens_core 21 | DEFAULT_ENSEMBL_DATABASE = "homo_sapiens_core_115_38" 22 | 23 | # this is used to get the list of MANE select and MANE plus clinical ENST transcript ids. 24 | DEFAULT_MANE_URL_BASE = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.4" 25 | DEFAULT_MANE_SUMMARY_TABLE_FILENAME = "MANE.GRCh38.v1.4.summary.txt.gz" 26 | 27 | def main(): 28 | p = argparse.ArgumentParser(description="""This script takes a Gencode .gtf.gz file 29 | and outputs an annotation file which can be passed to SpliceAI instead of 30 | the default SpliceAI annotations which are still on Gencode v24. 
31 | """) 32 | p.add_argument("--mane-url-base", default=DEFAULT_MANE_URL_BASE) 33 | p.add_argument("--mane-summary-table-filename", default=DEFAULT_MANE_SUMMARY_TABLE_FILENAME) 34 | p.add_argument("-e", "--ensembl-database", default=DEFAULT_ENSEMBL_DATABASE) 35 | p.add_argument("gtf_gz_path", help="Path of gene annotations file in GTF format") 36 | args = p.parse_args() 37 | 38 | mane_summary_table_url = os.path.join(args.mane_url_base, args.mane_summary_table_filename) 39 | MANE_df = get_MANE_ensembl_transcript_table(mane_summary_table_url=mane_summary_table_url) 40 | 41 | print(f"Initalizing transcript priority annotation function") 42 | compute_transcript_priority = get_transcript_priority_annotation_function( 43 | ensembl_database=args.ensembl_database, MANE_df=MANE_df) 44 | 45 | esnembl_ENST_to_RefSeq_ids = get_ensembl_ENST_to_RefSeq_ids(database=args.ensembl_database) 46 | print(f"Downloaded {len(esnembl_ENST_to_RefSeq_ids):,d} ENST to RefSeq mappings") 47 | for key, refseq_ids in esnembl_ENST_to_RefSeq_ids.items(): 48 | esnembl_ENST_to_RefSeq_ids[key] = list(sorted(refseq_ids)) 49 | 50 | MANE_df["ensembl_ENST_without_version"] = MANE_df["Ensembl_nuc"].apply(lambda s: s.split(".")[0]) 51 | MANE_ensembl_ENST_to_RefSeq_id = dict(MANE_df[["ensembl_ENST_without_version", "RefSeq_nuc"]].itertuples(index=False)) 52 | print(f"Got {len(MANE_ensembl_ENST_to_RefSeq_id):,d} MANE ENST to RefSeq mappings, of which " 53 | f"{len(set(MANE_ensembl_ENST_to_RefSeq_id) - set(esnembl_ENST_to_RefSeq_ids)):,d} are unique.") 54 | MANE_ensembl_ENST_to_RefSeq_id = {k: [v] for k, v in MANE_ensembl_ENST_to_RefSeq_id.items()} 55 | esnembl_ENST_to_RefSeq_ids.update(MANE_ensembl_ENST_to_RefSeq_id) 56 | 57 | gene_coordinates_lookup = {} # used for checking coordinate consistency between gene and transcript records (eg. 
GPR143) 58 | max_transcript_coordinates_lookup = {} 59 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), feature_type="gene"): 60 | gene_id_without_version = record["gene_id"].split(".")[0] 61 | gene_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = (record["chrom"], record["start"], record["end"]) 62 | 63 | print(f"Parsing {args.gtf_gz_path}") 64 | output_json = {} 65 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), feature_type="transcript"): 66 | transcript_id_without_version = record["transcript_id"].split(".")[0] 67 | transcript_priority = compute_transcript_priority(transcript_id=transcript_id_without_version) 68 | refseq_transcript_ids = esnembl_ENST_to_RefSeq_ids.get(transcript_id_without_version) 69 | output_json[transcript_id_without_version] = { 70 | "g_name": record["gene_name"], 71 | "g_id": record["gene_id"], 72 | "t_id": record["transcript_id"], 73 | "t_type": record["transcript_type"], 74 | "t_strand": record["strand"], 75 | "t_priority": transcript_priority, 76 | "t_refseq_ids": refseq_transcript_ids, 77 | } 78 | 79 | gene_id_without_version = record["gene_id"].split(".")[0] 80 | if (record["gene_name"], gene_id_without_version) in max_transcript_coordinates_lookup:  # use the same (gene_name, gene_id) key as gene_coordinates_lookup 81 | transcript_chrom, transcript_start, transcript_end = max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] 82 | max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = ( 83 | transcript_chrom, min(transcript_start, record["start"]), max(transcript_end, record["end"])) 84 | else: 85 | max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = ( 86 | record["chrom"], record["start"], record["end"]) 87 | 88 | # check consistency of gene vs. transcript start/end coordinates 89 | warning1_counter = 0 90 | warning2_counter = 0 91 | for gene_name, gene_id_without_version in gene_coordinates_lookup: 92 | key = gene_name, gene_id_without_version 93 | gene_chrom, gene_start, gene_end = gene_coordinates_lookup[key] 94 | transcript_chrom, transcript_start, transcript_end = max_transcript_coordinates_lookup.get(key, (None, None, None)) 95 | if transcript_chrom is None: 96 | warning1_counter += 1 97 | print(f"WARNING: Gene {gene_name} ({gene_id_without_version}) has no transcript records") 98 | elif transcript_start > gene_start or transcript_end < gene_end: 99 | warning2_counter += 1 100 | #start_diff = f". Start diff is {transcript_start - gene_start:,d}bp" if transcript_start - gene_start else "" 101 | #end_diff = f". 
End diff is {gene_end - transcript_end:,d}bp" if gene_end - transcript_end else "" 102 | #print(f"WARNING: Gene {gene_name} ({gene_id_without_version}) has inconsistent coordinates: " 103 | # f"gene={gene_chrom}:{gene_start}-{gene_end}, " 104 | # f"transcript={transcript_chrom}:{transcript_start}-{transcript_end}" 105 | # f"{start_diff}{end_diff}") 106 | if warning1_counter > 0: 107 | print(f"WARNING: {warning1_counter:,d} genes don't have any transcript records") 108 | if warning2_counter > 0: 109 | print(f"WARNING: {warning2_counter:,d} out of {len(gene_coordinates_lookup):,d} genes have a genomic interval " 110 | f"that is wider than the interval of any of their transcripts.") 111 | 112 | output_path = re.sub(".gtf.gz$", "", os.path.basename(args.gtf_gz_path)) + ".transcript_annotations.json.gz" 113 | with gzip.open(output_path, "wt") as f: 114 | json.dump(output_json, f, indent=4, sort_keys=True) 115 | 116 | print(f"Done writing {len(output_json):,d} transcript annotations to {output_path}") 117 | 118 | 119 | def get_transcript_priority_annotation_function(ensembl_database, MANE_df): 120 | """Initializes annotation data and returns the compute_transcript_priority function.""" 121 | 122 | MANE_select_transcript_ids = { 123 | t_id.split(".")[0] for t_id in MANE_df[MANE_df["MANE_status"] == "MANE Select"]["Ensembl_nuc"]} 124 | print(f"Got {len(MANE_select_transcript_ids):,d} MANE select transcript ids") 125 | 126 | MANE_plus_clinical_transcript_ids = { 127 | t_id.split(".")[0] for t_id in MANE_df[MANE_df["MANE_status"] == "MANE Plus Clinical"]["Ensembl_nuc"]} 128 | print(f"Got {len(MANE_plus_clinical_transcript_ids):,d} MANE plus clinical transcript ids") 129 | 130 | gene_id_to_canonical_transcript_id = get_gene_id_to_canonical_transcript_id(database=ensembl_database) 131 | canonical_transcript_ids = { 132 | t_id.split(".")[0] for t_id in gene_id_to_canonical_transcript_id.values()} 133 | print(f"Got {len(canonical_transcript_ids):,d} canonical transcript ids") 134 | 135 | def compute_transcript_priority(transcript_id): 136 | """Returns a string indicating the priority of the given transcript. 
137 | The return value can be (in order from higher to lower priority): 138 | "MS" (for MANE select) 139 | "MP" (for MANE plus clinical) 140 | "C" (for canonical) 141 | "N" (for none of the above) 142 | """ 143 | transcript_id = transcript_id.split(".")[0] 144 | 145 | if transcript_id in MANE_select_transcript_ids: 146 | transcript_priority = "MS" 147 | elif transcript_id in MANE_plus_clinical_transcript_ids: 148 | transcript_priority = "MP" 149 | elif transcript_id in canonical_transcript_ids: 150 | transcript_priority = "C" 151 | else: 152 | transcript_priority = "N" 153 | 154 | return transcript_priority 155 | 156 | return compute_transcript_priority 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /google_cloud_run_services/build_and_deploy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import time 5 | 6 | import pandas as pd 7 | import re 8 | 9 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(message)s") 10 | 11 | VALID_COMMANDS = { 12 | "update_annotations", "build", "deploy", "test", "test2", "run", 13 | } 14 | 15 | GCLOUD_PROJECT = "spliceai-lookup-412920" 16 | DOCKERHUB_REPO = "docker.io/weisburd" 17 | 18 | def get_service_name(tool, genome_version): 19 | return f"{tool}-{genome_version}" 20 | 21 | def get_tag(tool, genome_version, repo_name="gcr.io"): 22 | if repo_name == "gcr.io": 23 | return f"us-central1-docker.pkg.dev/spliceai-lookup-412920/docker/{get_service_name(tool, genome_version)}" 24 | elif repo_name == "dockerhub": 25 | return f"{DOCKERHUB_REPO}/{get_service_name(tool, genome_version)}" 26 | else: 27 | raise ValueError(f"Invalid repo_name arg: {repo_name}") 28 | 29 | def run(c): 30 | logging.info(c) 31 | os.system(c) 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("-g", "--genome-version", choices=["37", "38"], help="If not specified, command will run for both GRCh37 and GRCh38") 36 | parser.add_argument("-t", "--tool", choices=["spliceai", "pangolin"], help="If not specified, command will run for both spliceai and pangolin") 37 | parser.add_argument("-d", "--docker-command", choices=["docker", "podman"], default="docker", help="Whether to use docker or podman to build the image") 38 | g = parser.add_mutually_exclusive_group() 39 | g.add_argument("--gencode-version", 40 | help="The gencode version to use for the 'update_annotations' command (example: 'v49'). Either this " 41 | "or --gencode-gtf must be specified for the 'update_annotations' command") 42 | g.add_argument("--gencode-gtf", 43 | help="Path of the newest 'basic' Gencode GTF file that was downloaded from " 44 | "https://www.gencodegenes.org/human/. Either this or --gencode-version must be specified for " 45 | "the 'update_annotations' command") 46 | 47 | parser.add_argument("command", nargs="?", choices=VALID_COMMANDS, 48 | help="Command to run. 
If not specified, it will run 'build' and then 'deploy'") 49 | 50 | args = parser.parse_args() 51 | 52 | if args.genome_version: 53 | genome_versions = [args.genome_version] 54 | else: 55 | genome_versions = ["38", "37"] 56 | 57 | if args.tool: 58 | tools = [args.tool] 59 | else: 60 | tools = ["spliceai", "pangolin"] 61 | 62 | if args.gencode_version: 63 | if not re.match("v[0-9][0-9]", args.gencode_version): 64 | parser.error("--gencode-version must be of the form 'v46'") 65 | gencode_version_number = int(args.gencode_version.lstrip("v")) 66 | else: 67 | gencode_version_number = None 68 | 69 | if args.command == "update_annotations": 70 | if not args.gencode_version and not args.gencode_gtf: 71 | parser.error("Either --gencode-version or --gencode-gtf must be specified for the update_annotations command") 72 | 73 | gencode_gtf_paths = {} 74 | if args.gencode_version: 75 | for genome_version in genome_versions: 76 | for basic_or_comprehensive in "", ".basic": 77 | if genome_version == "37": 78 | gencode_gtf_url = f"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_version_number}/GRCh37_mapping/gencode.{args.gencode_version}lift37{basic_or_comprehensive}.annotation.gtf.gz" 79 | elif genome_version == "38": 80 | gencode_gtf_url = f"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_version_number}/gencode.{args.gencode_version}{basic_or_comprehensive}.annotation.gtf.gz" 81 | else: 82 | parser.error(f"Invalid genome version: {genome_version}") 83 | 84 | run(f"wget -nc {gencode_gtf_url}") 85 | run(f"wget -nc https://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/gtfToGenePred") 86 | run(f"chmod 777 gtfToGenePred") 87 | gencode_gtf_paths[(genome_version, basic_or_comprehensive)] = os.path.basename(gencode_gtf_url) 88 | else: 89 | if not args.genome_version: 90 | parser.error("If --gencode-gtf is specified, --genome-version is required") 91 | if not os.path.isfile(args.gencode_gtf): 92 | parser.error(f"File not found: {args.gencode_gtf}") 93 | gencode_gtf_paths[(args.genome_version, "basic")] = args.gencode_gtf 94 | 95 | for genome_version, _ in gencode_gtf_paths.keys(): 96 | run(f"rm ./docker/ref/GRCh{genome_version}/gencode.*.basic.annotation.transcript_annotations.json.gz") 97 | run(f"rm ./docker/spliceai/annotations/GRCh{genome_version}/gencode.*.annotation*.txt.gz") 98 | run(f"rm ./docker/pangolin/annotations/GRCh{genome_version}/gencode.*.annotation*.db") 99 | 100 | for (genome_version, basic_or_comprehensive), gencode_gtf_path in gencode_gtf_paths.items(): 101 | # generate genePred files to use as gene tracks in IGV.js 102 | if args.gencode_version: 103 | gene_pred_path = f"gencode.{args.gencode_version}.GRCh{genome_version}.txt" 104 | run(f"./gtfToGenePred -genePredExt -geneNameAsName2 {gencode_gtf_path} {gene_pred_path}") 105 | 106 | print(f"Reading {gene_pred_path}") 107 | column_names = [ 108 | "name", 109 | "chrom", 110 | "strand", 111 | "txStart", 112 | "txEnd", 113 | "cdsStart", 114 | "cdsEnd", 115 | "exonCount", 116 | "exonStarts", 117 | "exonEnds", 118 | "score", 119 | "name2", 120 | "cdsStartStat", 121 | "cdsEndStat", 122 | "exonFrames", 123 | ] 124 | df = pd.read_table(gene_pred_path, names=column_names) 125 | df["txStart"] = df["txStart"].astype(int) 126 | df["txEnd"] = df["txEnd"].astype(int) 127 | filter_exp = (df["txStart"] > 0) & (df["txEnd"] > 0) 128 | df2 = df[filter_exp] 129 | if len(df) - len(df2) > 0: 130 | print(f"Filtered out {len(df) - len(df2):,d} records from {gene_pred_path}:") 131 | print(df[~filter_exp]) 132 | 
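# The block below sorts the genePred records by position, prepends a numeric index column ("i") per gene name,
# writes the table as a bgzipped, tabix-indexed file (chrom ends up in column 3 and txStart/txEnd in columns 5
# and 6, matching the "tabix -s 3 -b 5 -e 6" arguments), and copies it to the gs://tgg-viewer bucket so it can
# be loaded as a gene track in IGV.js.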
133 | df2 = df2.sort_values(["chrom", "txStart", "txEnd"]) 134 | df2["i"] = df2["name2"].map({name: i for i, name in enumerate(df2.name2.unique())}) 135 | df2 = df2[["i"] + column_names] 136 | sorted_gene_pred_path = gene_pred_path.replace(".txt", ".sorted.txt") 137 | df2.to_csv(sorted_gene_pred_path, header=False, index=False, sep="\t") 138 | run(f"bgzip -f {sorted_gene_pred_path}") 139 | run(f"tabix -s 3 -b 5 -e 6 -f {sorted_gene_pred_path}.gz") 140 | 141 | run(f"gsutil -m cp {sorted_gene_pred_path}.gz* gs://tgg-viewer/ref/GRCh{genome_version}/gencode_{args.gencode_version}/") 142 | 143 | # generate SpliceAI annotation files 144 | run(f"python3 ../annotations/generate_transcript_annotation_json.py {gencode_gtf_path}") 145 | output_json_path = gencode_gtf_path.replace(".gtf.gz", ".transcript_annotations.json.gz") 146 | run(f"python3 ../annotations/convert_gtf_to_SpliceAI_annotation_input_format.py -a {output_json_path} {gencode_gtf_path}") 147 | if not os.path.isfile(output_json_path): 148 | raise ValueError(f"Unable to find {output_json_path}") 149 | 150 | run(f"mv {output_json_path} ./docker/ref/GRCh{genome_version}/") 151 | run(f"mv {gencode_gtf_path.replace('.gtf.gz', '.txt.gz')} ./docker/spliceai/annotations/GRCh{genome_version}/") 152 | 153 | if genome_version == "37": 154 | gencode_gtf_path_without_chr_prefix = gencode_gtf_path.replace(".gtf.gz", ".without_chr_prefix.gtf.gz") 155 | run(f"gzcat {gencode_gtf_path} | sed 's/chr//g' | bgzip > {gencode_gtf_path_without_chr_prefix}") 156 | gencode_gtf_path = gencode_gtf_path_without_chr_prefix 157 | 158 | # generate Pangolin annotation files 159 | run(f"python3 create_pangolin_db.py {gencode_gtf_path}") 160 | run(f"mv {gencode_gtf_path.replace('.gtf.gz', '.db')} ./docker/pangolin/annotations/GRCh{genome_version}/") 161 | 162 | if args.gencode_version: 163 | with open("server.py", "rt") as f: 164 | server_py = f.readlines() 165 | 166 | updated_line = False 167 | with open("server.py", "wt") as f: 168 | for i, line in enumerate(server_py): 169 | if line.startswith("GENCODE_VERSION ="): 170 | new_gencode_line = f"GENCODE_VERSION = \"{args.gencode_version}\"" 171 | f.write(f"{new_gencode_line}\n") 172 | updated_line = True 173 | print(f"Updated server.py line #{i} to {new_gencode_line}") 174 | else: 175 | f.write(line) 176 | 177 | with open("../index.html", "rt") as f: 178 | index_html = f.readlines() 179 | 180 | updated_line = False 181 | with open("../index.html", "wt") as f: 182 | for i, line in enumerate(index_html): 183 | if "const GENCODE_VERSION = " in line: 184 | new_gencode_line = f"\tconst GENCODE_VERSION = \"{args.gencode_version}\"" 185 | f.write(f"{new_gencode_line}\n") 186 | updated_line = True 187 | print(f"Updated index.html line #{i} to {new_gencode_line}") 188 | else: 189 | f.write(line) 190 | 191 | if not updated_line: 192 | print("WARNING: Unable to find GENCODE_VERSION line in index.html") 193 | 194 | return 195 | 196 | if args.command == "test2": 197 | run(f"gcloud beta code dev") 198 | return 199 | 200 | if args.command in {"test", "run"}: 201 | if not args.genome_version: 202 | parser.error(f"--genome-version is required for the {args.command} command") 203 | if not args.tool: 204 | parser.error(f"--tool is required for the {args.command} command") 205 | 206 | tag = get_tag(args.tool, args.genome_version) 207 | 208 | if args.command == "run": 209 | print("Run this command: ") 210 | print(f"{args.docker_command} run -it {tag}:latest /bin/bash") 211 | elif args.command == "test": 212 | run(f"{args.docker_command} run -p 
8080:8080 {tag}:latest") 213 | 214 | return 215 | 216 | if not args.command or args.command in {"build", "deploy"}: 217 | if args.docker_command == "podman": 218 | print("WARNING: Google Cloud Run doesn't appear to work with images built using podman. " 219 | "Containers may fail to deploy to Google Cloud Run unless they are built using docker.") 220 | time.sleep(10) 221 | 222 | for genome_version in genome_versions: 223 | for tool in tools: 224 | tag = get_tag(tool, genome_version) 225 | dockerhub_tag = get_tag(tool, genome_version, repo_name="dockerhub") 226 | service = get_service_name(tool, genome_version) 227 | concurrency = 6 # if genome_version == '37' else 2 228 | min_instances = 0 # if tool == 'pangolin' else 2 229 | max_instances = 3 230 | if not args.command or args.command == "build": 231 | if args.docker_command == "podman": 232 | run(f"gcloud --project {GCLOUD_PROJECT} auth print-access-token | podman login -u oauth2accesstoken --password-stdin us-central1-docker.pkg.dev") 233 | 234 | run(f"{args.docker_command} build -f docker/{tool}/Dockerfile --build-arg=\"CONCURRENCY={concurrency}\" --build-arg=\"GENOME_VERSION={genome_version}\" -t {tag}:latest -t {dockerhub_tag}:latest .") 235 | run(f"{args.docker_command} push {tag}:latest") 236 | run(f"{args.docker_command} push {dockerhub_tag}:latest") 237 | 238 | run(f"{args.docker_command} pull {tag}:latest") 239 | run(f"{args.docker_command} inspect --format='{{{{index .RepoDigests 0}}}}' {tag}:latest | cut -f 2 -d @ > docker/{tool}/sha256_grch{genome_version}.txt") # record the image's sha256 240 | 241 | if not args.command or args.command == "deploy": 242 | with open(f"docker/{tool}/sha256_grch{genome_version}.txt") as f: 243 | sha256 = f.read().strip() 244 | 245 | if not re.match("^sha256:[a-f0-9]{64}$", sha256): 246 | raise ValueError(f"Invalid sha256 value found in docker/{tool}/sha256_grch{genome_version}.txt: {sha256}") 247 | 248 | print(f"Deploying {service} with image sha256 {sha256}") 249 | 250 | run(f"""gcloud \ 251 | --project {GCLOUD_PROJECT} beta run deploy {service} \ 252 | --image {tag}@{sha256} \ 253 | --min-instances {min_instances} \ 254 | --service-min-instances {min_instances} \ 255 | --max-instances {max_instances} \ 256 | --concurrency {concurrency} \ 257 | --service-account 1042618492363-compute@developer.gserviceaccount.com \ 258 | --execution-environment gen2 \ 259 | --region us-central1 \ 260 | --update-secrets=DB_PASSWORD=spliceai-lookup-db-password:2 \ 261 | --allow-unauthenticated \ 262 | --memory 4Gi \ 263 | --cpu 4 264 | """) 265 | 266 | # --add-volume=name=ref,type=cloud-storage,bucket=spliceai-lookup-reference-data,readonly=true \ 267 | # --add-volume-mount=volume=ref,mount-path=/ref \ 268 | 269 | if __name__ == "__main__": 270 | main() 271 | -------------------------------------------------------------------------------- /google_cloud_run_services/server.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import psycopg2 7 | import re 8 | import time 9 | import traceback 10 | 11 | 12 | # used for DB connection pooling 13 | from psycopg2.pool import SimpleConnectionPool 14 | from contextlib import contextmanager 15 | 16 | # flask imports 17 | from flask import Flask, g, request, Response, send_from_directory 18 | from flask_cors import CORS 19 | from flask_talisman import Talisman 20 | 21 | app = Flask(__name__) 22 | 23 | CORS(app) 24 | 25 | 26 | DEBUG = True # if 
socket.gethostname() == "spliceai-lookup" else True 27 | if not DEBUG: 28 | Talisman(app) 29 | 30 | logging.getLogger('werkzeug').disabled = True 31 | 32 | DEFAULT_DISTANCE = 500 # maximum distance between the variant and gained/lost splice site, defaults to 500 33 | MAX_DISTANCE_LIMIT = 10000 34 | DEFAULT_MASK = 0 # mask scores representing annotated acceptor/donor gain and unannotated acceptor/donor loss, defaults to 0 35 | 36 | SPLICEAI_EXAMPLE_URL = f"/spliceai/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G&bc=basic" 37 | PANGOLIN_EXAMPLE_URL = f"/pangolin/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G&bc=basic" 38 | 39 | 40 | VARIANT_RE = re.compile( 41 | "(chr)?(?P<chrom>[0-9XYMTt]{1,2})" 42 | "[-\s:]+" 43 | "(?P<pos>[0-9]{1,9})" 44 | "[-\s:]+" 45 | "(?P<ref>[ACGT]+)" 46 | "[-\s:>]+" 47 | "(?P<alt>[ACGT]+)" 48 | ) 49 | 50 | FASTA_PATH = { 51 | "37": "/hg19.fa.gz", 52 | "38": "/hg38.fa.gz", 53 | } 54 | 55 | GENCODE_VERSION = "v49" 56 | 57 | SHARED_TRANSCRIPT_ANNOTATIONS = {} 58 | SHARED_TRANSCRIPT_ANNOTATION_PATHS = { 59 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.transcript_annotations.json.gz", 60 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.transcript_annotations.json.gz", 61 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.transcript_annotations.json.gz", 62 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.transcript_annotations.json.gz", 63 | } 64 | 65 | TRANSCRIPT_PRIORITY_ORDER = { 66 | "MS": 3, # MANE select transcript 67 | "MP": 2, # MANE plus clinical transcript 68 | "C": 1, # canonical transcript 69 | "N": 0 70 | } 71 | 72 | TOOL = os.environ.get("TOOL") 73 | GENOME_VERSION = os.environ.get("GENOME_VERSION") 74 | if GENOME_VERSION not in ("37", "38"): 75 | raise ValueError(f'Environment variable "GENOME_VERSION" should be set to either "37" or "38" instead of: "{os.environ.get("GENOME_VERSION")}"') 76 | 77 | if TOOL == "spliceai": 78 | from spliceai.utils import Annotator, get_delta_scores 79 | 80 | class VariantRecord: 81 | def __init__(self, chrom, pos, ref, alt): 82 | self.chrom = chrom 83 | self.pos = pos 84 | self.ref = ref 85 | self.alts = [alt] 86 | 87 | def __repr__(self): 88 | return f"{self.chrom}-{self.pos}-{self.ref}-{self.alts[0]}" 89 | 90 | SPLICEAI_ANNOTATOR = {} 91 | SPLICEAI_ANNOTATION_PATHS = { 92 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.txt.gz", 93 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.txt.gz", 94 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.txt.gz", 95 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.txt.gz", 96 | } 97 | 98 | elif TOOL == "pangolin": 99 | from pkg_resources import resource_filename 100 | from pangolin.pangolin import process_variant as process_variant_using_pangolin 101 | from pangolin.model import torch, Pangolin, L, W, AR 102 | import gffutils 103 | 104 | PANGOLIN_ANNOTATION_PATHS = { 105 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.without_chr_prefix.db", 106 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.db", 107 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.without_chr_prefix.db", 108 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.db", 109 | } 110 | else: 111 | raise ValueError(f'Environment variable "TOOL" should be set to either "spliceai" or "pangolin" instead of: "{os.environ.get("TOOL")}"') 112 | 113 | 114 | RATE_LIMIT_ERROR_MESSAGE = ( 115 | f"Rate limit 
exceeded. This server only supports interactive use. To process large numbers of variants programmatically, " 116 | f"please install a local instance of the API server, or just run the prediction models directly. Attempts to query large " 117 | f"numbers of variants programmatically will result in loss of access to this API for an extended period of time. Contact " 118 | f"us at https://github.com/broadinstitute/SpliceAI-lookup/issues if you have any questions." 119 | ) 120 | 121 | 122 | def init_spliceai(genome_version, basic_or_comprehensive): 123 | 124 | if (genome_version, basic_or_comprehensive) not in SPLICEAI_ANNOTATOR: 125 | SPLICEAI_ANNOTATOR[(genome_version, basic_or_comprehensive)] = Annotator( 126 | FASTA_PATH[genome_version], 127 | SPLICEAI_ANNOTATION_PATHS[(genome_version, basic_or_comprehensive)] 128 | ) 129 | 130 | 131 | def init_transcript_annotations(genome_version, basic_or_comprehensive): 132 | if (genome_version, basic_or_comprehensive) in SHARED_TRANSCRIPT_ANNOTATIONS: 133 | return 134 | 135 | # init shared transcript annotations 136 | with gzip.open(SHARED_TRANSCRIPT_ANNOTATION_PATHS[(genome_version, basic_or_comprehensive)], "rt") as ta_f: 137 | SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive)] = json.load(ta_f) 138 | 139 | 140 | def error_response(error_message, source=None): 141 | response_json = {"error": str(error_message)} 142 | if source: 143 | response_json["source"] = source 144 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 145 | 146 | 147 | def parse_variant(variant_str): 148 | match = VARIANT_RE.match(variant_str) 149 | if not match: 150 | raise ValueError(f"Unable to parse variant: {variant_str}") 151 | 152 | return match['chrom'], int(match['pos']), match['ref'], match['alt'] 153 | 154 | 155 | #while True: 156 | # # https://groups.google.com/g/google-cloud-sql-discuss/c/mxsaf-YDrbA?pli=1 157 | # # https://cloud.google.com/sql/docs/postgres/flags#gcloud 158 | # 159 | # error_count = 0 160 | # try: 161 | # DATABASE_CONNECTION_POOL = SimpleConnectionPool( 162 | # minconn=1, 163 | # maxconn=5, 164 | # dbname="spliceai-lookup-db", 165 | # user="postgres", 166 | # password=os.environ.get("DB_PASSWORD"), 167 | # host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 168 | # port="5432", 169 | # connect_timeout=5, 170 | # ) 171 | # print(f"Successfully connected to database", flush=True) 172 | # break 173 | # except psycopg2.Error as e: 174 | # error_count += 1 175 | # time.sleep(2) 176 | # print(f"Error connecting to database: {e}", flush=True) 177 | # traceback.print_exc() 178 | # if error_count > 5: 179 | # print(f"Error connecting to database. 
Exiting...", flush=True) 180 | # sys.exit(1) 181 | 182 | 183 | @contextmanager 184 | def get_db_connection(): 185 | """Get a database connection""" 186 | #conn = DATABASE_CONNECTION_POOL.getconn() 187 | try: 188 | conn = psycopg2.connect( 189 | dbname="spliceai-lookup-db", 190 | user="postgres", 191 | password=os.environ.get("DB_PASSWORD"), 192 | host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 193 | port="5432", 194 | connect_timeout=5, 195 | ) 196 | except Exception as e: 197 | print(f"ERROR: Unable to connect to SQL database: {e}") 198 | conn = None 199 | 200 | try: 201 | yield conn 202 | finally: 203 | if conn is not None: 204 | conn.close() 205 | #DATABASE_CONNECTION_POOL.putconn(conn) 206 | 207 | @contextmanager 208 | def get_db_cursor(conn): 209 | """Get a database cursor""" 210 | if conn is None: 211 | return 212 | 213 | cursor = conn.cursor() 214 | try: 215 | yield cursor 216 | conn.commit() 217 | finally: 218 | cursor.close() 219 | 220 | 221 | def run_sql(conn, sql_query, *params): 222 | if conn is None: 223 | return 224 | 225 | with get_db_cursor(conn) as cursor: 226 | cursor.execute(sql_query, *params) 227 | try: 228 | results = cursor.fetchall() 229 | except: 230 | results = [] 231 | return results 232 | 233 | 234 | #def does_table_exist(table_name): 235 | # results = run_sql(f"SELECT EXISTS (SELECT 1 AS result FROM pg_tables WHERE tablename=%s)", (table_name,)) 236 | # does_table_already_exist = results[0][0] 237 | # return does_table_already_exist 238 | 239 | #if not does_table_exist("cache"): 240 | # print("Creating cache table") 241 | # run_sql("""CREATE TABLE cache (key TEXT UNIQUE, value TEXT, counter INT, accessed TIMESTAMP DEFAULT now())""") 242 | # run_sql("""CREATE INDEX cache_index ON cache (key)""") 243 | 244 | #if not does_table_exist("log"): 245 | # print("Creating event_log table") 246 | # run_sql("""CREATE TABLE log (event_name TEXT, ip TEXT, logtime TIMESTAMP DEFAULT now(), duration REAL, variant TEXT, genome VARCHAR(10), bc VARCHAR(20), distance INT, mask INT4, details TEXT, variant_consequence TEXT)""") 247 | # run_sql("""CREATE INDEX idx_log_ip_logtime ON log USING btree (ip, logtime DESC)""") 248 | # run_sql("""CREATE INDEX idx_log_event_name ON log USING btree (event_name)""") 249 | 250 | #if not does_table_exist("restricted_ips"): 251 | # print("Creating restricted_ips table") 252 | # run_sql("""CREATE TABLE restricted_ips (ip TEXT UNIQUE, created TIMESTAMP DEFAULT now())""") 253 | # run_sql("""CREATE INDEX idx_restricted_ips_created ON restricted_ips USING btree (created)""") 254 | 255 | # Query to add ip to the restricted_ips table 256 | #run_sql("""INSERT INTO restricted_ips (ip) VALUES ('210.3.222.157')""") 257 | 258 | def is_user_on_whitelist(conn, user_ip): 259 | """Check if the user is on the whitelist""" 260 | if conn is None or not user_ip: 261 | return False 262 | 263 | if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", user_ip): 264 | return False 265 | 266 | rows = run_sql(conn, "SELECT COUNT(ip) FROM whitelist_ips WHERE ip=%s", (user_ip,)) 267 | return rows and int(rows[0][0]) > 0 268 | 269 | def exceeds_rate_limit(conn, user_ip, params): 270 | """Rate limit requests based on user ip address""" 271 | 272 | #""" 273 | #SELECT * FROM log WHERE event_name like '%computed' AND duration > 2 AND ip='210.3.222.157' AND logtime >= NOW() - INTERVAL '5 minutes' ; 274 | #SELECT ip, count(*) FROM log WHERE event_name like '%computed' AND duration > 2 AND logtime >= NOW() - INTERVAL '20 minutes' GROUP BY ip ORDER BY count 
DESC; 275 | #""" 276 | 277 | try: 278 | if conn is None: 279 | return False 280 | 281 | if is_user_on_whitelist(conn, params.get("ip")): 282 | return False 283 | 284 | # check if the user has exceeded the rate limit or is on the list of restricted IPs 285 | rows = run_sql(conn, "SELECT COUNT(ip) FROM restricted_ips WHERE ip=%s AND created >= NOW() - INTERVAL '1 weeks'", (user_ip,)) 286 | is_user_currently_blocked = rows and int(rows[0][0]) > 0 287 | if is_user_currently_blocked: 288 | return RATE_LIMIT_ERROR_MESSAGE 289 | 290 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name LIKE %s AND ip=%s AND logtime >= NOW() - INTERVAL '7 minutes'", ("%computed%", user_ip)) 291 | did_user_exceed_rate_limit = rows and int(rows[0][0]) >= 50 292 | if did_user_exceed_rate_limit and not is_user_on_whitelist(conn, user_ip): 293 | # the user has exceeded the rate limit: computing scores for 50 or more variants in the last 7 minutes 294 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name='rate_limit_exceeded' AND ip=%s AND logtime >= NOW() - INTERVAL '5 minutes'", (user_ip,)) 295 | user_hit_rate_limit_exceeded_recently = rows and int(rows[0][0]) > 0 296 | if not user_hit_rate_limit_exceeded_recently: 297 | # the user will receive at most one "rate_limit_exceeded" event every 5 minutes 298 | log(conn, f"rate_limit_exceeded", ip=user_ip) 299 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name='rate_limit_exceeded' AND ip=%s AND logtime >= NOW() - INTERVAL '1 days'", (user_ip,)) 300 | user_triggered_too_many_rate_limit_exceeded_errors_today = rows and int(rows[0][0]) >= 5 301 | if user_triggered_too_many_rate_limit_exceeded_errors_today: 302 | # the user has hit the limit of 5 or more "rate_limit_exceeded" events during the last 24 hours 303 | rows = run_sql(conn, "SELECT COUNT(ip) FROM restricted_ips WHERE ip=%s", (user_ip,)) 304 | need_to_delete_previous_restricted_ip_record = rows and int(rows[0][0]) > 0 305 | if need_to_delete_previous_restricted_ip_record: 306 | # delete the previous record 307 | run_sql(conn, "DELETE FROM restricted_ips WHERE ip=%s", (user_ip,)) 308 | 309 | # block the user's IP for 1 week 310 | run_sql(conn, "INSERT INTO restricted_ips (ip) VALUES (%s)", (user_ip,)) 311 | 312 | return RATE_LIMIT_ERROR_MESSAGE 313 | 314 | except Exception as e: 315 | print(f"Error while checking rate limit: {e}", flush=True) 316 | # print traceback 317 | traceback.print_exc() 318 | return False 319 | 320 | 321 | def get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive="basic"): 322 | return f"{tool_name}__{variant}__hg{genome_version}__d{distance}__m{mask}__{basic_or_comprehensive}" 323 | 324 | 325 | def get_splicing_scores_from_cache(conn, tool_name, variant, genome_version, distance, mask, basic_or_comprehensive="basic"): 326 | results = {} 327 | key = get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive) 328 | try: 329 | rows = run_sql(conn, f"SELECT value FROM cache WHERE key=%s", (key,)) 330 | if rows: 331 | results = json.loads(rows[0][0]) 332 | results["source"] += ":cache" 333 | except Exception as e: 334 | print(f"Cache error: {e}", flush=True) 335 | 336 | return results 337 | 338 | 339 | def add_splicing_scores_to_cache(conn, tool_name, variant, genome_version, distance, mask, basic_or_comprehensive, results): 340 | key = get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive) 341 | try: 342 | 
results_string = json.dumps(results) 343 | 344 | run_sql(conn, 345 | r"""INSERT INTO cache (key, value, counter, accessed) VALUES (%s, %s, 1, now()) """ + 346 | r"""ON CONFLICT (key) DO """ + 347 | r"""UPDATE SET key=%s, value=%s, counter=cache.counter+1, accessed=now()""", (key, results_string, key, results_string)) 348 | except Exception as e: 349 | print(f"Cache error: {e}", flush=True) 350 | 351 | 352 | def get_spliceai_scores(variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param): 353 | try: 354 | chrom, pos, ref, alt = parse_variant(variant) 355 | except ValueError as e: 356 | return { 357 | "variant": variant, 358 | "source": "spliceai", 359 | "error": str(e), 360 | } 361 | 362 | # generate error message if variant falls outside annotated exons or introns 363 | record = VariantRecord(chrom, pos, ref, alt) 364 | try: 365 | scores = get_delta_scores( 366 | record, 367 | SPLICEAI_ANNOTATOR[(genome_version, basic_or_comprehensive_param)], 368 | distance_param, 369 | mask_param) 370 | except Exception as e: 371 | print(f"ERROR while computing SpliceAI scores for {variant}: {e}") 372 | traceback.print_exc() 373 | return { 374 | "variant": variant, 375 | "source": "spliceai", 376 | "error": f"{type(e)}: {e}", 377 | } 378 | 379 | if not scores: 380 | return { 381 | "variant": variant, 382 | "source": "spliceai", 383 | "error": f"The SpliceAI model did not return any scores for {variant}. This may be because the variant does " 384 | f"not overlap any exons or introns defined by the GENCODE '{basic_or_comprehensive_param}' annotation.", 385 | } 386 | 387 | #scores = [s[s.index("|")+1:] for s in scores] # drop allele field 388 | 389 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 390 | all_non_zero_scores = None 391 | all_non_zero_scores_strand = None 392 | all_non_zero_scores_transcript_id = None 393 | all_non_zero_scores_transcript_priority = -1 394 | max_delta_score_sum = 0 395 | for i, transcript_scores in enumerate(scores): 396 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 397 | continue 398 | 399 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 400 | 401 | # get json annotations for this transcript 402 | transcript_annotations = SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive_param)].get(transcript_id_without_version) 403 | if transcript_annotations is None: 404 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 405 | 406 | # add the extra transcript annotations from the json file to the transcript scores dict 407 | transcript_scores.update(transcript_annotations) 408 | 409 | # decide whether to use ALL_NON_ZERO_SCORES from this transcript 410 | current_transcript_priority = TRANSCRIPT_PRIORITY_ORDER[transcript_annotations["t_priority"]] 411 | current_delta_score_sum = sum(abs(float(transcript_scores[key])) for key in ("DS_AG", "DS_AL", "DS_DG", "DS_DL")) 412 | if current_transcript_priority > all_non_zero_scores_transcript_priority: 413 | max_delta_score_sum = current_delta_score_sum 414 | all_non_zero_scores_transcript_priority = current_transcript_priority 415 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 416 | all_non_zero_scores_strand = transcript_scores["t_strand"] 417 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 418 | 419 | elif current_transcript_priority == all_non_zero_scores_transcript_priority and current_delta_score_sum > 
max_delta_score_sum: 420 | # select the one with the highest delta score sum 421 | max_delta_score_sum = current_delta_score_sum 422 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 423 | all_non_zero_scores_strand = transcript_scores["t_strand"] 424 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 425 | 426 | for redundant_key in "ALLELE", "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 427 | del transcript_scores[redundant_key] 428 | 429 | return { 430 | "variant": variant, 431 | "genomeVersion": genome_version, 432 | "chrom": chrom, 433 | "pos": pos, 434 | "ref": ref, 435 | "alt": alt, 436 | "distance": distance_param, 437 | "mask": mask_param, 438 | "scores": scores, 439 | "source": "spliceai:model", 440 | "allNonZeroScores": all_non_zero_scores, 441 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 442 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 443 | } 444 | 445 | 446 | def get_pangolin_scores(variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param): 447 | if genome_version not in ("37", "38"): 448 | raise ValueError(f"Invalid genome_version: {mask_param}") 449 | 450 | if mask_param not in ("True", "False"): 451 | raise ValueError(f"Invalid mask_param: {mask_param}") 452 | 453 | if basic_or_comprehensive_param not in ("basic", "comprehensive"): 454 | raise ValueError(f"Invalid basic_or_comprehensive_param: {basic_or_comprehensive_param}") 455 | 456 | try: 457 | chrom, pos, ref, alt = parse_variant(variant) 458 | except ValueError as e: 459 | print(f"ERROR while parsing variant {variant}: {e}") 460 | traceback.print_exc() 461 | 462 | return { 463 | "variant": variant, 464 | "source": "pangolin", 465 | "error": str(e), 466 | } 467 | 468 | if len(ref) > 1 and len(alt) > 1: 469 | return { 470 | "variant": variant, 471 | "source": "pangolin", 472 | "error": f"Pangolin does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}", 473 | } 474 | 475 | class PangolinArgs: 476 | reference_file = FASTA_PATH[genome_version] 477 | distance = distance_param 478 | mask = mask_param 479 | score_cutoff = None 480 | score_exons = "False" 481 | 482 | pangolin_models = [] 483 | 484 | for i in 0, 2, 4, 6: 485 | for j in 1, 2, 3: 486 | model = Pangolin(L, W, AR) 487 | if torch.cuda.is_available(): 488 | model.cuda() 489 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i))) 490 | else: 491 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) 492 | model.load_state_dict(weights) 493 | model.eval() 494 | pangolin_models.append(model) 495 | 496 | features_db = gffutils.FeatureDB(PANGOLIN_ANNOTATION_PATHS[(GENOME_VERSION, basic_or_comprehensive_param)]) 497 | scores = process_variant_using_pangolin( 498 | 0, chrom, int(pos), ref, alt, features_db, pangolin_models, PangolinArgs) 499 | 500 | if not scores: 501 | return { 502 | "variant": variant, 503 | "source": "pangolin", 504 | "error": f"Pangolin was unable to compute scores for this variant", 505 | } 506 | 507 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 508 | all_non_zero_scores = None 509 | all_non_zero_scores_strand = None 510 | all_non_zero_scores_transcript_id = None 511 | max_delta_score_sum = 0 512 | for i, transcript_scores in enumerate(scores): 513 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 514 | continue 515 | 516 | transcript_id_without_version = 
transcript_scores.get("NAME", "").split(".")[0] 517 | 518 | # get json annotations for this transcript 519 | transcript_annotations = SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive_param)].get(transcript_id_without_version) 520 | if transcript_annotations is None: 521 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 522 | 523 | # add the extra transcript annotations from the json file to the transcript scores dict 524 | transcript_scores.update(transcript_annotations) 525 | 526 | # decide whether to use ALL_NON_ZERO_SCORES from this gene 527 | current_delta_score_sum = sum(abs(float(s.get("SG_ALT", 0)) - float(s.get("SG_REF", 0))) 528 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 529 | current_delta_score_sum += sum(abs(float(s.get("SL_ALT", 0)) - float(s.get("SL_REF", 0))) 530 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 531 | 532 | # return all_non_zero_scores for the transcript or gene with the highest delta score sum 533 | if current_delta_score_sum > max_delta_score_sum: 534 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 535 | all_non_zero_scores_strand = transcript_scores["STRAND"] 536 | all_non_zero_scores_transcript_id = transcript_scores["NAME"] 537 | max_delta_score_sum = current_delta_score_sum 538 | 539 | for redundant_key in "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 540 | del transcript_scores[redundant_key] 541 | 542 | return { 543 | "variant": variant, 544 | "genomeVersion": genome_version, 545 | "chrom": chrom, 546 | "pos": pos, 547 | "ref": ref, 548 | "alt": alt, 549 | "distance": distance_param, 550 | "mask": mask_param, 551 | "scores": scores, 552 | "source": "pangolin:model", 553 | "allNonZeroScores": all_non_zero_scores, 554 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 555 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 556 | } 557 | 558 | 559 | @app.route("/spliceai/", methods=['POST', 'GET']) 560 | def run_spliceai(): 561 | with get_db_connection() as conn: 562 | return run_splice_prediction_tool(conn, tool_name="spliceai") 563 | 564 | 565 | @app.route("/pangolin/", methods=['POST', 'GET']) 566 | def run_pangolin(): 567 | with get_db_connection() as conn: 568 | return run_splice_prediction_tool(conn, tool_name="pangolin") 569 | 570 | 571 | def run_splice_prediction_tool(conn, tool_name): 572 | """Handles API request for splice prediction 573 | 574 | Args: 575 | conn (psycopg2.connection): Database connection 576 | tool_name (str): "spliceai" or "pangolin" 577 | """ 578 | 579 | if tool_name != TOOL: 580 | return error_response(f"ERROR: This server is configured to run {TOOL} rather than {tool_name}.\n", source=tool_name) 581 | 582 | user_ip = get_user_ip(request) 583 | 584 | 585 | start_time = datetime.now() 586 | #logging_prefix = start_time.strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()} ip:{user_ip}" 587 | logging_prefix = f"t{os.getpid()} ip:{user_ip}" 588 | example_url = SPLICEAI_EXAMPLE_URL if tool_name == "spliceai" else PANGOLIN_EXAMPLE_URL 589 | 590 | # check params 591 | params = {} 592 | if request.values: 593 | params.update(request.values) 594 | 595 | if 'variant' not in params: 596 | params.update(request.get_json(force=True, silent=True) or {}) 597 | 598 | variant = params.get('variant', '') 599 | variant = variant.strip().strip("'").strip('"').strip(",") 600 | if not variant: 601 | return error_response(f'"variant" not specified.\n', source=tool_name) 602 | 603 | if not isinstance(variant, str): 604 | 
return error_response(f'"variant" value must be a string rather than a {type(variant)}.\n', source=tool_name) 605 | 606 | genome_version = params.get("hg") 607 | if not genome_version: 608 | return error_response(f'"hg" not specified. The URL must include an "hg" arg: hg=37 or hg=38. For example: {example_url}\n', source=tool_name) 609 | 610 | if genome_version not in ("37", "38"): 611 | return error_response(f'Invalid "hg" value: "{genome_version}". The value must be either "37" or "38". For example: {example_url}\n', source=tool_name) 612 | 613 | distance_param = params.get("distance", DEFAULT_DISTANCE) 614 | try: 615 | distance_param = int(distance_param) 616 | except Exception as e: 617 | return error_response(f'Invalid "distance": "{distance_param}". The value must be an integer.\n', source=tool_name) 618 | 619 | if distance_param > MAX_DISTANCE_LIMIT: 620 | return error_response(f'Invalid "distance": "{distance_param}". The value must be < {MAX_DISTANCE_LIMIT}.\n', source=tool_name) 621 | 622 | mask_param = params.get("mask", str(DEFAULT_MASK)) 623 | if mask_param not in ("0", "1"): 624 | return error_response(f'Invalid "mask" value: "{mask_param}". The value must be either "0" or "1". For example: {example_url}\n', source=tool_name) 625 | 626 | basic_or_comprehensive_param = params.get("bc", "basic") 627 | if basic_or_comprehensive_param not in ("basic", "comprehensive"): 628 | return error_response(f'Invalid "bc" value: "{basic_or_comprehensive_param}". The value must be either "basic" or "comprehensive". For example: {example_url}\n', source=tool_name) 629 | 630 | variant_consequence = params.get("variant_consequence") 631 | 632 | force = params.get("force") # ie. don't use cache 633 | 634 | print(f"{logging_prefix}: ======================", flush=True) 635 | print(f"{logging_prefix}: {variant} tool={tool_name} hg={genome_version}, distance={distance_param}, mask={mask_param}, bc={basic_or_comprehensive_param}", flush=True) 636 | 637 | if tool_name == "spliceai": 638 | init_spliceai(genome_version, basic_or_comprehensive_param) 639 | 640 | init_transcript_annotations(genome_version, basic_or_comprehensive_param) 641 | 642 | # check cache before processing the variant 643 | results = {} 644 | if not force: 645 | results = get_splicing_scores_from_cache(conn, tool_name, variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param) 646 | 647 | duration = (datetime.now() - start_time).total_seconds() 648 | if results: 649 | log(conn, f"{tool_name}:from-cache", ip=user_ip, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 650 | else: 651 | error_message = exceeds_rate_limit(conn, user_ip, params) 652 | if error_message: 653 | print(f"{logging_prefix}: {user_ip}: response: {error_message}", flush=True) 654 | return error_response(error_message, source=tool_name) 655 | 656 | try: 657 | if tool_name == "spliceai": 658 | results = get_spliceai_scores(variant, genome_version, distance_param, int(mask_param), basic_or_comprehensive_param) 659 | elif tool_name == "pangolin": 660 | pangolin_mask_param = "True" if mask_param == "1" else "False" 661 | results = get_pangolin_scores(variant, genome_version, distance_param, pangolin_mask_param, basic_or_comprehensive_param) 662 | else: 663 | raise ValueError(f"Invalid tool_name: {tool_name}") 664 | except Exception as e: 665 | traceback.print_exc() 666 | return error_response(f"ERROR: {e}", source=tool_name) 667 | 668 | 
duration = (datetime.now() - start_time).total_seconds() 669 | log(conn, f"{tool_name}:computed", ip=user_ip, duration=duration, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 670 | 671 | if "error" not in results: 672 | add_splicing_scores_to_cache(conn, tool_name, variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param, results) 673 | 674 | if "error" in results: 675 | log(conn, f"{tool_name}:error", ip=user_ip, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, details=results["error"], bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 676 | 677 | response_json = {} 678 | response_json.update(params) # copy input params to output 679 | response_json.update(results) 680 | 681 | response_log_string = ", ".join([f"{k}: {v}" for k, v in response_json.items() if not k.startswith("allNonZeroScores")]) 682 | print(f"{logging_prefix}: {variant} response took {str(datetime.now() - start_time)}: {response_log_string}", flush=True) 683 | 684 | return Response(json.dumps(response_json), status=200, mimetype='application/json', headers=[ 685 | ('Access-Control-Allow-Origin', '*'), 686 | ]) 687 | 688 | 689 | def log(conn, event_name, ip=None, duration=None, variant=None, genome=None, distance=None, mask=None, bc=None, details=None, variant_consequence=None): 690 | """Utility method for logging an event""" 691 | 692 | try: 693 | if duration is not None: duration = float(duration) 694 | if distance is not None: distance = int(distance) 695 | if mask is not None: mask = int(mask) 696 | except Exception as e: 697 | print(f"Error parsing log params: {e}", flush=True) 698 | return 699 | 700 | try: 701 | run_sql(conn, 702 | r"INSERT INTO log (event_name, ip, duration, variant, genome, distance, mask, bc, details, variant_consequence) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", 703 | (event_name, ip, duration, variant, genome, distance, mask, bc, details, variant_consequence)) 704 | except Exception as e: 705 | print(f"Log error: {e}", flush=True) 706 | 707 | 708 | def get_user_ip(request): 709 | return request.environ.get("HTTP_X_FORWARDED_FOR") 710 | 711 | 712 | @app.route('/log//', strict_slashes=False) 713 | def log_event(name): 714 | 715 | if name != "show_igv": 716 | message = f"Log error: invalid event name: {name}" 717 | print(message, flush=True) 718 | return error_response(f"ERROR: {message}") 719 | 720 | # check params 721 | params = {} 722 | if request.values: 723 | params.update(request.values) 724 | if not params: 725 | params.update(request.get_json(force=True, silent=True) or {}) 726 | 727 | variant = params.get("variant") 728 | genome_version = params.get("hg") 729 | distance_param = params.get("distance") 730 | mask_param = params.get("mask") 731 | basic_or_comprehensive_param = params.get("bc") 732 | details = params.get("details") 733 | variant_consequence = params.get("variant_consequence") 734 | if details: 735 | details = str(details) 736 | details = details[:2000] 737 | 738 | user_ip = get_user_ip(request) 739 | logging_prefix = datetime.now().strftime("%m/%d/%Y %H:%M:%S") + f" {user_ip} t{os.getpid()}" 740 | print(f"{logging_prefix}: ======================", flush=True) 741 | print(f"{logging_prefix}: {variant} show igv with hg={genome_version}, distance={distance_param}, mask={mask_param}", flush=True) 742 | 743 | with get_db_connection() as conn: 744 | log(conn, 745 | name, 746 | 
ip=user_ip, 747 | variant=variant, 748 | genome=genome_version, 749 | distance=distance_param, 750 | mask=mask_param, 751 | bc=basic_or_comprehensive_param, 752 | details=details, 753 | variant_consequence=variant_consequence) 754 | 755 | return Response(json.dumps({"status": "Done"}), status=200, mimetype='application/json', headers=[ 756 | ('Access-Control-Allow-Origin', '*'), 757 | ]) 758 | 759 | 760 | @app.route('/', strict_slashes=False, defaults={'path': ''}) 761 | @app.route('/<path>/') 762 | def catch_all(path): 763 | return f"SpliceAI-lookup APIs: invalid endpoint {path}" 764 | 765 | 766 | if '__main__' == __name__ or os.environ.get('RUNNING_ON_GOOGLE_CLOUD_RUN'): 767 | app.run(debug=DEBUG, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) 768 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from datetime import datetime 3 | import json 4 | import markdown2 5 | import os 6 | import pandas as pd 7 | import re 8 | import socket 9 | import subprocess 10 | import traceback 11 | import tempfile 12 | import time 13 | 14 | # pangolin imports 15 | from pkg_resources import resource_filename 16 | from pangolin.model import torch, Pangolin, L, W, AR 17 | from pangolin.pangolin import process_variant as process_variant_using_pangolin 18 | import gffutils 19 | 20 | # flask imports 21 | from flask import Flask, request, Response, send_from_directory 22 | from flask_cors import CORS 23 | from flask_talisman import Talisman 24 | from intervaltree import IntervalTree, Interval 25 | from spliceai.utils import Annotator, get_delta_scores 26 | 27 | # pandas output options 28 | pd.options.display.float_format = "{:,.2f}".format 29 | pd.set_option('display.max_rows', None) 30 | pd.set_option('display.max_columns', None) 31 | pd.set_option('display.expand_frame_repr', False) 32 | pd.set_option('max_colwidth', None) 33 | 34 | 35 | app = Flask(__name__) 36 | 37 | CORS(app) 38 | 39 | DEBUG = False if socket.gethostname() == "spliceai-lookup" else True 40 | if not DEBUG: 41 | Talisman(app) 42 | 43 | 44 | RATE_LIMIT_WINDOW_SIZE_IN_MINUTES = 1 45 | RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE = { 46 | "spliceai:model": 6, 47 | "spliceai:total": 15, 48 | "pangolin:model": 6, 49 | "pangolin:total": 15, 50 | "liftover:total": 12, 51 | } 52 | 53 | RATE_LIMIT_COUNTER_WINDOW_SIZE_IN_DAYS = 3 54 | RATE_LIMIT_OUTLIER_IPS_PATH = os.path.abspath("rate_limit_outlier_ips.txt") 55 | 56 | def get_rate_limit_outlier_ips(): 57 | print(f"Reading rate limit outlier IPs: {RATE_LIMIT_OUTLIER_IPS_PATH}") 58 | if os.path.isfile(RATE_LIMIT_OUTLIER_IPS_PATH): 59 | with open(RATE_LIMIT_OUTLIER_IPS_PATH, "rt") as f: 60 | rate_limit_outlier_ips = [l.strip() for l in f] 61 | else: 62 | rate_limit_outlier_ips = [] 63 | 64 | print(f"Current list of rate limit outlier IPs: {rate_limit_outlier_ips}") 65 | return rate_limit_outlier_ips 66 | 67 | 68 | RATE_LIMIT_OUTLIER_IPS = get_rate_limit_outlier_ips() 69 | 70 | DISABLE_LOGGING_FOR_IPS = {f"63.143.42.{i}" for i in range(0, 256)} # ignore uptimerobot.com IPs 71 | 72 | 73 | HG19_FASTA_PATH = os.path.expanduser("~/hg19.fa") 74 | HG38_FASTA_PATH = os.path.expanduser("~/hg38.fa") 75 | T2T_FASTA_PATH = os.path.expanduser("~/chm13v2.0.fa") 76 | 77 | GENCODE_VERSION = "v44" 78 | SPLICEAI_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.txt.gz" 79 | SPLICEAI_GRCH38_ANNOTATIONS = 
f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.txt.gz" 80 | PANGOLIN_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.without_chr_prefix.db" 81 | PANGOLIN_GRCH38_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.db" 82 | TRANSCRIPT_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.transcript_annotations.json" 83 | TRANSCRIPT_GRCH38_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.transcript_annotations.json" 84 | 85 | UCSC_LIFTOVER_TOOL = "UCSC liftover tool" 86 | BCFTOOLS_LIFTOVER_TOOL = "bcftools liftover plugin" 87 | 88 | PANGOLIN_MODELS = [] 89 | for i in 0, 2, 4, 6: 90 | for j in 1, 2, 3: 91 | model = Pangolin(L, W, AR) 92 | if torch.cuda.is_available(): 93 | model.cuda() 94 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i))) 95 | else: 96 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) 97 | 98 | model.load_state_dict(weights) 99 | model.eval() 100 | PANGOLIN_MODELS.append(model) 101 | 102 | 103 | ANNOTATION_INTERVAL_TREES = { 104 | "37": defaultdict(IntervalTree), 105 | "38": defaultdict(IntervalTree), 106 | } 107 | 108 | for genome_version, annotation_path in ("37", SPLICEAI_GRCH37_ANNOTATIONS), ("38", SPLICEAI_GRCH38_ANNOTATIONS): 109 | print(f"Loading {annotation_path}", flush=True) 110 | df = pd.read_table(annotation_path, dtype={"TX_START": int, "TX_END": int}) 111 | for _, row in df.iterrows(): 112 | chrom = row["CHROM"].replace("chr", "") 113 | ANNOTATION_INTERVAL_TREES[genome_version][chrom].add(Interval(row["TX_START"], row["TX_END"] + 0.1, row["#NAME"])) 114 | 115 | SPLICEAI_ANNOTATOR = { 116 | "37": Annotator(HG19_FASTA_PATH, SPLICEAI_GRCH37_ANNOTATIONS), 117 | "38": Annotator(HG38_FASTA_PATH, SPLICEAI_GRCH38_ANNOTATIONS), 118 | } 119 | 120 | ta37_f = open(TRANSCRIPT_GRCH37_ANNOTATIONS, "rt") 121 | ta38_f = open(TRANSCRIPT_GRCH38_ANNOTATIONS, "rt") 122 | TRANSCRIPT_ANNOTATIONS = { 123 | "37": json.load(ta37_f), 124 | "38": json.load(ta38_f), 125 | } 126 | ta37_f.close() 127 | ta38_f.close() 128 | 129 | TRANSCRIPT_PRIORITY_ORDER = { 130 | "MS": 3, # MANE select transcript 131 | "MP": 2, # MANE plus clinical transcript 132 | "C": 1, # canonical transcript 133 | "N": 0 134 | } 135 | 136 | # check that json annotations exist for all transcripts in the SpliceAI annotations file 137 | for genome_version in "37", "38": 138 | json_transcript_ids = set(TRANSCRIPT_ANNOTATIONS[genome_version]) 139 | df = pd.read_table(SPLICEAI_GRCH37_ANNOTATIONS if genome_version == "37" else SPLICEAI_GRCH38_ANNOTATIONS) 140 | spliceai_annotation_transcript_ids = set(df["#NAME"].apply(lambda t: t.split(".")[0])) 141 | transcript_ids_without_annotations = spliceai_annotation_transcript_ids - json_transcript_ids 142 | if len(transcript_ids_without_annotations) > 0: 143 | raise ValueError(f"Missing {len(transcript_ids_without_annotations)} transcripts in {genome_version} annotations: {transcript_ids_without_annotations}") 144 | 145 | SPLICEAI_MAX_DISTANCE_LIMIT = 10000 146 | SPLICEAI_DEFAULT_DISTANCE = 500 # maximum distance between the variant and gained/lost splice site, defaults to 500 147 | SPLICEAI_DEFAULT_MASK = 0 # mask scores representing annotated acceptor/donor gain and unannotated acceptor/donor loss, defaults to 0 148 | 149 | SPLICEAI_EXAMPLE = f"/spliceai/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G" 150 | 151 | VARIANT_RE = re.compile( 152 
| "(chr)?(?P<chrom>[0-9XYMTt]{1,2})" 153 | "[-\s:]+" 154 | "(?P<pos>[0-9]{1,9})" 155 | "[-\s:]+" 156 | "(?P<ref>[ACGT]+)" 157 | "[-\s:>]+" 158 | "(?P<alt>[ACGT]+)" 159 | ) 160 | 161 | USE_REDIS = True 162 | 163 | if USE_REDIS: 164 | import redis 165 | REDIS = redis.Redis(host='localhost', port=6379, db=0) # in-memory cache server which may or may not be running 166 | else: 167 | REDIS = None 168 | 169 | 170 | def error_response(error_message, source=None): 171 | response_json = {"error": str(error_message)} 172 | if source: 173 | response_json["source"] = source 174 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 175 | 176 | 177 | REVERSE_COMPLEMENT_MAP = dict(zip("ACGTN", "TGCAN")) 178 | 179 | 180 | def reverse_complement(seq): 181 | return "".join([REVERSE_COMPLEMENT_MAP[n] for n in seq[::-1]]) 182 | 183 | 184 | def parse_variant(variant_str): 185 | match = VARIANT_RE.match(variant_str) 186 | if not match: 187 | raise ValueError(f"Unable to parse variant: {variant_str}") 188 | 189 | return match['chrom'], int(match['pos']), match['ref'], match['alt'] 190 | 191 | 192 | class VariantRecord: 193 | def __init__(self, chrom, pos, ref, alt): 194 | self.chrom = chrom 195 | self.pos = pos 196 | self.ref = ref 197 | self.alts = [alt] 198 | 199 | def __repr__(self): 200 | return f"{self.chrom}-{self.pos}-{self.ref}-{self.alts[0]}" 201 | 202 | 203 | def get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask): 204 | return f"{tool_name}__{variant}__hg{genome_version}__d{distance}__m{mask}" 205 | 206 | 207 | def get_splicing_scores_from_redis(tool_name, variant, genome_version, distance, mask): 208 | if REDIS is None: 209 | return None 210 | 211 | key = get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask) 212 | results = None 213 | try: 214 | results_string = REDIS.get(key) 215 | if results_string: 216 | results = json.loads(results_string) 217 | results["source"] += ":redis" 218 | except Exception as e: 219 | print(f"Redis error: {e}", flush=True) 220 | 221 | return results 222 | 223 | 224 | def add_splicing_scores_to_redis(tool_name, variant, genome_version, distance, mask, results): 225 | if REDIS is None: 226 | return 227 | 228 | key = get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask) 229 | try: 230 | results_string = json.dumps(results) 231 | REDIS.set(key, results_string) 232 | except Exception as e: 233 | print(f"Redis error: {e}", flush=True) 234 | 235 | 236 | def exceeds_rate_limit(user_id, request_type): 237 | """Checks whether the given address has exceeded rate limits 238 | 239 | Args: 240 | user_id (str): unique user id 241 | request_type (str): type of rate limit - can be "spliceai:model", "spliceai:total", "pangolin:model", "pangolin:total", or "liftover:total" 242 | 243 | Return str: error message about exceeding the rate limit, or None if the rate limit was not exceeded 244 | """ 245 | if REDIS is None: 246 | return None 247 | 248 | if request_type not in RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE: 249 | raise ValueError(f"Invalid 'request_type' arg value: {request_type}") 250 | 251 | epoch_time = time.time() # seconds since 1970 252 | 253 | if epoch_time - int(REDIS.get("rate_limit_outlier_ips_update_time") or 0) > 120: # refresh at most every 2 minutes 254 | REDIS.set("rate_limit_outlier_ips_update_time", int(epoch_time)) 255 | global RATE_LIMIT_OUTLIER_IPS 256 | RATE_LIMIT_OUTLIER_IPS = get_rate_limit_outlier_ips() 257 | 258 | if user_id in RATE_LIMIT_OUTLIER_IPS: 259 | print(f"Rate limiting outlier list IP: {user_id}") 260 | 
max_requests = 1 261 | else: 262 | max_requests_per_minute = RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type] 263 | max_requests = RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * max_requests_per_minute 264 | 265 | try: 266 | # check number of requests from this user in the last (RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * 60) minutes 267 | redis_key_prefix = f"request {user_id} {request_type}" 268 | keys = REDIS.keys(f"{redis_key_prefix}*") 269 | if len(keys) >= max_requests: 270 | redis_hit_limit_counter_key = f"request {user_id} rate limit counter" 271 | redis_hit_limit_counter = REDIS.get(redis_hit_limit_counter_key) or 0 272 | redis_hit_limit_counter = int(redis_hit_limit_counter) + 1 273 | REDIS.set(redis_hit_limit_counter_key, redis_hit_limit_counter) 274 | REDIS.expire(redis_hit_limit_counter_key, RATE_LIMIT_COUNTER_WINDOW_SIZE_IN_DAYS * 24 * 60 * 60) 275 | 276 | if redis_hit_limit_counter > 200: 277 | error_message = ( 278 | f"ERROR: You have exceeded the rate limit {redis_hit_limit_counter} times so far " 279 | f"over the past few days. To prevent a single user from overwhelming the server and making it " 280 | f"unavailable to other users, this tool allows no more than " 281 | f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type]} computed requests per " 282 | f"minute per user. If you continue to exceed this limit, your IP address may be blocked." 283 | ) 284 | else: 285 | error_message = ( 286 | f"ERROR: Rate limit reached. To prevent a user from overwhelming the server and making it " 287 | f"unavailable to other users, this tool allows no more than " 288 | f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type]} computed requests per minute per user." 289 | ) 290 | 291 | return error_message 292 | 293 | # record this request 294 | REDIS.set(f"{redis_key_prefix}: {epoch_time}", 1) 295 | REDIS.expire(f"{redis_key_prefix}: {epoch_time}", RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * 60) 296 | except Exception as e: 297 | print(f"Redis error: {e}", flush=True) 298 | 299 | return None 300 | 301 | 302 | def get_spliceai_scores(variant, genome_version, distance_param, mask_param): 303 | try: 304 | chrom, pos, ref, alt = parse_variant(variant) 305 | except ValueError as e: 306 | return { 307 | "variant": variant, 308 | "source": "spliceai", 309 | "error": f"ERROR: {e}", 310 | } 311 | 312 | # generate error message if variant falls outside annotated exons or introns 313 | OTHER_GENOME_VERSION = {"37": "38", "38": "37"} 314 | chrom_without_chr = chrom.replace("chr", "") 315 | if not ANNOTATION_INTERVAL_TREES[genome_version][chrom_without_chr].at(pos): 316 | other_genome_version = OTHER_GENOME_VERSION[genome_version] 317 | other_genome_overlapping_intervals = ANNOTATION_INTERVAL_TREES[other_genome_version][chrom_without_chr].at(pos) 318 | if other_genome_overlapping_intervals: 319 | other_genome_genes = " and ".join(sorted(set([str(i.data) for i in other_genome_overlapping_intervals]))) 320 | return { 321 | "variant": variant, 322 | "source": "spliceai", 323 | "error": f"ERROR: In GRCh{genome_version}, {chrom}-{pos}-{ref}-{alt} falls outside all gencode exons and introns." 324 | f"SpliceAI only works for variants within known exons or introns. However, in GRCh{other_genome_version}, " 325 | f"{chrom}:{pos} falls within {other_genome_genes}, so perhaps GRCh{genome_version} is not the correct genome version?" 
326 | } 327 | else: 328 | return { 329 | "variant": variant, 330 | "source": "spliceai", 331 | "error": f"ERROR: {chrom}-{pos}-{ref}-{alt} falls outside all Gencode exons and introns on " 332 | f"GRCh{genome_version}. SpliceAI only works for variants that are within known exons or introns.", 333 | } 334 | 335 | """ 336 | NOTE: The reason SpliceAI currently works only for variants 337 | within annotated exons or introns is that, although the SpliceAI neural net takes any 338 | arbitrary nucleotide sequence as input, SpliceAI needs 1) the transcript strand 339 | to determine whether to reverse-complement the reference genome sequence before passing it 340 | to the neural net, and 2) transcript start and end positions to determine where to truncate 341 | the reference genome sequence. 342 | """ 343 | 344 | source = None 345 | scores = [] 346 | 347 | # run the SpliceAI model to compute the scores 348 | if not scores: 349 | error_message = exceeds_rate_limit(request.remote_addr, request_type="spliceai:model") 350 | if error_message: 351 | return { 352 | "variant": variant, 353 | "source": "spliceai", 354 | "error": error_message, 355 | } 356 | 357 | record = VariantRecord(chrom, pos, ref, alt) 358 | try: 359 | scores = get_delta_scores( 360 | record, 361 | SPLICEAI_ANNOTATOR[genome_version], 362 | distance_param, 363 | mask_param) 364 | source = "spliceai:model" 365 | except Exception as e: 366 | print(f"ERROR while computing SpliceAI scores for {variant}: {e}") 367 | traceback.print_exc() 368 | return { 369 | "variant": variant, 370 | "source": "spliceai", 371 | "error": f"ERROR: {type(e)}: {e}", 372 | } 373 | 374 | if not scores: 375 | return { 376 | "variant": variant, 377 | "source": "spliceai", 378 | "error": f"ERROR: The SpliceAI model did not return any scores for {variant}. 
This may be due to the " 379 | f"variant falling outside of all Gencode exons and introns.", 380 | } 381 | 382 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 383 | all_non_zero_scores = None 384 | all_non_zero_scores_strand = None 385 | all_non_zero_scores_transcript_id = None 386 | all_non_zero_scores_transcript_priority = -1 387 | max_delta_score_sum = 0 388 | for i, transcript_scores in enumerate(scores): 389 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 390 | continue 391 | 392 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 393 | 394 | # get json annotations for this transcript 395 | transcript_annotations = TRANSCRIPT_ANNOTATIONS[genome_version].get(transcript_id_without_version) 396 | if transcript_annotations is None: 397 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 398 | 399 | # add the extra transcript annotations from the json file to the transcript scores dict 400 | transcript_scores.update(transcript_annotations) 401 | 402 | current_transcript_priority = TRANSCRIPT_PRIORITY_ORDER[transcript_annotations["t_priority"]] 403 | current_delta_score_sum = sum(float(transcript_scores.get(key, 0)) for key in ("DP_AG", "DP_AL", "DP_DG", "DP_DL")) 404 | if current_transcript_priority > all_non_zero_scores_transcript_priority: 405 | all_non_zero_scores_transcript_priority = current_transcript_priority 406 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 407 | all_non_zero_scores_strand = transcript_scores["t_strand"] 408 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 409 | elif current_transcript_priority == all_non_zero_scores_transcript_priority and current_delta_score_sum > max_delta_score_sum: 410 | # select the one with the highest delta score sum 411 | max_delta_score_sum = current_delta_score_sum 412 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 413 | all_non_zero_scores_strand = transcript_scores["t_strand"] 414 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 415 | 416 | for redundant_key in "ALLELE", "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 417 | del transcript_scores[redundant_key] 418 | 419 | return { 420 | "variant": variant, 421 | "genomeVersion": genome_version, 422 | "chrom": chrom, 423 | "pos": pos, 424 | "ref": ref, 425 | "alt": alt, 426 | "distance": distance_param, 427 | "scores": scores, 428 | "source": source, 429 | 430 | "allNonZeroScores": all_non_zero_scores, 431 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 432 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 433 | } 434 | 435 | 436 | def get_pangolin_scores(variant, genome_version, distance_param, mask_param): 437 | if genome_version not in ("37", "38"): 438 | raise ValueError(f"Invalid genome_version: {genome_version}") 439 | 440 | if mask_param not in ("True", "False"): 441 | raise ValueError(f"Invalid mask_param: {mask_param}") 442 | 443 | try: 444 | chrom, pos, ref, alt = parse_variant(variant) 445 | except ValueError as e: 446 | print(f"ERROR while parsing variant {variant}: {e}") 447 | traceback.print_exc() 448 | 449 | return { 450 | "variant": variant, 451 | "source": "pangolin", 452 | "error": f"ERROR: {e}", 453 | } 454 | 455 | if len(ref) > 1 and len(alt) > 1: 456 | return { 457 | "variant": variant, 458 | "source": "pangolin", 459 | "error": f"ERROR: Pangolin does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}", 
460 | } 461 | 462 | error_message = exceeds_rate_limit(request.remote_addr, request_type="pangolin:model") 463 | if error_message: 464 | return { 465 | "variant": variant, 466 | "source": "pangolin", 467 | "error": error_message, 468 | } 469 | 470 | class PangolinArgs: 471 | reference_file = HG19_FASTA_PATH if genome_version == "37" else HG38_FASTA_PATH 472 | distance = distance_param 473 | mask = mask_param 474 | score_cutoff = None 475 | score_exons = "False" 476 | 477 | if genome_version == "37": 478 | pangolin_gene_db = gffutils.FeatureDB(PANGOLIN_GRCH37_ANNOTATIONS) 479 | else: 480 | pangolin_gene_db = gffutils.FeatureDB(PANGOLIN_GRCH38_ANNOTATIONS) 481 | 482 | scores = process_variant_using_pangolin( 483 | 0, chrom, int(pos), ref, alt, pangolin_gene_db, PANGOLIN_MODELS, PangolinArgs) 484 | 485 | if not scores: 486 | return { 487 | "variant": variant, 488 | "source": "pangolin", 489 | "error": f"ERROR: Pangolin was unable to compute scores for this variant", 490 | } 491 | 492 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 493 | all_non_zero_scores = None 494 | all_non_zero_scores_strand = None 495 | all_non_zero_scores_transcript_id = None 496 | max_delta_score_sum = 0 497 | for i, transcript_scores in enumerate(scores): 498 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 499 | continue 500 | 501 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 502 | 503 | # get json annotations for this transcript 504 | transcript_annotations = TRANSCRIPT_ANNOTATIONS[genome_version].get(transcript_id_without_version) 505 | if transcript_annotations is None: 506 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 507 | 508 | # add the extra transcript annotations from the json file to the transcript scores dict 509 | transcript_scores.update(transcript_annotations) 510 | 511 | # decide whether to use ALL_NON_ZERO_SCORES from this gene 512 | current_delta_score_sum = sum(abs(float(s.get("SG_ALT", 0)) - float(s.get("SG_REF", 0))) 513 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 514 | current_delta_score_sum += sum(abs(float(s.get("SL_ALT", 0)) - float(s.get("SL_REF", 0))) 515 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 516 | 517 | # return all_non_zero_scores for the transcript or gene with the highest delta score sum 518 | if current_delta_score_sum > max_delta_score_sum: 519 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 520 | all_non_zero_scores_strand = transcript_scores["STRAND"] 521 | all_non_zero_scores_transcript_id = transcript_scores["NAME"] 522 | max_delta_score_sum = current_delta_score_sum 523 | 524 | for redundant_key in "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 525 | del transcript_scores[redundant_key] 526 | 527 | return { 528 | "variant": variant, 529 | "genomeVersion": genome_version, 530 | "chrom": chrom, 531 | "pos": pos, 532 | "ref": ref, 533 | "alt": alt, 534 | "distance": distance_param, 535 | "scores": scores, 536 | "source": "pangolin", 537 | "allNonZeroScores": all_non_zero_scores, 538 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 539 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 540 | } 541 | 542 | 543 | @app.route("/spliceai/", methods=['POST', 'GET']) 544 | def run_spliceai(): 545 | return run_splice_prediction_tool(tool_name="spliceai") 546 | 547 | 548 | @app.route("/pangolin/", methods=['POST', 'GET']) 549 | def run_pangolin(): 550 | return 
run_splice_prediction_tool(tool_name="pangolin") 551 | 552 | 553 | def run_splice_prediction_tool(tool_name): 554 | """Handles API request for splice prediction 555 | 556 | Args: 557 | tool_name (str): "spliceai" or "pangolin" 558 | """ 559 | if tool_name not in ("spliceai", "pangolin"): 560 | raise ValueError(f"Invalid tool_name: {tool_name}") 561 | 562 | start_time = datetime.now() 563 | logging_prefix = start_time.strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()}" 564 | 565 | # check params 566 | params = {} 567 | if request.values: 568 | params.update(request.values) 569 | 570 | if 'variant' not in params: 571 | params.update(request.get_json(force=True, silent=True) or {}) 572 | 573 | error_message = exceeds_rate_limit(request.remote_addr, request_type=f"{tool_name}:total") 574 | if error_message: 575 | print(f"{logging_prefix}: {request.remote_addr}: response: {error_message}", flush=True) 576 | return error_response(error_message, source=tool_name) 577 | 578 | variant = params.get('variant', '') 579 | variant = variant.strip().strip("'").strip('"').strip(",") 580 | if not variant: 581 | return error_response(f'"variant" not specified. For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 582 | 583 | if not isinstance(variant, str): 584 | return error_response(f'"variant" value must be a string rather than a {type(variant)}.\n', source=tool_name) 585 | 586 | genome_version = params.get("hg") 587 | if not genome_version: 588 | return error_response(f'"hg" not specified. The URL must include an "hg" arg: hg=37 or hg=38. For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 589 | 590 | if genome_version not in ("37", "38"): 591 | return error_response(f'Invalid "hg" value: "{genome_version}". The value must be either "37" or "38". For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 592 | 593 | distance_param = params.get("distance", SPLICEAI_DEFAULT_DISTANCE) 594 | try: 595 | distance_param = int(distance_param) 596 | except Exception as e: 597 | return error_response(f'Invalid "distance": "{distance_param}". The value must be an integer.\n', source=tool_name) 598 | 599 | if distance_param > SPLICEAI_MAX_DISTANCE_LIMIT: 600 | return error_response(f'Invalid "distance": "{distance_param}". The value must be < {SPLICEAI_MAX_DISTANCE_LIMIT}.\n', source=tool_name) 601 | 602 | mask_param = params.get("mask", str(SPLICEAI_DEFAULT_MASK)) 603 | if mask_param not in ("0", "1"): 604 | return error_response(f'Invalid "mask" value: "{mask_param}". The value must be either "0" or "1". 
For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 605 | 606 | if request.remote_addr not in DISABLE_LOGGING_FOR_IPS: 607 | print(f"{logging_prefix}: {request.remote_addr}: ======================", flush=True) 608 | print(f"{logging_prefix}: {request.remote_addr}: {variant} processing with hg={genome_version}, " 609 | f"distance={distance_param}, mask={mask_param}", flush=True) 610 | 611 | # check REDIS cache before processing the variant 612 | results = get_splicing_scores_from_redis(tool_name, variant, genome_version, distance_param, mask_param) 613 | if not results: 614 | try: 615 | if tool_name == "spliceai": 616 | results = get_spliceai_scores(variant, genome_version, distance_param, int(mask_param)) 617 | elif tool_name == "pangolin": 618 | pangolin_mask_param = "True" if mask_param == "1" else "False" 619 | results = get_pangolin_scores(variant, genome_version, distance_param, pangolin_mask_param) 620 | else: 621 | raise ValueError(f"Invalid tool_name: {tool_name}") 622 | except Exception as e: 623 | traceback.print_exc() 624 | return error_response(f"ERROR: {e}", source=tool_name) 625 | 626 | if "error" not in results: 627 | add_splicing_scores_to_redis(tool_name, variant, genome_version, distance_param, mask_param, results) 628 | 629 | response_json = {} 630 | response_json.update(params) # copy input params to output 631 | response_json.update(results) 632 | 633 | duration = str(datetime.now() - start_time) 634 | response_json['duration'] = duration 635 | 636 | if request.remote_addr not in DISABLE_LOGGING_FOR_IPS: 637 | print(f"{logging_prefix}: {request.remote_addr}: {variant} took {duration}", flush=True) 638 | 639 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 640 | 641 | 642 | LIFTOVER_EXAMPLE = f"/liftover/?hg=hg19-to-hg38&format=interval&chrom=chr8&start=140300615&end=140300620" 643 | 644 | CHAIN_FILE_PATHS = { 645 | "hg19-to-hg38": "hg19ToHg38.over.chain.gz", 646 | "hg38-to-hg19": "hg38ToHg19.over.chain.gz", 647 | "hg38-to-t2t": "hg38ToHs1.over.chain.gz", # replaced hg38-chm13v2.over.chain.gz based on advice from Giulio Genovese 648 | "t2t-to-hg38": "hs1ToHg38.over.chain.gz", # replaced chm13v2-hg38.over.chain.gz based on advice from Giulio Genovese 649 | } 650 | 651 | LIFTOVER_REFERENCE_PATHS = { 652 | "hg19-to-hg38": (HG19_FASTA_PATH, HG38_FASTA_PATH), 653 | "hg38-to-hg19": (HG38_FASTA_PATH, HG19_FASTA_PATH), 654 | "hg38-to-t2t": (HG38_FASTA_PATH, T2T_FASTA_PATH), 655 | "t2t-to-hg38": (T2T_FASTA_PATH, HG38_FASTA_PATH), 656 | } 657 | 658 | def run_variant_liftover_tool(hg, chrom, pos, ref, alt, verbose=False): 659 | if hg not in CHAIN_FILE_PATHS or hg not in LIFTOVER_REFERENCE_PATHS: 660 | raise ValueError(f"Unexpected hg arg value: {hg}") 661 | chain_file_path = CHAIN_FILE_PATHS[hg] 662 | source_fasta_path, destination_fasta_path = LIFTOVER_REFERENCE_PATHS[hg] 663 | 664 | with tempfile.NamedTemporaryFile(suffix=".vcf", mode="wt", encoding="UTF-8") as input_file, \ 665 | tempfile.NamedTemporaryFile(suffix=".vcf", mode="rt", encoding="UTF-8") as output_file: 666 | 667 | # command syntax: liftOver oldFile map.chain newFile unMapped 668 | if hg == "hg19-to-hg38": 669 | chrom = chrom.replace("chr", "") 670 | else: 671 | chrom = "chr" + chrom.replace("chr", "") 672 | 673 | input_file.write(f"""##fileformat=VCFv4.2 674 | ##contig=<ID={chrom}> 675 | #CHROM POS ID REF ALT QUAL FILTER INFO 676 | {chrom} {pos} . 
{ref} {alt} 60 .""") 677 | input_file.flush() 678 | command = ( 679 | f"cat {input_file.name} | " 680 | f"bcftools plugin liftover -- --src-fasta-ref {source_fasta_path} --fasta-ref {destination_fasta_path} --chain {chain_file_path} | " 681 | f"grep -v ^# > {output_file.name}" 682 | ) 683 | 684 | try: 685 | subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, encoding="UTF-8") 686 | results = output_file.read() 687 | 688 | if verbose: 689 | print(f"{BCFTOOLS_LIFTOVER_TOOL} {hg} liftover on {chrom}:{pos} {ref}>{alt} returned: {results}", flush=True) 690 | 691 | # example: chr8 140300616 . T G 60 . . 692 | 693 | result_fields = results.strip().split("\t") 694 | if len(result_fields) > 5: 695 | result_fields[1] = int(result_fields[1]) 696 | 697 | return { 698 | "hg": hg, 699 | "chrom": chrom, 700 | "start": int(pos) - 1, 701 | "end": pos, 702 | "output_chrom": result_fields[0], 703 | "output_pos": result_fields[1], 704 | "output_ref": result_fields[3], 705 | "output_alt": result_fields[4], 706 | "liftover_tool": BCFTOOLS_LIFTOVER_TOOL, 707 | #"output_strand": "-" if "SWAP=-1" in results else "+", 708 | } 709 | 710 | except Exception as e: 711 | variant = f"{hg} {chrom}:{pos} {ref}>{alt}" 712 | print(f"ERROR in {BCFTOOLS_LIFTOVER_TOOL} for {variant}: {e}") 713 | print("Falling back on UCSC liftover tool..") 714 | #traceback.print_exc() 715 | #raise ValueError(f"liftOver command failed for {variant}: {e}") 716 | 717 | # if bcftools liftover failed, fall back on running UCSC liftover 718 | chrom = "chr" + chrom.replace("chr", "") 719 | result = run_UCSC_liftover_tool(hg, chrom, int(pos)-1, pos, verbose=False) 720 | result["output_ref"] = ref 721 | result["output_alt"] = alt 722 | #if result["output_strand"] == "-": 723 | # result["output_ref"] = reverse_complement(result["output_ref"]) 724 | # result["output_alt"] = reverse_complement(result["output_alt"]) 725 | return result 726 | 727 | 728 | def run_UCSC_liftover_tool(hg, chrom, start, end, verbose=False): 729 | if hg not in CHAIN_FILE_PATHS: 730 | raise ValueError(f"Unexpected hg arg value: {hg}") 731 | chain_file_path = CHAIN_FILE_PATHS[hg] 732 | 733 | reason_liftover_failed = "" 734 | with tempfile.NamedTemporaryFile(suffix=".bed", mode="wt", encoding="UTF-8") as input_file, \ 735 | tempfile.NamedTemporaryFile(suffix=".bed", mode="rt", encoding="UTF-8") as output_file, \ 736 | tempfile.NamedTemporaryFile(suffix=".bed", mode="rt", encoding="UTF-8") as unmapped_output_file: 737 | 738 | # command syntax: liftOver oldFile map.chain newFile unMapped 739 | chrom = "chr" + chrom.replace("chr", "") 740 | input_file.write("\t".join(map(str, [chrom, start, end, ".", "0", "+"])) + "\n") 741 | input_file.flush() 742 | command = f"liftOver {input_file.name} {chain_file_path} {output_file.name} {unmapped_output_file.name}" 743 | 744 | try: 745 | subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, encoding="UTF-8") 746 | results = output_file.read() 747 | if verbose: 748 | print(f"{UCSC_LIFTOVER_TOOL} {hg} liftover on {chrom}:{start}-{end} returned: {results}", flush=True) 749 | 750 | result_fields = results.strip().split("\t") 751 | if len(result_fields) > 5: 752 | result_fields[1] = int(result_fields[1]) 753 | result_fields[2] = int(result_fields[2]) 754 | 755 | return { 756 | "hg": hg, 757 | "chrom": chrom, 758 | "pos": int(start) + 1, 759 | "start": start, 760 | "end": end, 761 | "output_chrom": result_fields[0], 762 | "output_pos": int(result_fields[1]) + 1, 763 | "output_start": result_fields[1], 764 | 
"output_end": result_fields[2], 765 | "output_strand": result_fields[5], 766 | "liftover_tool": UCSC_LIFTOVER_TOOL, 767 | } 768 | else: 769 | reason_liftover_failed = unmapped_output_file.readline().replace("#", "").strip() 770 | 771 | except Exception as e: 772 | variant = f"{hg} {chrom}:{start}-{end}" 773 | print(f"ERROR during liftover for {variant}: {e}") 774 | traceback.print_exc() 775 | raise ValueError(f"liftOver command failed for {variant}: {e}") 776 | 777 | if reason_liftover_failed: 778 | raise ValueError(f"{hg} liftover failed for {chrom}:{start}-{end} {reason_liftover_failed}") 779 | else: 780 | raise ValueError(f"{hg} liftover failed for {chrom}:{start}-{end} for unknown reasons") 781 | 782 | 783 | def get_liftover_from_redis(key): 784 | if REDIS is None: 785 | return None 786 | 787 | results = None 788 | try: 789 | results_string = REDIS.get(key) 790 | if results_string: 791 | results = json.loads(results_string) 792 | except Exception as e: 793 | print(f"Redis error: {e}", flush=True) 794 | 795 | return results 796 | 797 | 798 | def add_liftover_to_redis(key, result): 799 | if REDIS is None: 800 | return 801 | 802 | try: 803 | results_string = json.dumps(result) 804 | REDIS.set(key, results_string) 805 | except Exception as e: 806 | print(f"Redis error: {e}", flush=True) 807 | 808 | 809 | @app.route("/liftover/", methods=['POST', 'GET']) 810 | def run_liftover(): 811 | logging_prefix = datetime.now().strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()}" 812 | 813 | # check params 814 | params = {} 815 | if request.values: 816 | params.update(request.values) 817 | 818 | if "format" not in params: 819 | params.update(request.get_json(force=True, silent=True) or {}) 820 | 821 | error_message = exceeds_rate_limit(request.remote_addr, request_type="liftover:total") 822 | if error_message: 823 | print(f"{logging_prefix}: {request.remote_addr}: response: {error_message}", flush=True) 824 | return error_response(error_message) 825 | 826 | VALID_HG_VALUES = set(CHAIN_FILE_PATHS.keys()) 827 | hg = params.get("hg") 828 | if not hg or hg not in VALID_HG_VALUES: 829 | return error_response(f'"hg" param error. It should be set to {" or ".join(VALID_HG_VALUES)}. For example: {LIFTOVER_EXAMPLE}\n') 830 | 831 | VALID_FORMAT_VALUES = ("interval", "variant", "position") 832 | format = params.get("format", "") 833 | if not format or format not in VALID_FORMAT_VALUES: 834 | return error_response(f'"format" param error. It should be set to {" or ".join(VALID_FORMAT_VALUES)}. 
For example: {LIFTOVER_EXAMPLE}\n') 835 | 836 | chrom = params.get("chrom") 837 | if not chrom: 838 | return error_response(f'"chrom" param not specified') 839 | 840 | if format == "interval": 841 | for key in "start", "end": 842 | if not params.get(key): 843 | return error_response(f'"{key}" param not specified') 844 | start = params.get("start") 845 | end = params.get("end") 846 | redis_key = f"{hg}:{chrom}:{start}:{end}" 847 | variant_log_string = f"{start}-{end}" 848 | 849 | elif format == "position": 850 | try: 851 | pos = int(params["pos"]) 852 | except Exception as e: 853 | return error_response(f'"pos" param error: {e}') 854 | 855 | start = pos - 1 856 | end = pos 857 | redis_key = f"{hg}:{chrom}:{pos}" 858 | variant_log_string = f"{pos} " 859 | elif format == "variant": 860 | for key in "pos", "ref", "alt": 861 | if not params.get(key): 862 | return error_response(f'"{key}" param not specified') 863 | pos = params.get("pos") 864 | ref = params.get("ref") 865 | alt = params.get("alt") 866 | redis_key = f"{hg}:{chrom}:{pos}:{ref}:{alt}" 867 | variant_log_string = f"{pos} {ref}>{alt}" 868 | 869 | verbose = request.remote_addr not in DISABLE_LOGGING_FOR_IPS 870 | if verbose: 871 | print(f"{logging_prefix}: {request.remote_addr}: ======================", flush=True) 872 | print(f"{logging_prefix}: {request.remote_addr}: {hg} liftover {format}: {chrom}:{variant_log_string}", flush=True) 873 | 874 | # check REDIS cache before processing the variant 875 | result = get_liftover_from_redis(redis_key) 876 | if result and verbose: 877 | print(f"{hg} liftover on {variant_log_string} got results from cache: {result}", flush=True) 878 | 879 | if not result: 880 | try: 881 | if format == "variant": 882 | result = run_variant_liftover_tool(hg, chrom, pos, ref, alt, verbose=verbose) 883 | else: 884 | result = run_UCSC_liftover_tool(hg, chrom, start, end, verbose=verbose) 885 | except Exception as e: 886 | return error_response(str(e)) 887 | 888 | add_liftover_to_redis(redis_key, result) 889 | 890 | result.update(params) 891 | 892 | return Response(json.dumps(result), mimetype='application/json') 893 | 894 | # share static files from the annotations folder to support local installs 895 | @app.route('/annotations/', strict_slashes=False, defaults={'path': ''}) 896 | @app.route('/annotations/<path>') 897 | def send_annotations(path): 898 | if os.path.isfile(os.path.join("annotations", path)): 899 | return send_from_directory('annotations', path) 900 | 901 | # return an html table of available annotation files 902 | html = "<html><head><title>SpliceAI-lookup: Annotation Files</title></head><body>" 903 | html += "<table>" 904 | html += "<tr><th>./annotation files</th><th>last updated</th></tr>" 905 | for filename in os.listdir("annotations"): 906 | html += f"<tr><td>{filename}</td>" 907 | last_modified = datetime.fromtimestamp(os.path.getmtime(os.path.join('annotations', filename))) 908 | html += f"<td>{last_modified.strftime('%Y-%m-%d %H:%M:%S')}</td></tr>" 909 | html += "</table></body></html>"
910 | 911 | return Response(html, mimetype='text/html') 912 | 913 | 914 | 915 | @app.route('/', strict_slashes=False, defaults={'path': ''}) 916 | @app.route('/<path>/') 917 | def catch_all(path): 918 | if not path: 919 | path = "index.html" 920 | 921 | if path in {"index.html", "igv.min.js"}: 922 | with open(path, "rt") as f: 923 | html = f.read() 924 | return Response(html, mimetype='text/html') 925 | elif path == "favicon.ico": 926 | return send_from_directory('', 'favicon.ico') 927 | else: 928 | with open("README.md") as f: 929 | return markdown2.markdown(f.read()) 930 | 931 | 932 | print("Initialization completed.", flush=True) 933 | 934 | if __name__ == "__main__": 935 | app.run(debug=DEBUG, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) 936 | --------------------------------------------------------------------------------