├── CNAME ├── stop_server.sh ├── run_tests.sh ├── google_cloud_run_services ├── Makefile ├── docker │ ├── pangolin │ │ ├── sha256_grch37.txt │ │ ├── sha256_grch38.txt │ │ ├── requirements.txt │ │ └── Dockerfile │ └── spliceai │ │ ├── sha256_grch37.txt │ │ ├── sha256_grch38.txt │ │ ├── requirements.txt │ │ └── Dockerfile ├── database_admin.sh ├── README.md ├── create_pangolin_db.py ├── connect_to_db.sh ├── test_score_consistency.py ├── build_and_deploy.py └── server.py ├── icon.png ├── restart_server.sh ├── chm13v2-hg38.over.chain.gz ├── hg19ToHg38.over.chain.gz ├── hg38-chm13v2.over.chain.gz ├── hg38ToHg19.over.chain.gz ├── t2t-chm13-v1.0.hg38.over.chain.gz ├── t2t-chm13-v1.1.grch38.over.chain.gz ├── annotations ├── list_current_homo_sapiens_ensembl_dbs.sh ├── upload_annotations_to_server.sh ├── update_json_annotation_files.sh ├── convert_primate_ai_to_indexed_table.sh ├── update_pangolin_db_files.sh ├── README.md ├── update_SpliceAI_annotation_txt_files.sh ├── convert_SpliceAI_annotation_input_format_to_bed.py ├── combine_PrimateAI_scores_and_gene_threshold_tables.py ├── combine_score_tables.py ├── convert_gtf_to_SpliceAI_annotation_input_format.py └── generate_transcript_annotation_json.py ├── test_data ├── spliceai_scores.raw.snv.hg38_subset.vcf.gz ├── run_spliceai_on_test_vcf.sh ├── spliceai_scores.masked.snv.hg38_subset.vcf.gz ├── spliceai_scores.raw.indel.hg38_subset.vcf.gz ├── spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi ├── spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi ├── spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi └── test.vcf ├── temporarily_disable_liftover_rate_limit.py ├── start_server.sh ├── requirements.txt ├── start_local_server.sh ├── .github └── ISSUE_TEMPLATE │ └── issue-or-feature-request.md ├── LICENSE ├── .gitignore ├── test_spliceai.py ├── README.md └── server.py /CNAME: -------------------------------------------------------------------------------- 1 | spliceailookup.broadinstitute.org -------------------------------------------------------------------------------- /stop_server.sh: -------------------------------------------------------------------------------- 1 | pkill -9 gunicorn 2 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | python3.6 -m unittest test_spliceai 4 | -------------------------------------------------------------------------------- /google_cloud_run_services/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python3 build_and_deploy.py 3 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/icon.png -------------------------------------------------------------------------------- /restart_server.sh: -------------------------------------------------------------------------------- 1 | #redis-cli flushall 2 | 3 | kill -HUP $(pgrep gunicorn | head -n 1) 4 | -------------------------------------------------------------------------------- /chm13v2-hg38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/chm13v2-hg38.over.chain.gz -------------------------------------------------------------------------------- /hg19ToHg38.over.chain.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg19ToHg38.over.chain.gz -------------------------------------------------------------------------------- /hg38-chm13v2.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg38-chm13v2.over.chain.gz -------------------------------------------------------------------------------- /hg38ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/hg38ToHg19.over.chain.gz -------------------------------------------------------------------------------- /t2t-chm13-v1.0.hg38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/t2t-chm13-v1.0.hg38.over.chain.gz -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/sha256_grch37.txt: -------------------------------------------------------------------------------- 1 | sha256:bcc5a434b184f9dc986528b2aca9744e7c953d7f9fa340172eec8e3130f1e1b6 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/sha256_grch38.txt: -------------------------------------------------------------------------------- 1 | sha256:6e248671d1e83c5ea312a1a4bbc973838c6793a1922e238b9fbb060f7570247f 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/sha256_grch37.txt: -------------------------------------------------------------------------------- 1 | sha256:4905bed9a69fd3a967b7ee820026543b9e4d3f82707bb601e670e6cf83e60d4c 2 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/sha256_grch38.txt: -------------------------------------------------------------------------------- 1 | sha256:25275794ffc49033bcc6247441b14887e65a1a08e395aed97aea595eaef78fce 2 | -------------------------------------------------------------------------------- /t2t-chm13-v1.1.grch38.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/t2t-chm13-v1.1.grch38.over.chain.gz -------------------------------------------------------------------------------- /annotations/list_current_homo_sapiens_ensembl_dbs.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | mysql -h useastdb.ensembl.org -u anonymous -e "show databases;" | grep -i homo_sapiens_core 3 | -------------------------------------------------------------------------------- /annotations/upload_annotations_to_server.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | gcloud compute scp gencode.v44* weisburd@spliceai-lookup:/home/weisburd/SpliceAI-lookup/annotations/ 4 | -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/run_spliceai_on_test_vcf.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | spliceai -R ~/p1/ref/GRCh38/hg38.fa -I test.vcf -O results.vcf -A ../annotations/gencode.v43.annotation.txt.gz 4 | 5 | -------------------------------------------------------------------------------- /test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.snv.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/SpliceAI-lookup/HEAD/test_data/spliceai_scores.masked.snv.hg38_subset.vcf.gz.tbi -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask_cors 3 | flask-talisman==1.1.0 4 | gunicorn 5 | pandas==2.2.2 6 | biopython==1.83 7 | pyfastx==2.1.0 8 | 9 | # sql 10 | psycopg2==2.9.9 11 | -------------------------------------------------------------------------------- /temporarily_disable_liftover_rate_limit.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import time 3 | 4 | r = redis.Redis(host='localhost') 5 | 6 | while True: 7 | for key in r.keys("request *liftover*"): 8 | print("Deleting key: ", key.decode("UTF-8")) 9 | r.delete(key) 10 | time.sleep(1) 11 | -------------------------------------------------------------------------------- /start_server.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | #redis-cli flushall # clear all keys from redis 4 | while true 5 | do 6 | 7 | gunicorn -w 8 -t 1800 -b 0.0.0.0:80 -b 0.0.0.0:443 \ 8 | --keyfile=../spliceailookup-api.broadinstitute.org.key \ 9 | --certfile=../spliceailookup-api.broadinstitute.org.crt \ 10 | server:app 11 | 12 | done 13 | 
-------------------------------------------------------------------------------- /annotations/update_json_annotation_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | gencode_version=v44 3 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.gtf.gz; do 4 | log_path=process_$(echo $p | sed s/.gtf.gz//).log 5 | time python3 generate_transcript_annotation_json.py $p | tee -a ${log_path} 6 | done 7 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask_cors 3 | flask-talisman==1.1.0 4 | gunicorn 5 | 6 | # pangolin dependencies: 7 | gffutils==0.13 8 | biopython==1.83 9 | pyfastx==2.1.0 10 | PyVCF3>=1.0.3 11 | 12 | # sql 13 | psycopg2==2.9.9 14 | 15 | # Pangolin dependencies 16 | numpy==1.26.4 17 | pandas==2.2.2 18 | torch==2.2.1 19 | 20 | -------------------------------------------------------------------------------- /annotations/convert_primate_ai_to_indexed_table.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | python3 -c 'pd.read_table("PrimateAI_3D.hg19.txt.gz").sort_values(["chr", "pos"], ascending=[True, True]).to_csv("PrimateAI_3D.hg19.sorted.txt.gz", sep="\t", header=True, index=False)' 4 | gunzip -c PrimateAI_3D.hg19.sorted.txt.gz | bgzip > PrimateAI_3D.hg19.txt.gz 5 | tabix -S 1 -s 1 -b 2 -e 2 PrimateAI_3D.hg19.txt.gz 6 | tabix -S 1 -s 1 -b 2 -e 2 PrimateAI_3D.hg38.txt.gz 7 | 8 | -------------------------------------------------------------------------------- /annotations/update_pangolin_db_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | gencode_version=v44 4 | gzcat gencode.${gencode_version}lift37.basic.annotation.gtf.gz | sed 's/chr//g' | bgzip > gencode.${gencode_version}lift37.basic.annotation.without_chr_prefix.gtf.gz 5 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.without_chr_prefix.gtf.gz; do 6 | set -x 7 | python3 ~/code/Pangolin/scripts/create_db.py $p & 8 | set +x 9 | done 10 | 11 | wait 12 | -------------------------------------------------------------------------------- /google_cloud_run_services/database_admin.sh: -------------------------------------------------------------------------------- 1 | 2 | gcloud --project spliceai-lookup-412920 sql instances list 3 | 4 | # view settings 5 | gcloud --project spliceai-lookup-412920 sql instances describe spliceai-lookup-db 6 | 7 | # adjust DB settings 8 | # https://cloud.google.com/sql/docs/postgres/flags#gcloud 9 | gcloud --project spliceai-lookup-412920 sql instances patch spliceai-lookup-db --database-flags=max_connections=50 10 | 11 | # restart DB 12 | gcloud --project spliceai-lookup-412920 sql instances restart spliceai-lookup-db 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.16.1 2 | keras>=3.0.0 3 | spliceai @ git+https://github.com/bw2/SpliceAI # if the latest version isn't being installed, clone the repo and install from the local directory by running python3 setup.py install 4 | flask 5 | flask-cors 6 | flask-talisman 7 | gunicorn 8 | intervaltree 9 | markdown2 10 | pandas 11 | 
pysam 12 | redis 13 | # pangolin dependencies: 14 | gffutils 15 | biopython 16 | pyfastx 17 | PyVCF3>=1.0.3 18 | pangolin @ git+https://github.com/bw2/Pangolin # if the latest version isn't being installed, clone the repo and install from the local directory by running python3 setup.py install 19 | -------------------------------------------------------------------------------- /start_local_server.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | 3 | NUM_THREADS=1 # set this to the number of cores on your machine (or a bit less) 4 | HOST=127.0.0.1 # set this to 0.0.0.0 instead of 127.0.0.1 to allow access from other computers 5 | PORT=8080 # set this to a port number that is not already in use 6 | TIMEOUT=1800 # kill the server thread if it takes more than this many seconds to compute a response 7 | 8 | # clear the redis cache to avoid reusing outdated or incorrectly formatted SpliceAI responses 9 | redis-cli flushall 10 | 11 | # start the gunicorn server 12 | gunicorn -w ${NUM_THREADS} -t ${TIMEOUT} -b ${HOST}:${PORT} server:app 13 | -------------------------------------------------------------------------------- /annotations/README.md: -------------------------------------------------------------------------------- 1 | To generate or update the transcript annotation files needed for running SpliceAI and Pangolin: 2 | 3 | 1. Download the latest "basic" gene annotations in GTF format from Gencode for both [GRCh38](https://www.gencodegenes.org/human/) and GRCh37. 4 | 2. Update the Gencode version string at the top of these bash scripts, and then run them: 5 | - [update_json_annotation_files.sh](update_json_annotation_files.sh) 6 | - [update_SpliceAI_annotation_txt_files.sh](update_SpliceAI_annotation_txt_files.sh) 7 | - [update_pangolin_db_files.sh](update_pangolin_db_files.sh) 8 | 3. Update the GENCODE_VERSION string in [../server.py](../server.py) 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-or-feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue or Feature Request 3 | about: Issue or Feature Request 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | REQUIRED: If the issue is about a specific variant or results page (or you have an example), please copy-paste the variant here and/or provide a link to the results page where you see the issue. 11 | 12 | --- 13 | OPTIONAL: If you'd like to also share a screenshot: 14 | 15 | MacOS: Press Command+Shift+4, select a region of the page, then paste the image here. 16 | Windows: Press Windows Logo Key + PrtScn or Fn + Windows logo key + Space Bar. Then, in File Explorer, open the Pictures > Screenshots folder and drag the image here. 
17 | -------------------------------------------------------------------------------- /annotations/update_SpliceAI_annotation_txt_files.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | # make sure annotation-utils is installed since it's a dependency of convert_gtf_to_SpliceAI_annotation_input_format.py 4 | python3 -m pip install git+https://github.com/bw2/annotation-utils 5 | 6 | gencode_version=v44 7 | for p in gencode.${gencode_version}.basic.annotation.gtf.gz gencode.${gencode_version}lift37.basic.annotation.gtf.gz; do 8 | log_path=process_$(echo ${p} | sed s/.gtf.gz//).log 9 | time python3 generate_transcript_annotation_json.py ${p} | tee -a ${log_path} 10 | json_path=$(echo ${p} | sed 's/.gtf.gz/.transcript_annotations.json/') 11 | time python3 convert_gtf_to_SpliceAI_annotation_input_format.py -a ${json_path} ${p} | tee ${log_path} 12 | done 13 | -------------------------------------------------------------------------------- /google_cloud_run_services/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the [Google Cloud Run](https://cloud.google.com/run) implementation of SpliceAI and Pangolin web service APIs used by [spliceai-lookup.broadinstitute.org](https://spliceai-lookup.broadinstitute.org) (NOTE: `Cloud Run` is different from Google's `Cloud Functions` service). 2 | 3 | The `build_and_deploy.py` script includes the following commands for building docker images, updating gencode annotations, updating the SpliceAI-lookup Google Cloud Run services, and running tests: 4 | 5 | * **build** the docker images for the SpliceAI and Pangolin services 6 | * **update_annotations** download Gencode annotations and reprocess them into the formats used by SpliceAI and Pangolin 7 | * **deploy** the services to Google Cloud Run 8 | * **test** run the service locally using a `docker run` command 9 | * **test2** run the service locally using the heavier-weight `gcloud beta code dev` command which uses kubectl 10 | * **run** open an interactive shell inside the container 11 | To perform any of these operations, run `python3 build_and_deploy.py `. 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 bw2 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/spliceai/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-bullseye 2 | 3 | RUN apt update && apt-get install --no-install-recommends -y \ 4 | ca-certificates \ 5 | wget \ 6 | bzip2 \ 7 | unzip \ 8 | git \ 9 | libcurl4-openssl-dev \ 10 | libbz2-dev \ 11 | liblzma-dev \ 12 | zlib1g-dev 13 | 14 | RUN python3 -m pip install tensorflow==2.16.1 15 | 16 | RUN apt update && apt-get install --no-install-recommends -y build-essential libpq-dev 17 | 18 | COPY docker/spliceai/requirements.txt / 19 | RUN python3 -m pip install --upgrade -r /requirements.txt 20 | 21 | ARG RANDOM=2 22 | RUN python3 -m pip install https://github.com/bw2/SpliceAI/archive/refs/heads/master.zip 23 | 24 | ARG CONCURRENCY="2" 25 | ARG GENOME_VERSION="unknown" 26 | 27 | COPY docker/ref/GRCh${GENOME_VERSION} / 28 | COPY docker/spliceai/annotations/GRCh${GENOME_VERSION} / 29 | COPY server.py / 30 | 31 | ENV TF_CPP_MIN_LOG_LEVEL=3 32 | 33 | ENV PORT=8080 34 | ENV TOOL=spliceai 35 | ENV GENOME_VERSION=${GENOME_VERSION} 36 | ENV CONCURRENCY=${CONCURRENCY} 37 | ENV RUNNING_ON_GOOGLE_CLOUD_RUN=1 38 | 39 | CMD exec gunicorn --preload --bind :$PORT --workers ${CONCURRENCY} --threads 1 --timeout 0 server:app 40 | -------------------------------------------------------------------------------- /google_cloud_run_services/docker/pangolin/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim-bullseye 2 | 3 | RUN apt update && apt-get install --no-install-recommends -y \ 4 | ca-certificates \ 5 | wget \ 6 | bzip2 \ 7 | unzip \ 8 | git \ 9 | libcurl4-openssl-dev \ 10 | libbz2-dev \ 11 | liblzma-dev \ 12 | zlib1g-dev 13 | 14 | RUN python3 -m pip install torch==2.2.1 -f https://download.pytorch.org/whl/torch_stable.html 15 | 16 | RUN apt update && apt-get install --no-install-recommends -y build-essential libpq-dev 17 | 18 | COPY docker/pangolin/requirements.txt / 19 | RUN python3 -m pip install --upgrade -r /requirements.txt 20 | 21 | RUN git clone https://github.com/bw2/Pangolin.git \ 22 | && cd Pangolin \ 23 | && python3 -m pip install . 24 | 25 | ARG CONCURRENCY="2" 26 | ARG GENOME_VERSION="unknown" 27 | 28 | COPY docker/ref/GRCh${GENOME_VERSION} / 29 | COPY docker/pangolin/annotations/GRCh${GENOME_VERSION} / 30 | COPY server.py / 31 | 32 | ENV PORT=8080 33 | ENV TOOL=pangolin 34 | ENV GENOME_VERSION=${GENOME_VERSION} 35 | ENV CONCURRENCY=${CONCURRENCY} 36 | ENV RUNNING_ON_GOOGLE_CLOUD_RUN=1 37 | 38 | CMD exec gunicorn --preload --bind :$PORT --workers ${CONCURRENCY} --threads 1 --timeout 0 server:app 39 | -------------------------------------------------------------------------------- /test_data/test.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=20191004 3 | ##reference=GRCh38/hg38 4 | ##contig= 5 | ##contig= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | ##contig= 16 | ##contig= 17 | ##contig= 18 | ##contig= 19 | ##contig= 20 | ##contig= 21 | ##contig= 22 | ##contig= 23 | ##contig= 24 | ##contig= 25 | ##contig= 26 | ##contig= 27 | ##contig= 28 | ##INFO= 29 | #CHROM POS ID REF ALT QUAL FILTER INFO 30 | 1 69091 . A C . . SpliceAI=C|OR4F5|0.01|0.00|0.00|0.00|42|25|24|2 31 | 11 108301737 . CA TG . . . 
32 | -------------------------------------------------------------------------------- /google_cloud_run_services/create_pangolin_db.py: -------------------------------------------------------------------------------- 1 | ## This script was copied from the Pangolin repo (https://github.com/tkzeng/Pangolin) 2 | 3 | import argparse 4 | import gffutils 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("annotation_file", help="GTF file containing gene annotations. For example, from https://www.gencodegenes.org/") 8 | parser.add_argument("--filter", default="Ensembl_canonical", help="Only keep GTF features with the specified tags. Format: tag1,tag2,... or None to keep all features. Default: Ensembl_canonical") 9 | args = parser.parse_args() 10 | 11 | gtf = args.annotation_file 12 | if gtf.endswith(".gtf"): 13 | prefix = gtf[:-4] 14 | elif gtf.endswith(".gtf.gz"): 15 | prefix = gtf[:-7] 16 | else: 17 | exit("ERROR, annotation_file should be a GTF file.") 18 | 19 | def filter(feat): 20 | if feat.featuretype not in ["gene","transcript","exon"]: 21 | return False 22 | elif args.filter != "None" and feat.featuretype in ["transcript","exon"]: 23 | present = False 24 | for tag in args.filter.split(','): 25 | if "tag" in feat.attributes and tag in feat["tag"]: 26 | present = True 27 | if not present: 28 | return False 29 | return feat 30 | 31 | db = gffutils.create_db(gtf, prefix+".db", force=True, 32 | disable_infer_genes=True, disable_infer_transcripts=True, 33 | transform=filter) 34 | 35 | print("Database created: %s.db" % prefix) 36 | -------------------------------------------------------------------------------- /google_cloud_run_services/connect_to_db.sh: -------------------------------------------------------------------------------- 1 | #set -ex 2 | 3 | PGPASSWORD=$(cat .pgpass) psql -h 34.173.33.168 -d spliceai-lookup-db -U postgres -d spliceai-lookup-db 4 | 5 | 6 | # useful queries: 7 | 8 | # count variant consequences (counted once per variant) 9 | # select variant_consequence, count(*) as c from (select variant_consequence, variant from log where length(variant_consequence) > 1 group by variant_consequence, variant) log group by variant_consequence order by c desc; 10 | 11 | # count queries per ip per day 12 | # select ip, logtime::timestamp::date, MAX(event_name), count(*) as c from log group by ip, logtime::timestamp::date ORDER BY logtime::timestamp::date desc, c desc 13 | 14 | 15 | # check intergenic variants 16 | # select distinct variant, genome from log where variant_consequence = 'intergenic' and genome='38'; 17 | 18 | 19 | # compute % of queried variants that are splice-region 20 | # select c as splice_region_variants, d as total_variants, c::float/d::float as percent from ( select count(*) as c from ( select variant_consequence, variant from log where length(variant_consequence) > 1 group by variant_consequence, variant ) temp1 where variant_consequence = 'splice_region_variant' or variant_consequence = 'splice_donor_variant' or variant_consequence = 'splice_acceptor_variant' or variant_consequence = 'splice_polypyrimidine_tract_variant' or variant_consequence = 'splice_donor_region_variant' ) temp2 full outer join ( select count(*) as d from ( select variant from log where length(variant_consequence) > 1 group by variant_consequence, variant ) temp3 ) temp4 on 1=1; 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.vcf* 2 | *.txt* 
3 | *.tsv* 4 | *.json* 5 | 6 | service.backup_copy.yaml 7 | 8 | .pgpass 9 | .idea 10 | *.iml 11 | *.db 12 | *.txt.gz 13 | 14 | # Data 15 | *.fa.gz 16 | *.gz.fxi 17 | *.fa 18 | *.fai 19 | *.bed 20 | *.bed.gz 21 | *.gtf.gz 22 | *.tbi 23 | *.crt 24 | 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *$py.class 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | db.sqlite3-journal 87 | 88 | # Flask stuff: 89 | instance/ 90 | .webassets-cache 91 | 92 | # Scrapy stuff: 93 | .scrapy 94 | 95 | # Sphinx documentation 96 | docs/_build/ 97 | 98 | # PyBuilder 99 | target/ 100 | 101 | # Jupyter Notebook 102 | .ipynb_checkpoints 103 | 104 | # IPython 105 | profile_default/ 106 | ipython_config.py 107 | 108 | # pyenv 109 | .python-version 110 | 111 | # pipenv 112 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 113 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 114 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 115 | # install all needed dependencies. 116 | #Pipfile.lock 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | -------------------------------------------------------------------------------- /annotations/convert_SpliceAI_annotation_input_format_to_bed.py: -------------------------------------------------------------------------------- 1 | """ 2 | The original SpliceAI annotation format is: 3 | 4 | #NAME CHROM STRAND TX_START TX_END EXON_START EXON_END 5 | OR4F5 1 + 69090 70008 69090, 70008, 6 | OR4F16 1 - 685715 686654 685715, 686654, 7 | ... 8 | 9 | Convert it to BED format so that it can be viewed in IGV. 
10 | """ 11 | 12 | 13 | import argparse 14 | import gzip 15 | import os 16 | import re 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("-n", type=int, help="Number of lines to process (for testing)") 21 | parser.add_argument("-o", "--output-prefix", help="Output prefix for genePred file") 22 | parser.add_argument("spliceai_annotation_table") 23 | args = parser.parse_args() 24 | 25 | if not args.output_prefix: 26 | args.output_prefix = re.sub("(.tsv|.txt)(.gz)?$", "", os.path.basename(args.spliceai_annotation_table)) 27 | 28 | output_path = f"{args.output_prefix}.bed" 29 | 30 | line_count = 0 31 | fopen = gzip.open if args.spliceai_annotation_table.endswith("gz") else open 32 | with fopen(args.spliceai_annotation_table, "rt") as f: 33 | with open(output_path, "wt") as out: 34 | header = f.readline().strip().split("\t") 35 | if header != ["#NAME", "CHROM", "STRAND", "TX_START", "TX_END", "EXON_START", "EXON_END"]: 36 | raise ValueError(f"Unexpected header: {header}") 37 | 38 | for i, line in enumerate(f): 39 | line_count += 1 40 | fields = line.strip().split("\t") 41 | if len(fields) != 7: 42 | raise ValueError(f"Expected 7 fields, got {len(fields)}: {fields}") 43 | 44 | name, chrom, strand, tx_start, tx_end, exon_starts, exon_ends = fields 45 | 46 | 47 | exon_starts = exon_starts.strip(",").split(",") 48 | exon_ends = exon_ends.strip(",").split(",") 49 | if exon_starts.count(",") != exon_ends.count(","): 50 | raise ValueError(f"Mismatch in the number of exon starts and ends: {fields}") 51 | 52 | exon_sizes = [str(int(end) - int(start)) for start, end in zip(exon_starts, exon_ends)] 53 | exon_starts = [str(int(start) - int(tx_start)) for start in exon_starts] 54 | 55 | score = item_rgb = "." 56 | out.write("\t".join([ 57 | chrom, tx_start, tx_end, name, score, strand, 58 | tx_start, tx_end, item_rgb, 59 | str(len(exon_sizes)), ",".join(exon_sizes), ",".join(exon_starts), 60 | ]) + "\n") 61 | 62 | if args.n is not None and i > args.n: 63 | break 64 | 65 | os.system(f"bgzip -f {output_path}") 66 | os.system(f"tabix -f {output_path}.gz") 67 | 68 | print(f"Wrote {line_count:,d} lines to {output_path}.gz") 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /annotations/combine_PrimateAI_scores_and_gene_threshold_tables.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import os 4 | import pandas as pd 5 | import subprocess 6 | import tqdm 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-g", "--gene-thresholds-csv", default="PrimateAI_3D.per_gene_percentile_thresholds.csv.gz", 11 | help="Input CSV file") 12 | parser.add_argument("-p", "--show-progress-bar", action="store_true", help="Show progress bar") 13 | 14 | parser.add_argument("scores_table", help="Table with precomputed PrimateAI_3D scores") 15 | args = parser.parse_args() 16 | 17 | if not os.path.isfile(args.gene_thresholds_csv): 18 | parser.error(f"Gene thresholds CSV file not found: {args.gene_thresholds_csv}") 19 | 20 | if not args.scores_table.endswith(".txt.gz"): 21 | parser.error("Scores table must have a .txt.gz extension") 22 | 23 | if not os.path.isfile(args.scores_table): 24 | parser.error(f"Scores file not found: {args.scores_table}") 25 | 26 | return args 27 | 28 | def main(): 29 | 30 | args = parse_args() 31 | 32 | # Load gene thresholds 33 | print(f"Parsing {args.gene_thresholds_csv}") 34 
| gene_thresholds_df = pd.read_csv(args.gene_thresholds_csv) 35 | transcript_to_percentile_threshold_map = dict( 36 | zip(gene_thresholds_df['Transcript'], gene_thresholds_df['PAI3D_Gene_Percentile_Threshold'])) 37 | 38 | print(f"Loaded {len(gene_thresholds_df):,d} gene thresholds") 39 | print(f"Parsing {args.scores_table}") 40 | with gzip.open(args.scores_table, "rt") as f, open(f"{args.scores_table}.unfinished", "wt") as out_f: 41 | header = next(f).strip().split("\t") 42 | transcript_id_index = 4 43 | percentile_index = 9 44 | assert header[transcript_id_index] == "gene_name" 45 | assert header[percentile_index] == "percentile_PAI3D" 46 | 47 | if args.show_progress_bar: 48 | f = tqdm.tqdm(f, unit=" lines", unit_scale=True) 49 | 50 | output_columns = [ 51 | "chrom", "pos", "ref", "alt", "PAI3D_percentile", "PAI3D_gene_threshold", 52 | ] 53 | out_f.write("\t".join(output_columns) + "\n") 54 | 55 | for i, line in enumerate(f): 56 | fields = line.strip().split("\t") 57 | output_row = fields[:4] 58 | transcript_id = fields[transcript_id_index] 59 | 60 | if transcript_id not in transcript_to_percentile_threshold_map: 61 | raise ValueError(f"Transcript ID {transcript_id} from {args.score_table} not found in {args.gene_thresholds_csv}") 62 | 63 | percentile = fields[percentile_index] 64 | output_row += [percentile, f"{float(transcript_to_percentile_threshold_map[transcript_id]):0.3f}"] 65 | out_f.write("\t".join(output_row) + "\n") 66 | 67 | output_table_path = args.scores_table.replace(".txt.gz", "") + ".with_gene_thresholds.txt.gz" 68 | subprocess.check_output(f"bgzip {args.scores_table}.unfinished", shell=True) 69 | subprocess.check_output(f"mv {args.scores_table}.unfinished.gz {output_table_path}", shell=True) 70 | subprocess.check_output(f"tabix -f -S 1 -s 1 -b 2 -e 2 {output_table_path}", shell=True) 71 | #subprocess.check_output(f"gsutil -m cp {output_table_path} {output_table_path}.tbi gs://spliceai-lookup-reference-data/", shell=True) 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /test_spliceai.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from server import SPLICEAI_ANNOTATOR, SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK, VariantRecord, parse_variant, process_variant 3 | 4 | 5 | class Test(unittest.TestCase): 6 | 7 | def test_parse_variant(self): 8 | self.assertEqual(parse_variant("chr3:12345 A>G"), ("3", 12345, "A", "G")) 9 | self.assertEqual(parse_variant("3:12345:A:G"), ("3", 12345, "A", "G")) 10 | self.assertEqual(parse_variant("chrX:12345:A:G"), ("X", 12345, "A", "G")) 11 | self.assertEqual(parse_variant("chrY:12345:A:G"), ("Y", 12345, "A", "G")) 12 | with self.assertRaises(ValueError): 13 | parse_variant("Z:12345:A:G") 14 | 15 | def test_spliceai_results(self): 16 | # from test_data/spliceai_scores.raw.indel.hg38_subset.vcf.gz 17 | # 1 69091 . A AA . . SpliceAI=AA|OR4F5|0.00|0.00|0.03|0.00|-15|42|2|24 18 | # 1 69124 . GATT G . . 
SpliceAI=G|OR4F5|0.00|0.02|0.00|0.06|18|9|27|-31 19 | 20 | variant = "1 69091 A AA" 21 | for distance in SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_DISTANCE - 1: 22 | result = process_variant(variant, "38", distance, SPLICEAI_DEFAULT_MASK) 23 | self.assertEqual(result['variant'], variant) 24 | self.assertEqual(result['chrom'], "1") 25 | self.assertEqual(result['pos'], 69091) 26 | self.assertEqual(result['ref'], "A") 27 | self.assertEqual(result['alt'], "AA") 28 | self.assertEqual(result['genome_version'], "38") 29 | self.assertEqual(result['source'], "lookup" if distance == SPLICEAI_DEFAULT_DISTANCE else "computed") 30 | self.assertListEqual(result['scores'], ["OR4F5|0.00|0.00|0.03|0.00|-15|42|2|24"]) 31 | 32 | variant = "1:69539:T:G" 33 | for masked in 0, 1: 34 | for distance in SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_DISTANCE - 1: 35 | result = process_variant(variant, "38", distance, masked) 36 | self.assertEqual(result['variant'], variant) 37 | self.assertEqual(result['chrom'], "1") 38 | self.assertEqual(result['pos'], 69539) 39 | self.assertEqual(result['ref'], "T") 40 | self.assertEqual(result['alt'], "G") 41 | self.assertEqual(result['genome_version'], "38") 42 | self.assertEqual(result['source'], "lookup" if distance == SPLICEAI_DEFAULT_DISTANCE else "computed") 43 | self.assertListEqual(result['scores'], ["OR4F5|0.00|0.01|0.11|0.29|20|-2|49|-2"] if not masked else ["OR4F5|0.00|0.00|0.11|0.00|20|-2|49|-2"]) 44 | 45 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179531962-C-A")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 46 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179532167-A-G")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 47 | #print(get_delta_scores(VariantRecord(*parse_variant("2-179529170-GACAGTTAAGAATGTACCTTTGACAGGTACA-G")), SPLICEAI_ANNOTATOR["37"], SPLICEAI_DEFAULT_DISTANCE, SPLICEAI_DEFAULT_MASK)) 48 | 49 | -------------------------------------------------------------------------------- /annotations/combine_score_tables.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gzip 3 | import os 4 | import tqdm 5 | 6 | chrom_to_index = {f"chr{i}": i for i in range(1, 23)} 7 | chrom_to_index["chrX"] = 23 8 | chrom_to_index["chrY"] = 24 9 | index_to_chrom = {i: v for v, i in chrom_to_index.items()} 10 | base_to_index = {c: i for i, c in enumerate("ACGT")} 11 | index_to_base = {i: c for c, i in base_to_index.items()} 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-r", "--genome-version", choices=["hg38", "hg19"], required=True) 15 | args = parser.parse_args() 16 | 17 | hg38_or_hg19 = args.genome_version 18 | 19 | ## Process hg38 tables 20 | all_keys = set() 21 | table1_lookup = {} 22 | table1_path = os.path.expanduser(f"~/code/SpliceAI-lookup/annotations/PrimateAI_3D.{hg38_or_hg19}.with_gene_thresholds.txt.gz") 23 | 24 | chrom_column = "chr" if hg38_or_hg19 == "hg38" else "chrom" 25 | ref_column = "non_flipped_ref" if hg38_or_hg19 == "hg38" else "ref" 26 | alt_column = "non_flipped_alt" if hg38_or_hg19 == "hg38" else "alt" 27 | percentile_column = "percentile_PAI3D" if hg38_or_hg19 == "hg38" else "PAI3D_percentile" 28 | gene_threshold_column = "PAI3D_Gene_Percentile_Threshold" if hg38_or_hg19 == "hg38" else "PAI3D_gene_threshold" 29 | 30 | print(f"Reading table #1 from {table1_path}") 31 | with gzip.open(table1_path, "rt") as f: 32 | header = next(f).strip().split("\t") 33 | header_indices = {c: 
i for i, c in enumerate(header)} 34 | counter = 0 35 | for line in tqdm.tqdm(f, unit=" lines", unit_scale=True, total=70_667_467): 36 | fields = line.rstrip().split("\t") 37 | chrom = fields[header_indices[chrom_column]] 38 | if "_" in chrom: 39 | # skip supercontigs 40 | continue 41 | 42 | key = ( 43 | chrom_to_index[chrom], 44 | int(fields[header_indices["pos"]]), 45 | base_to_index[fields[header_indices[ref_column]]], 46 | base_to_index[fields[header_indices[alt_column]]] 47 | ) 48 | all_keys.add(key) 49 | table1_lookup[key] = [ 50 | float(fields[header_indices[percentile_column]]), 51 | float(fields[header_indices[gene_threshold_column]]), 52 | ] 53 | counter += 1 54 | 55 | print(f"Parsed {counter:,d} records from table #1") 56 | 57 | table2_lookup = {} 58 | if hg38_or_hg19 == "hg38": 59 | table2_path = os.path.expanduser("~/code/SpliceAI-lookup/annotations/promoterAI_tss500.tsv.gz") 60 | else: 61 | table2_path = os.path.expanduser("~/code/SpliceAI-lookup/annotations/promoterAI_tss500_hg19.tsv.gz") 62 | 63 | print(f"Reading table #2 from {table2_path}") 64 | with gzip.open(table2_path, "rt") as f: 65 | header = next(f).strip().split("\t") 66 | header_indices = {c: i for i, c in enumerate(header)} 67 | counter = 0 68 | for line in tqdm.tqdm(f, unit=" lines", unit_scale=True, total=261_666_406): 69 | fields = line.rstrip().split("\t") 70 | chrom = fields[header_indices[chrom_column]] 71 | if "_" in chrom: 72 | # skip supercontigs 73 | continue 74 | 75 | key = ( 76 | chrom_to_index[chrom], 77 | int(fields[header_indices["pos"]]), 78 | base_to_index[fields[header_indices["ref"]]], 79 | base_to_index[fields[header_indices["alt"]]] 80 | ) 81 | all_keys.add(key) 82 | table2_lookup[key] = float(fields[header_indices["promoterAI"]]) 83 | counter += 1 84 | 85 | print(f"Parsed {counter:,d} records from table #2") 86 | output_path = f"PrimateAI_and_PromoterAI_scores.{hg38_or_hg19}.tsv" 87 | print(f"Writing output to {output_path}") 88 | with open(output_path, "wt") as f: 89 | f.write("\t".join([ 90 | "chrom", 91 | "pos", 92 | "ref", 93 | "alt", 94 | "PAI3D_percentile", 95 | "PAI3D_gene_threshold", 96 | "PromoterAI_score", 97 | ]) + "\n") 98 | 99 | for key in tqdm.tqdm(sorted(all_keys), unit=" records", unit_scale=True, total=len(all_keys)): 100 | chrom_index, pos, ref_index, alt_index = key 101 | percentile_PAI3D, PAI3D_gene_threshold = table1_lookup.get(key, (None, None)) 102 | promoterAI_score = table2_lookup.get(key, None) 103 | 104 | f.write("\t".join([ 105 | index_to_chrom[chrom_index], 106 | str(pos), 107 | index_to_base[ref_index], 108 | index_to_base[alt_index], 109 | f"{percentile_PAI3D:.3f}" if percentile_PAI3D is not None else "", 110 | f"{PAI3D_gene_threshold:.2f}" if PAI3D_gene_threshold is not None else "", 111 | f"{promoterAI_score:.3f}" if promoterAI_score is not None else "", 112 | ]) + "\n") 113 | 114 | os.system(f"bgzip {output_path}") 115 | os.system(f"tabix -f -S 1 -s 1 -b 2 -e 2 {output_path}.gz") 116 | 117 | #%% 118 | -------------------------------------------------------------------------------- /annotations/convert_gtf_to_SpliceAI_annotation_input_format.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import argparse 4 | import collections 5 | import gzip 6 | import json 7 | import os 8 | import pandas as pd 9 | import re 10 | 11 | from annotation_utils.gtf_utils import parse_gtf 12 | 13 | 14 | def main(): 15 | p = argparse.ArgumentParser(description="""This script takes a Gencode .gtf.gz file 16 | and outputs an 
annotation file which can be passed to SpliceAI instead of 17 | the default SpliceAI annotations which are still on Gencode v24.""") 18 | 19 | p.add_argument("-a", "--annotation-json-path", required=True, help="Path of the transcript annotations JSON file " 20 | "created by generate_transcript_annotation_json.py") 21 | p.add_argument("--gtf-id-field", default="transcript_id", choices=["transcript_id", "gene_id"]) 22 | p.add_argument("gtf_gz_path", help="Path of gene annotations file in GTF format") 23 | args = p.parse_args() 24 | 25 | for path in args.annotation_json_path, args.gtf_gz_path: 26 | if not os.path.exists(path): 27 | p.error(f"File not found: {path}") 28 | 29 | fopen = gzip.open if args.gtf_gz_path.endswith("gz") else open 30 | with fopen(args.annotation_json_path, "rt") as f: 31 | transcript_annotations = json.load(f) 32 | 33 | print(f"Parsing {args.gtf_gz_path}") 34 | gtf_id_to_exons = collections.defaultdict(set) 35 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), "exon"): 36 | key = (record[args.gtf_id_field], record["strand"], record["chrom"]) 37 | exon_tuple = (record["start"], record["end"]) 38 | if exon_tuple in gtf_id_to_exons[key]: 39 | raise ValueError(f"Duplicate exon: {exon_tuple} in transcript {key}") 40 | gtf_id_to_exons[key].add(exon_tuple) 41 | 42 | output_records = [] 43 | # SpliceAI predictions (prior to 'masking') depend only on transcript chrom/start/end/strand. Often, transcripts 44 | # within the same gene have the same chrom/start/end/strand and differ only in their internal exon structure. 45 | # We can discard these redundant transcripts (while making sure to keep all MANE Select and canonical transcripts). 46 | output_records_transcript_keys = set() 47 | maybe_output_records = [] 48 | for (gtf_id, strand, chrom), exon_set in gtf_id_to_exons.items(): 49 | tx_start_0based = min([start_1based - 1 for start_1based, _ in exon_set]) 50 | tx_end_1based = max([end_1based for _, end_1based in exon_set]) 51 | gtf_id_without_version = gtf_id.split(".")[0] 52 | if gtf_id_without_version not in transcript_annotations: 53 | print(f"WARNING: transcript {gtf_id_without_version} not found in {args.annotation_json_path}") 54 | continue 55 | transcript_annotation = transcript_annotations[gtf_id_without_version] 56 | if transcript_annotation['t_priority'] == "N": 57 | output_list = maybe_output_records 58 | else: 59 | output_list = output_records 60 | transcript_key = (chrom, strand, str(tx_start_0based), str(tx_end_1based)) 61 | output_records_transcript_keys.add(transcript_key) 62 | 63 | # if it's a MANE Select, MANE Plus Clinical or canonical transcript 64 | exon_list = sorted(list(exon_set)) 65 | exon_starts_0based = [start_1based - 1 for start_1based, _ in exon_list] 66 | exon_ends_1based = [end_1based for _, end_1based in exon_list] 67 | 68 | # reformat the records into a list which can be turned into a pandas DataFrame 69 | output_list.append({ 70 | "#NAME": gtf_id, 71 | "CHROM": chrom, 72 | "STRAND": strand, 73 | "TX_START": str(tx_start_0based), 74 | "TX_END": str(tx_end_1based), 75 | "EXON_START": ",".join([str(s) for s in exon_starts_0based]) + ",", 76 | "EXON_END": ",".join([str(s) for s in exon_ends_1based]) + ",", 77 | }) 78 | 79 | transcripts_kept_counter1 = len(output_records) 80 | transcripts_kept_counter2 = 0 81 | for output_record in maybe_output_records: 82 | transcript_key = (output_record["CHROM"], output_record["STRAND"], output_record["TX_START"], output_record["TX_END"]) 83 | if transcript_key not in output_records_transcript_keys: 
84 | # if this transcript has a chrom/start/end/strand that hasn't been seen before, add it to the output 85 | output_records_transcript_keys.add(transcript_key) 86 | output_records.append(output_record) 87 | transcripts_kept_counter2 += 1 88 | 89 | assert transcripts_kept_counter1 + transcripts_kept_counter2 == len(output_records) 90 | print(f"Kept {transcripts_kept_counter1:,d} transcripts which were MANE Select, MANE Plus Clinical or canonical.") 91 | print(f"Kept {transcripts_kept_counter2:,d} additional transcripts with unique transcript start/stop coords.") 92 | print(f"Discarded {len(maybe_output_records) - transcripts_kept_counter2:,d} out of {len(gtf_id_to_exons):,d} " 93 | f"({(len(maybe_output_records) - transcripts_kept_counter2) / len(gtf_id_to_exons):.1%}) transcripts " 94 | f"because they were redundant.") 95 | 96 | output_df = pd.DataFrame(output_records) 97 | output_df = output_df[["#NAME", "CHROM", "STRAND", "TX_START", "TX_END", "EXON_START", "EXON_END"]] 98 | output_path = re.sub(".gtf.gz$", "", os.path.basename(args.gtf_gz_path)) + ".txt.gz" 99 | output_df.to_csv(output_path, index=False, sep="\t") 100 | 101 | print(f"Wrote {len(output_df):,d} records to {os.path.abspath(output_path)}") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains: 2 | - client-side code for [spliceailookup.broadinstitute.org](https://spliceailookup.broadinstitute.org/) - contained within the [index.html](index.html) file and hosted via GitHub Pages. 3 | - server-side code for SpliceAI and Pangolin REST APIs - contained within the [google_cloud_run_services/](google_cloud_run_services/) subdirectory and hosted on Google Cloud Run. 4 | 5 | --- 6 | 7 | #### SpliceAI, Pangolin APIs 8 | 9 | 10 | NOTE: These APIs are intended for interactive use only, and do not support more than several requests per user per minute. More frequent queries will trigger a "rate limit" error in the response. To process large batches of variants, please set up and query your own local instance of the API server. This is easy to do using the publicly available docker images (see below for details). Alternatively, you can intall and run the underlying SpliceAI and/or Pangolin models directly on your local infrastructure. Their source code is available @ [https://github.com/bw2/SpliceAI](https://github.com/bw2/SpliceAI) and [https://github.com/bw2/Pangolin](https://github.com/bw2/Pangolin).
11 |
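For example, once a local instance is running (see the "Running Your Own Local API Server" section below), a batch of variants can be scored with a short script along these lines — a minimal sketch rather than part of the repo's tooling; the `localhost:8080` URL and the `hg`/`variant`/`distance`/`mask` parameters follow the documentation below, while the variant list and timeout are placeholders:

```python
# Minimal sketch: score a batch of variants against a local SpliceAI API instance
# started from one of the docker images described below (listening on localhost:8080).
import json
import requests

variants = ["chr8-140300616-T-G", "chr1-69091-A-C"]  # chrom-pos-ref-alt format

for variant in variants:
    response = requests.get(
        "http://localhost:8080/spliceai/",
        params={"hg": "38", "variant": variant, "distance": 50, "mask": 0},
        timeout=300,  # the first request can be slow while the model loads
    )
    response.raise_for_status()
    print(variant, json.dumps(response.json(), indent=2))
```

The same request pattern works against the public endpoints listed below, but those only support a few requests per user per minute.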
12 | 13 | The SpliceAI and Pangolin APIs have different base URLs for different genome versions: 14 | 15 | `https://spliceai-37-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=37&variant=` - SpliceAI for variants on GRCh37
16 | `https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=` - SpliceAI for variants on GRCh38
17 | `https://pangolin-37-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=37&variant=` - Pangolin for variants on GRCh37
18 | `https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=` - Pangolin for variants on GRCh38
19 | 20 | To query the API, append your variant of interest in `chrom-pos-ref-alt` format to the appropriate base URL above. 21 | 22 | For example, to get SpliceAI scores for `chr8-140300616-T-G`:
23 | 24 | *[https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=chr8-140300616-T-G](https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg=38&variant=chr8-140300616-T-G)* 25 | 26 | or to get Pangolin scores while also setting the `distance` and `mask` parameters:
27 | 28 | *[https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=chr8-140300616-T-G&distance=1000&mask=1](https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/?hg=38&variant=chr8-140300616-T-G&distance=1000&mask=1)* 29 | 30 | #### API parameters 31 | 32 | Parameter descriptions: 33 | 34 | - **variant** (required) a variant in the format "chrom-pos-ref-alt" 35 | - **hg** (required) can be 37 or 38 36 | - **distance** (optional) distance parameter of SpliceAI model (default: 50) 37 | - **mask** (optional) can be 0 which means raw scores or 1 which means masked scores (default: 0). 38 | Splicing changes corresponding to strengthening annotated splice sites and weakening unannotated splice sites are typically much less pathogenic than weakening annotated splice sites and 39 | strengthening unannotated splice sites. When this parameter is = 1 (masked), the delta scores of such splicing changes are set to 0. SpliceAI developers recommend using raw (0) for alternative splicing analysis and masked (1) for variant interpretation. 40 | 41 | 42 | --- 43 | #### Running Your Own Local API Server 44 | 45 | If you have [docker](https://docs.docker.com/engine/install/) installed, you can easily start your own SpliceAI-lookup API server by running one of these commands (depending on which model and genome version you want to query): 46 | 47 | ``` 48 | docker run -p 8080:8080 docker.io/weisburd/spliceai-38:latest 49 | docker run -p 8080:8080 docker.io/weisburd/spliceai-37:latest 50 | docker run -p 8080:8080 docker.io/weisburd/pangolin-38:latest 51 | docker run -p 8080:8080 docker.io/weisburd/pangolin-37:latest 52 | ``` 53 | When it starts, it will print: 54 | ``` 55 | * Serving Flask app 'server' 56 | * Debug mode: on 57 | ``` 58 | 59 | Let's say you ran the `spliceai-38` instance. You should then be able to query it by, for example, opening http://localhost:8080/spliceai/?hg=38&variant=chr8-140300616-T-G in your browser. 60 | The docker container will initially print: 61 | ``` 62 | ERROR: Unable to connect to SQL database... 63 | WARNING:absl:No training configuration found... 64 | WARNING:tensorflow:... 65 | ``` 66 | but these messages can be ignored, and subsequent queries will run faster. 67 | 68 | 69 | If you would like to run your own API instance on Google Cloud instead of locally, see the [build_and_deploy.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/build_and_deploy.py#L224-L238) script which we use to deploy and update the SpliceAI-lookup API on [Google Cloud Run](https://cloud.google.com/run?hl=en). Submit a GitHub issue if you have any questions. 70 | 71 | --- 72 | #### Code Overview For Developers 73 | 74 | The [spliceailookup.broadinstitute.org](https://spliceailookup.broadinstitute.org) front-end is contained within [index.html](index.html). It uses ES6 javascript with [Semantic UI](https://semantic-ui.com) and [jQuery](https://en.wikipedia.org/wiki/JQuery). Also, it uses a [custom version of igv.js](https://github.com/bw2/igv.js) that includes new track types for visualizing the SpliceAI & Pangolin scores. The new server-side code is in the [google_cloud_run_services/](google_cloud_run_services/) subdirectory and includes Dockerfiles for building API server images, as well as the [build_and_deploy.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/build_and_deploy.py#L224-L238) script for deploying SpliceAI and Pangolin API services to [Google Cloud Run](https://cloud.google.com/run?hl=en). 
75 | The API server logic is in [google_cloud_run_services/server.py](https://github.com/broadinstitute/SpliceAI-lookup/blob/master/google_cloud_run_services/server.py) and uses the [Flask](https://flask.palletsprojects.com/en/3.0.x) library. 76 | 77 | 78 | -------------------------------------------------------------------------------- /google_cloud_run_services/test_score_consistency.py: -------------------------------------------------------------------------------- 1 | """Use the cache to check if any scores have changed since the last time the scores were computed.""" 2 | 3 | import collections 4 | import configargparse 5 | import json 6 | import os 7 | import pandas as pd 8 | import psycopg2 9 | import re 10 | import requests 11 | import tqdm 12 | import time 13 | 14 | from contextlib import contextmanager 15 | 16 | @contextmanager 17 | def get_db_connection(): 18 | """Get a database connection""" 19 | #conn = DATABASE_CONNECTION_POOL.getconn() 20 | conn = psycopg2.connect( 21 | dbname="spliceai-lookup-db", 22 | user="postgres", 23 | password=os.environ.get("DB_PASSWORD"), 24 | host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 25 | port="5432", 26 | connect_timeout=5, 27 | ) 28 | 29 | try: 30 | yield conn 31 | finally: 32 | conn.close() 33 | 34 | @contextmanager 35 | def get_db_cursor(conn): 36 | """Get a database cursor""" 37 | cursor = conn.cursor() 38 | try: 39 | yield cursor 40 | conn.commit() 41 | finally: 42 | cursor.close() 43 | 44 | def run_sql(conn, sql_query, *params): 45 | with get_db_cursor(conn) as cursor: 46 | cursor.execute(sql_query, *params) 47 | try: 48 | results = cursor.fetchall() 49 | except: 50 | results = [] 51 | return results 52 | 53 | p = configargparse.ArgParser(default_config_files=["~/.spliceai_lookup_db_config"]) 54 | p.add_argument("--ip", required=True) 55 | p.add_argument("--user", required=True) 56 | p.add_argument("--password", required=True) 57 | p.add_argument("--db", default="spliceai-lookup-db") 58 | p.add_argument("-n", type=int, help="number of rows to query", default=1000) 59 | p.add_argument("-p", "--show-progress-bar", action="store_true") 60 | args, _ = p.parse_known_args() 61 | 62 | myip = requests.get("http://checkip.dyndns.com").text 63 | myip_match = re.search(r'Address: (\d+\.\d+\.\d+\.\d+)', myip) 64 | if myip_match: 65 | myip_match = myip_match.group(1) 66 | 67 | days_ago = 30 68 | conn = psycopg2.connect(f"dbname='{args.db}' user='{args.user}' host='{args.ip}' password='{args.password}'") 69 | #query = f"SELECT key, value, accessed FROM cache WHERE accessed < now() - INTERVAL '{days_ago} days' ORDER BY accessed ASC" 70 | #query = f"SELECT key, value, accessed FROM cache WHERE key LIKE 'pangolin%hg38%' AND accessed > now() - INTERVAL '{days_ago} days' ORDER BY accessed ASC" 71 | query = f"SELECT key, value, accessed FROM cache WHERE key LIKE 'pangolin%hg38%' ORDER BY accessed ASC" 72 | df = pd.read_sql_query(query, conn) 73 | print(f"Retrieved {len(df):,d} records from cache that were last accessed less than {days_ago} days ago.") 74 | if args.n: 75 | keep_every_kth_record = len(df)//args.n 76 | if keep_every_kth_record > 1: 77 | df = df[df.index % keep_every_kth_record == 0] 78 | print(f"Kept {len(df):,d} records after applying -n {args.n} arg") 79 | 80 | counter = collections.Counter() 81 | iterator = zip(df.key, df.value, df.accessed) 82 | if args.show_progress_bar: 83 | iterator = tqdm.tqdm(iterator, total=len(df), unit=" variants", unit_scale=True) 84 | 85 | for i, (cache_key, cache_value, last_accessed) 
in enumerate(iterator): 86 | print(f"{i+1:3,d}: Processing", cache_key, "which was last accessed on", last_accessed) 87 | data = json.loads(cache_value) 88 | 89 | if not data.get("scores"): 90 | print("ERROR: No scores found in cached value. Skipping...") 91 | continue 92 | 93 | tool = data["source"].split(":")[0] 94 | hg = data["genomeVersion"] 95 | distance = data["distance"] 96 | cache_key = cache_key.replace("__basic", "").replace("__comprehensive", "") 97 | assert cache_key[-2:] in ("m1", "m0") 98 | mask = cache_key[-1] 99 | variant = data["variant"] 100 | 101 | # get json response 102 | # time requests 103 | start_time = time.time() 104 | url = f"https://{tool}-{hg}-xwkwwwxdwq-uc.a.run.app/{tool}/?hg={hg}&distance={distance}&mask={mask}&variant={variant}&raw={variant}" 105 | # print(url) 106 | try: 107 | response_json = requests.get(f"{url}&force=1").json() 108 | except Exception as e: 109 | print(f"ERROR: {e} when retrieving {url} Skipping...") 110 | continue 111 | 112 | if not response_json.get("scores"): 113 | print(f"ERROR: {url} response doesn't contain scores: {response_json}. Skipping...") 114 | continue 115 | 116 | if myip_match: 117 | print(f"Deleting logs for ip {myip_match}") 118 | run_sql(conn, f"DELETE FROM log WHERE ip='{myip_match}'") 119 | 120 | 121 | elapsed_time = time.time() - start_time 122 | response_json["scores"] = list(sorted(response_json["scores"], key=lambda s: s.get("t_id"))) 123 | response_scores = response_json["scores"][0] 124 | 125 | data["scores"] = list(sorted(data["scores"], key=lambda s: s.get("t_id"))) 126 | cached_scores = data["scores"][0] 127 | 128 | if not response_scores.get("t_id") or response_scores.get("t_id") != cached_scores.get("t_id"): 129 | print("Transcript ids don't match:", response_scores.get("t_id"), "vs", cached_scores.get("t_id"), 130 | ". Skipping...") 131 | continue 132 | 133 | if not response_scores.get("g_id") or response_scores.get("g_id") != cached_scores.get("g_id"): 134 | print("Gene ids don't match:", response_scores.get("g_id"), "vs", cached_scores.get("g_id"), 135 | ". Skipping...") 136 | continue 137 | 138 | counter[f" {tool}"] += 1 139 | counter[f" hg{hg}"] += 1 140 | counter[f" m{mask}"] += 1 141 | 142 | missing_keys = set() 143 | mismatched_values = set() 144 | values_to_print = set() 145 | for k, v1 in cached_scores.items(): 146 | if k in ("t_refseq_ids", "t_id", "g_id", "g_name"): 147 | # differences in gene ids are not important 148 | continue 149 | 150 | if k not in response_scores: 151 | missing_keys.add(k) 152 | continue 153 | 154 | v2 = response_scores[k] 155 | try: 156 | diff = float(v1) - float(v2) 157 | except: 158 | diff = "?" 159 | 160 | values_to_print.add((k, v1, v2, diff)) 161 | if v1 != v2: 162 | mismatched_values.add((k, v1, v2, diff)) 163 | continue 164 | 165 | if missing_keys: 166 | print(f"ERROR: {cache_key} which was last accessed on {last_accessed} is missing keys: {missing_keys}. 
Response: {json.dumps(response_json, indent=1)}") 167 | 168 | if mismatched_values: 169 | counter["ERROR: mismatched_values"] += 1 170 | print(f"ERROR: {cache_key} which was last accessed on {last_accessed} has mismatched values for keys: " 171 | f"{', '.join(sorted([t[0] for t in mismatched_values]))} " 172 | f"with max delta_score_diff=" 173 | f"{max([abs(t[3]) for t in mismatched_values if t[0].startswith('DS')] or [None])} " 174 | f"and max raw_score_diff=" 175 | f"{max([abs(t[3]) for t in mismatched_values if t[0].startswith('S')] or [None])} ") 176 | 177 | for k, v1, v2, diff in sorted(values_to_print): 178 | print(f" {k}: {v1} vs {v2} diff: {diff}") 179 | 180 | #print(f" Cache: {json.dumps(data, indent=1)}") 181 | #print(f" Response: {json.dumps(response_json, indent=1)}") 182 | 183 | df = pd.read_sql_query(f"SELECT * FROM log WHERE variant='{variant}'", conn) 184 | print(f" Log:") 185 | print(df.to_string(index=False)) 186 | 187 | print(f"{i+1:3,d}: Done with", cache_key, f"elapsed_time={elapsed_time:.1f}s") 188 | 189 | conn.close() 190 | 191 | print(f"Done") 192 | 193 | print("Stats:") 194 | for key, value in sorted(counter.items()): 195 | print(f"{value:10,d} {key}") 196 | 197 | #%% 198 | -------------------------------------------------------------------------------- /annotations/generate_transcript_annotation_json.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | """This script creates a json file with annotation fields for each transcript id, including 4 | whether it's a "MANE select" transcript, canonical transcript, etc. 5 | """ 6 | 7 | import argparse 8 | import gzip 9 | import json 10 | import os 11 | import re 12 | 13 | from annotation_utils.get_ensembl_db_info import get_gene_id_to_canonical_transcript_id, \ 14 | get_ensembl_ENST_to_RefSeq_ids 15 | from annotation_utils.get_MANE_table import get_MANE_ensembl_transcript_table 16 | from annotation_utils.gtf_utils import parse_gtf 17 | 18 | 19 | # to get the latest database name, run: 20 | # mysql -h useastdb.ensembl.org -u anonymous -e "show databases;" | grep homo_sapiens_core 21 | DEFAULT_ENSEMBL_DATABASE = "homo_sapiens_core_115_38" 22 | 23 | # this is used to get the list of MANE select and MANE plus clinical ENST transcript ids. 24 | DEFAULT_MANE_URL_BASE = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.4" 25 | DEFAULT_MANE_SUMMARY_TABLE_FILENAME = "MANE.GRCh38.v1.4.summary.txt.gz" 26 | 27 | def main(): 28 | p = argparse.ArgumentParser(description="""This script takes a Gencode .gtf.gz file 29 | and outputs an annotation file which can be passed to SpliceAI instead of 30 | the default SpliceAI annotations which are still on Gencode v24. 
31 | """) 32 | p.add_argument("--mane-url-base", default=DEFAULT_MANE_URL_BASE) 33 | p.add_argument("--mane-summary-table-filename", default=DEFAULT_MANE_SUMMARY_TABLE_FILENAME) 34 | p.add_argument("-e", "--ensembl-database", default=DEFAULT_ENSEMBL_DATABASE) 35 | p.add_argument("gtf_gz_path", help="Path of gene annotations file in GTF format") 36 | args = p.parse_args() 37 | 38 | mane_summary_table_url = os.path.join(args.mane_url_base, args.mane_summary_table_filename) 39 | MANE_df = get_MANE_ensembl_transcript_table(mane_summary_table_url=mane_summary_table_url) 40 | 41 | print(f"Initalizing transcript priority annotation function") 42 | compute_transcript_priority = get_transcript_priority_annotation_function( 43 | ensembl_database=args.ensembl_database, MANE_df=MANE_df) 44 | 45 | esnembl_ENST_to_RefSeq_ids = get_ensembl_ENST_to_RefSeq_ids(database=args.ensembl_database) 46 | print(f"Downloaded {len(esnembl_ENST_to_RefSeq_ids):,d} ENST to RefSeq mappings") 47 | for key, refseq_ids in esnembl_ENST_to_RefSeq_ids.items(): 48 | esnembl_ENST_to_RefSeq_ids[key] = list(sorted(refseq_ids)) 49 | 50 | MANE_df["ensembl_ENST_without_version"] = MANE_df["Ensembl_nuc"].apply(lambda s: s.split(".")[0]) 51 | MANE_ensembl_ENST_to_RefSeq_id = dict(MANE_df[["ensembl_ENST_without_version", "RefSeq_nuc"]].itertuples(index=False)) 52 | print(f"Got {len(MANE_ensembl_ENST_to_RefSeq_id):,d} MANE ENST to RefSeq mappings, of which " 53 | f"{len(set(MANE_ensembl_ENST_to_RefSeq_id) - set(esnembl_ENST_to_RefSeq_ids)):,d} are unique.") 54 | MANE_ensembl_ENST_to_RefSeq_id = {k: [v] for k, v in MANE_ensembl_ENST_to_RefSeq_id.items()} 55 | esnembl_ENST_to_RefSeq_ids.update(MANE_ensembl_ENST_to_RefSeq_id) 56 | 57 | gene_coordinates_lookup = {} # used for checking coordinate consistency between gene and transcript records (eg. 
GPR143) 58 | max_transcript_coordinates_lookup = {} 59 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), feature_type="gene"): 60 | gene_id_without_version = record["gene_id"].split(".")[0] 61 | gene_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = (record["chrom"], record["start"], record["end"]) 62 | 63 | print(f"Parsing {args.gtf_gz_path}") 64 | output_json = {} 65 | for record in parse_gtf(os.path.expanduser(args.gtf_gz_path), feature_type="transcript"): 66 | transcript_id_without_version = record["transcript_id"].split(".")[0] 67 | transcript_priority = compute_transcript_priority(transcript_id=transcript_id_without_version) 68 | refseq_transcript_ids = esnembl_ENST_to_RefSeq_ids.get(transcript_id_without_version) 69 | output_json[transcript_id_without_version] = { 70 | "g_name": record["gene_name"], 71 | "g_id": record["gene_id"], 72 | "t_id": record["transcript_id"], 73 | "t_type": record["transcript_type"], 74 | "t_strand": record["strand"], 75 | "t_priority": transcript_priority, 76 | "t_refseq_ids": refseq_transcript_ids, 77 | } 78 | 79 | gene_id_without_version = record["gene_id"].split(".")[0] 80 | if (record["gene_name"], gene_id_without_version) in max_transcript_coordinates_lookup:  # use the same (gene_name, gene_id) key as gene_coordinates_lookup 81 | transcript_chrom, transcript_start, transcript_end = max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] 82 | max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = ( 83 | transcript_chrom, min(transcript_start, record["start"]), max(transcript_end, record["end"])) 84 | else: 85 | max_transcript_coordinates_lookup[(record["gene_name"], gene_id_without_version)] = ( 86 | record["chrom"], record["start"], record["end"]) 87 | 88 | # check consistency of gene vs. transcript start/end coordinates 89 | warning1_counter = 0 90 | warning2_counter = 0 91 | for gene_name, gene_id_without_version in gene_coordinates_lookup: 92 | key = gene_name, gene_id_without_version 93 | gene_chrom, gene_start, gene_end = gene_coordinates_lookup[key] 94 | transcript_chrom, transcript_start, transcript_end = max_transcript_coordinates_lookup.get(key, (None, None, None)) 95 | if transcript_chrom is None: 96 | warning1_counter += 1 97 | print(f"WARNING: Gene {gene_name} ({gene_id_without_version}) has no transcript records") 98 | elif transcript_start > gene_start or transcript_end < gene_end: 99 | warning2_counter += 1 100 | #start_diff = f". Start diff is {transcript_start - gene_start:,d}bp" if transcript_start - gene_start else "" 101 | #end_diff = f". 
End diff is {gene_end - transcript_end:,d}bp" if gene_end - transcript_end else "" 102 | #print(f"WARNING: Gene {gene_name} ({gene_id_without_version}) has inconsistent coordinates: " 103 | # f"gene={gene_chrom}:{gene_start}-{gene_end}, " 104 | # f"transcript={transcript_chrom}:{transcript_start}-{transcript_end}" 105 | # f"{start_diff}{end_diff}") 106 | if warning1_counter > 0: 107 | print(f"WARNING: {warning1_counter:,d} genes don't have any transcript records") 108 | if warning2_counter > 0: 109 | print(f"WARNING: {warning2_counter:,d} out of {len(gene_coordinates_lookup):,d} genes have a genomic interval " 110 | f"that is wider than the interval of any of their transcripts.") 111 | 112 | output_path = re.sub(".gtf.gz$", "", os.path.basename(args.gtf_gz_path)) + ".transcript_annotations.json.gz" 113 | with gzip.open(output_path, "wt") as f: 114 | json.dump(output_json, f, indent=4, sort_keys=True) 115 | 116 | print(f"Done writing {len(output_json):,d} transcript annotations to {output_path}") 117 | 118 | 119 | def get_transcript_priority_annotation_function(ensembl_database, MANE_df): 120 | """Initializes annotation data and returns the compute_transcript_priority function.""" 121 | 122 | MANE_select_transcript_ids = { 123 | t_id.split(".")[0] for t_id in MANE_df[MANE_df["MANE_status"] == "MANE Select"]["Ensembl_nuc"]} 124 | print(f"Got {len(MANE_select_transcript_ids):,d} MANE select transcript ids") 125 | 126 | MANE_plus_clinical_transcript_ids = { 127 | t_id.split(".")[0] for t_id in MANE_df[MANE_df["MANE_status"] == "MANE Plus Clinical"]["Ensembl_nuc"]} 128 | print(f"Got {len(MANE_plus_clinical_transcript_ids):,d} MANE plus clinical transcript ids") 129 | 130 | gene_id_to_canonical_transcript_id = get_gene_id_to_canonical_transcript_id(database=ensembl_database) 131 | canonical_transcript_ids = { 132 | t_id.split(".")[0] for t_id in gene_id_to_canonical_transcript_id.values()} 133 | print(f"Got {len(canonical_transcript_ids):,d} canonical transcript ids") 134 | 135 | def compute_transcript_priority(transcript_id): 136 | """Returns a string indicating the priority of the given transcript. 
137 | The return value can be (in order from higher to lower priority): 138 | "MS" (for MANE select) 139 | "MP" (for MANE plus clinical) 140 | "C" (for canonical) 141 | "N" (for none of the above) 142 | """ 143 | transcript_id = transcript_id.split(".")[0] 144 | 145 | if transcript_id in MANE_select_transcript_ids: 146 | transcript_priority = "MS" 147 | elif transcript_id in MANE_plus_clinical_transcript_ids: 148 | transcript_priority = "MP" 149 | elif transcript_id in canonical_transcript_ids: 150 | transcript_priority = "C" 151 | else: 152 | transcript_priority = "N" 153 | 154 | return transcript_priority 155 | 156 | return compute_transcript_priority 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /google_cloud_run_services/build_and_deploy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import time 5 | 6 | import pandas as pd 7 | import re 8 | 9 | logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(message)s") 10 | 11 | VALID_COMMANDS = { 12 | "update_annotations", "build", "deploy", "test", "test2", "run", 13 | } 14 | 15 | GCLOUD_PROJECT = "spliceai-lookup-412920" 16 | DOCKERHUB_REPO = "docker.io/weisburd" 17 | 18 | def get_service_name(tool, genome_version): 19 | return f"{tool}-{genome_version}" 20 | 21 | def get_tag(tool, genome_version, repo_name="gcr.io"): 22 | if repo_name == "gcr.io": 23 | return f"us-central1-docker.pkg.dev/spliceai-lookup-412920/docker/{get_service_name(tool, genome_version)}" 24 | elif repo_name == "dockerhub": 25 | return f"{DOCKERHUB_REPO}/{get_service_name(tool, genome_version)}" 26 | else: 27 | raise ValueError(f"Invalid repo_name arg: {repo_name}") 28 | 29 | def run(c): 30 | logging.info(c) 31 | os.system(c) 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("-g", "--genome-version", choices=["37", "38"], help="If not specified, command will run for both GRCh37 and GRCh38") 36 | parser.add_argument("-t", "--tool", choices=["spliceai", "pangolin"], help="If not specified, command will run for both spliceai and pangolin") 37 | parser.add_argument("-d", "--docker-command", choices=["docker", "podman"], default="docker", help="Whether to use docker or podman to build the image") 38 | g = parser.add_mutually_exclusive_group() 39 | g.add_argument("--gencode-version", 40 | help="The gencode version to use for the 'update_annotations' command (example: 'v49'). Either this " 41 | "or --gencode-gtf must be specified for the 'update_annotations' command") 42 | g.add_argument("--gencode-gtf", 43 | help="Path of the newest 'basic' Gencode GTF file that was downloaded from " 44 | "https://www.gencodegenes.org/human/. Either this or --gencode-version must be specified for " 45 | "the 'update_annotations' command") 46 | 47 | parser.add_argument("command", nargs="?", choices=VALID_COMMANDS, 48 | help="Command to run. 
If not specified, it will run 'build' and then 'deploy'") 49 | 50 | args = parser.parse_args() 51 | 52 | if args.genome_version: 53 | genome_versions = [args.genome_version] 54 | else: 55 | genome_versions = ["38", "37"] 56 | 57 | if args.tool: 58 | tools = [args.tool] 59 | else: 60 | tools = ["spliceai", "pangolin"] 61 | 62 | if args.gencode_version: 63 | if not re.match("v[0-9][0-9]", args.gencode_version): 64 | parser.error("--gencode-version must be of the form 'v46'") 65 | gencode_version_number = int(args.gencode_version.lstrip("v")) 66 | else: 67 | gencode_version_number = None 68 | 69 | if args.command == "update_annotations": 70 | if not args.gencode_version and not args.gencode_gtf: 71 | parser.error("Either --gencode-version or --gencode-gtf must be specified for the update_annotations command") 72 | 73 | gencode_gtf_paths = {} 74 | if args.gencode_version: 75 | for genome_version in genome_versions: 76 | for basic_or_comprehensive in "", ".basic": 77 | if genome_version == "37": 78 | gencode_gtf_url = f"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_version_number}/GRCh37_mapping/gencode.{args.gencode_version}lift37{basic_or_comprehensive}.annotation.gtf.gz" 79 | elif genome_version == "38": 80 | gencode_gtf_url = f"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_version_number}/gencode.{args.gencode_version}{basic_or_comprehensive}.annotation.gtf.gz" 81 | else: 82 | parser.error(f"Invalid genome version: {genome_version}") 83 | 84 | run(f"wget -nc {gencode_gtf_url}") 85 | run(f"wget -nc https://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/gtfToGenePred") 86 | run(f"chmod 777 gtfToGenePred") 87 | gencode_gtf_paths[(genome_version, basic_or_comprehensive)] = os.path.basename(gencode_gtf_url) 88 | else: 89 | if not args.genome_version: 90 | parser.error("If --gencode-gtf is specified, --genome-version is required") 91 | if not os.path.isfile(args.gencode_gtf): 92 | parser.error(f"File not found: {args.gencode_gtf}") 93 | gencode_gtf_paths[(args.genome_version, "basic")] = args.gencode_gtf 94 | 95 | for genome_version, _ in gencode_gtf_paths.keys(): 96 | run(f"rm ./docker/ref/GRCh{genome_version}/gencode.*.basic.annotation.transcript_annotations.json.gz") 97 | run(f"rm ./docker/spliceai/annotations/GRCh{genome_version}/gencode.*.annotation*.txt.gz") 98 | run(f"rm ./docker/pangolin/annotations/GRCh{genome_version}/gencode.*.annotation*.db") 99 | 100 | for (genome_version, basic_or_comprehensive), gencode_gtf_path in gencode_gtf_paths.items(): 101 | # generate genePred files to use as gene tracks in IGV.js 102 | if args.gencode_version: 103 | gene_pred_path = f"gencode.{args.gencode_version}.GRCh{genome_version}.txt" 104 | run(f"./gtfToGenePred -genePredExt -geneNameAsName2 {gencode_gtf_path} {gene_pred_path}") 105 | 106 | print(f"Reading {gene_pred_path}") 107 | column_names = [ 108 | "name", 109 | "chrom", 110 | "strand", 111 | "txStart", 112 | "txEnd", 113 | "cdsStart", 114 | "cdsEnd", 115 | "exonCount", 116 | "exonStarts", 117 | "exonEnds", 118 | "score", 119 | "name2", 120 | "cdsStartStat", 121 | "cdsEndStat", 122 | "exonFrames", 123 | ] 124 | df = pd.read_table(gene_pred_path, names=column_names) 125 | df["txStart"] = df["txStart"].astype(int) 126 | df["txEnd"] = df["txEnd"].astype(int) 127 | filter_exp = (df["txStart"] > 0) & (df["txEnd"] > 0) 128 | df2 = df[filter_exp] 129 | if len(df) - len(df2) > 0: 130 | print(f"Filtered out {len(df) - len(df2):,d} records from {gene_pred_path}:") 131 | print(df[~filter_exp]) 132 | 
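# The block below sorts the genePred records by position, prepends a numeric index column ("i") per gene name,
# writes the table as a bgzipped, tabix-indexed file (chrom ends up in column 3 and txStart/txEnd in columns 5
# and 6, matching the "tabix -s 3 -b 5 -e 6" arguments), and copies it to the gs://tgg-viewer bucket so it can
# be loaded as a gene track in IGV.js.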
133 | df2 = df2.sort_values(["chrom", "txStart", "txEnd"]) 134 | df2["i"] = df2["name2"].map({name: i for i, name in enumerate(df2.name2.unique())}) 135 | df2 = df2[["i"] + column_names] 136 | sorted_gene_pred_path = gene_pred_path.replace(".txt", ".sorted.txt") 137 | df2.to_csv(sorted_gene_pred_path, header=False, index=False, sep="\t") 138 | run(f"bgzip -f {sorted_gene_pred_path}") 139 | run(f"tabix -s 3 -b 5 -e 6 -f {sorted_gene_pred_path}.gz") 140 | 141 | run(f"gsutil -m cp {sorted_gene_pred_path}.gz* gs://tgg-viewer/ref/GRCh{genome_version}/gencode_{args.gencode_version}/") 142 | 143 | # generate SpliceAI annotation files 144 | run(f"python3 ../annotations/generate_transcript_annotation_json.py {gencode_gtf_path}") 145 | output_json_path = gencode_gtf_path.replace(".gtf.gz", ".transcript_annotations.json.gz") 146 | run(f"python3 ../annotations/convert_gtf_to_SpliceAI_annotation_input_format.py -a {output_json_path} {gencode_gtf_path}") 147 | if not os.path.isfile(output_json_path): 148 | raise ValueError(f"Unable to find {output_json_path}") 149 | 150 | run(f"mv {output_json_path} ./docker/ref/GRCh{genome_version}/") 151 | run(f"mv {gencode_gtf_path.replace('.gtf.gz', '.txt.gz')} ./docker/spliceai/annotations/GRCh{genome_version}/") 152 | 153 | if genome_version == "37": 154 | gencode_gtf_path_without_chr_prefix = gencode_gtf_path.replace(".gtf.gz", ".without_chr_prefix.gtf.gz") 155 | run(f"gzcat {gencode_gtf_path} | sed 's/chr//g' | bgzip > {gencode_gtf_path_without_chr_prefix}") 156 | gencode_gtf_path = gencode_gtf_path_without_chr_prefix 157 | 158 | # generate Pangolin annotation files 159 | run(f"python3 create_pangolin_db.py {gencode_gtf_path}") 160 | run(f"mv {gencode_gtf_path.replace('.gtf.gz', '.db')} ./docker/pangolin/annotations/GRCh{genome_version}/") 161 | 162 | if args.gencode_version: 163 | with open("server.py", "rt") as f: 164 | server_py = f.readlines() 165 | 166 | updated_line = False 167 | with open("server.py", "wt") as f: 168 | for i, line in enumerate(server_py): 169 | if line.startswith("GENCODE_VERSION ="): 170 | new_gencode_line = f"GENCODE_VERSION = \"{args.gencode_version}\"" 171 | f.write(f"{new_gencode_line}\n") 172 | updated_line = True 173 | print(f"Updated server.py line #{i} to {new_gencode_line}") 174 | else: 175 | f.write(line) 176 | 177 | with open("../index.html", "rt") as f: 178 | index_html = f.readlines() 179 | 180 | updated_line = False 181 | with open("../index.html", "wt") as f: 182 | for i, line in enumerate(index_html): 183 | if "const GENCODE_VERSION = " in line: 184 | new_gencode_line = f"\tconst GENCODE_VERSION = \"{args.gencode_version}\"" 185 | f.write(f"{new_gencode_line}\n") 186 | updated_line = True 187 | print(f"Updated index.html line #{i} to {new_gencode_line}") 188 | else: 189 | f.write(line) 190 | 191 | if not updated_line: 192 | print("WARNING: Unable to find GENCODE_VERSION line in index.html") 193 | 194 | return 195 | 196 | if args.command == "test2": 197 | run(f"gcloud beta code dev") 198 | return 199 | 200 | if args.command in {"test", "run"}: 201 | if not args.genome_version: 202 | parser.error(f"--genome-version is required for the {args.command} command") 203 | if not args.tool: 204 | parser.error(f"--tool is required for the {args.command} command") 205 | 206 | tag = get_tag(args.tool, args.genome_version) 207 | 208 | if args.command == "run": 209 | print("Run this command: ") 210 | print(f"{args.docker_command} run -it {tag}:latest /bin/bash") 211 | elif args.command == "test": 212 | run(f"{args.docker_command} run -p 
8080:8080 {tag}:latest") 213 | 214 | return 215 | 216 | if not args.command or args.command in {"build", "deploy"}: 217 | if args.docker_command == "podman": 218 | print("WARNING: Google Cloud Run doesn't appear to work with images built using podman. " 219 | "Containers may fail to deploy to Google Cloud Run unless they are built using docker.") 220 | time.sleep(10) 221 | 222 | for genome_version in genome_versions: 223 | for tool in tools: 224 | tag = get_tag(tool, genome_version) 225 | dockerhub_tag = get_tag(tool, genome_version, repo_name="dockerhub") 226 | service = get_service_name(tool, genome_version) 227 | concurrency = 6 # if genome_version == '37' else 2 228 | min_instances = 0 # if tool == 'pangolin' else 2 229 | max_instances = 3 230 | if not args.command or args.command == "build": 231 | if args.docker_command == "podman": 232 | run(f"gcloud --project {GCLOUD_PROJECT} auth print-access-token | podman login -u oauth2accesstoken --password-stdin us-central1-docker.pkg.dev") 233 | 234 | run(f"{args.docker_command} build -f docker/{tool}/Dockerfile --build-arg=\"CONCURRENCY={concurrency}\" --build-arg=\"GENOME_VERSION={genome_version}\" -t {tag}:latest -t {dockerhub_tag}:latest .") 235 | run(f"{args.docker_command} push {tag}:latest") 236 | run(f"{args.docker_command} push {dockerhub_tag}:latest") 237 | 238 | run(f"{args.docker_command} pull {tag}:latest") 239 | run(f"{args.docker_command} inspect --format='{{{{index .RepoDigests 0}}}}' {tag}:latest | cut -f 2 -d @ > docker/{tool}/sha256_grch{genome_version}.txt") # record the image's sha256 240 | 241 | if not args.command or args.command == "deploy": 242 | with open(f"docker/{tool}/sha256_grch{genome_version}.txt") as f: 243 | sha256 = f.read().strip() 244 | 245 | if not re.match("^sha256:[a-f0-9]{64}$", sha256): 246 | raise ValueError(f"Invalid sha256 value found in docker/{tool}/sha256_grch{genome_version}.txt: {sha256}") 247 | 248 | print(f"Deploying {service} with image sha256 {sha256}") 249 | 250 | run(f"""gcloud \ 251 | --project {GCLOUD_PROJECT} beta run deploy {service} \ 252 | --image {tag}@{sha256} \ 253 | --min-instances {min_instances} \ 254 | --service-min-instances {min_instances} \ 255 | --max-instances {max_instances} \ 256 | --concurrency {concurrency} \ 257 | --service-account 1042618492363-compute@developer.gserviceaccount.com \ 258 | --execution-environment gen2 \ 259 | --region us-central1 \ 260 | --update-secrets=DB_PASSWORD=spliceai-lookup-db-password:2 \ 261 | --allow-unauthenticated \ 262 | --memory 4Gi \ 263 | --cpu 4 264 | """) 265 | 266 | # --add-volume=name=ref,type=cloud-storage,bucket=spliceai-lookup-reference-data,readonly=true \ 267 | # --add-volume-mount=volume=ref,mount-path=/ref \ 268 | 269 | if __name__ == "__main__": 270 | main() 271 | -------------------------------------------------------------------------------- /google_cloud_run_services/server.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | import psycopg2 7 | import re 8 | import time 9 | import traceback 10 | 11 | 12 | # used for DB connection pooling 13 | from psycopg2.pool import SimpleConnectionPool 14 | from contextlib import contextmanager 15 | 16 | # flask imports 17 | from flask import Flask, g, request, Response, send_from_directory 18 | from flask_cors import CORS 19 | from flask_talisman import Talisman 20 | 21 | app = Flask(__name__) 22 | 23 | CORS(app) 24 | 25 | 26 | DEBUG = True # if 
socket.gethostname() == "spliceai-lookup" else True 27 | if not DEBUG: 28 | Talisman(app) 29 | 30 | logging.getLogger('werkzeug').disabled = True 31 | 32 | DEFAULT_DISTANCE = 500 # maximum distance between the variant and gained/lost splice site, defaults to 500 33 | MAX_DISTANCE_LIMIT = 10000 34 | DEFAULT_MASK = 0 # mask scores representing annotated acceptor/donor gain and unannotated acceptor/donor loss, defaults to 0 35 | 36 | SPLICEAI_EXAMPLE_URL = f"/spliceai/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G&bc=basic" 37 | PANGOLIN_EXAMPLE_URL = f"/pangolin/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G&bc=basic" 38 | 39 | 40 | VARIANT_RE = re.compile( 41 | "(chr)?(?P<chrom>[0-9XYMTt]{1,2})" 42 | "[-\s:]+" 43 | "(?P<pos>[0-9]{1,9})" 44 | "[-\s:]+" 45 | "(?P<ref>[ACGT]+)" 46 | "[-\s:>]+" 47 | "(?P<alt>[ACGT]+)" 48 | ) 49 | 50 | FASTA_PATH = { 51 | "37": "/hg19.fa.gz", 52 | "38": "/hg38.fa.gz", 53 | } 54 | 55 | GENCODE_VERSION = "v49" 56 | 57 | SHARED_TRANSCRIPT_ANNOTATIONS = {} 58 | SHARED_TRANSCRIPT_ANNOTATION_PATHS = { 59 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.transcript_annotations.json.gz", 60 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.transcript_annotations.json.gz", 61 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.transcript_annotations.json.gz", 62 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.transcript_annotations.json.gz", 63 | } 64 | 65 | TRANSCRIPT_PRIORITY_ORDER = { 66 | "MS": 3, # MANE select transcript 67 | "MP": 2, # MANE plus clinical transcript 68 | "C": 1, # canonical transcript 69 | "N": 0 70 | } 71 | 72 | TOOL = os.environ.get("TOOL") 73 | GENOME_VERSION = os.environ.get("GENOME_VERSION") 74 | if GENOME_VERSION not in ("37", "38"): 75 | raise ValueError(f'Environment variable "GENOME_VERSION" should be set to either "37" or "38" instead of: "{os.environ.get("GENOME_VERSION")}"') 76 | 77 | if TOOL == "spliceai": 78 | from spliceai.utils import Annotator, get_delta_scores 79 | 80 | class VariantRecord: 81 | def __init__(self, chrom, pos, ref, alt): 82 | self.chrom = chrom 83 | self.pos = pos 84 | self.ref = ref 85 | self.alts = [alt] 86 | 87 | def __repr__(self): 88 | return f"{self.chrom}-{self.pos}-{self.ref}-{self.alts[0]}" 89 | 90 | SPLICEAI_ANNOTATOR = {} 91 | SPLICEAI_ANNOTATION_PATHS = { 92 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.txt.gz", 93 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.txt.gz", 94 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.txt.gz", 95 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.txt.gz", 96 | } 97 | 98 | elif TOOL == "pangolin": 99 | from pkg_resources import resource_filename 100 | from pangolin.pangolin import process_variant as process_variant_using_pangolin 101 | from pangolin.model import torch, Pangolin, L, W, AR 102 | import gffutils 103 | 104 | PANGOLIN_ANNOTATION_PATHS = { 105 | ("37", "basic"): f"/gencode.{GENCODE_VERSION}lift37.basic.annotation.without_chr_prefix.db", 106 | ("38", "basic"): f"/gencode.{GENCODE_VERSION}.basic.annotation.db", 107 | ("37", "comprehensive"): f"/gencode.{GENCODE_VERSION}lift37.annotation.without_chr_prefix.db", 108 | ("38", "comprehensive"): f"/gencode.{GENCODE_VERSION}.annotation.db", 109 | } 110 | else: 111 | raise ValueError(f'Environment variable "TOOL" should be set to either "spliceai" or "pangolin" instead of: "{os.environ.get("TOOL")}"') 112 | 113 | 114 | RATE_LIMIT_ERROR_MESSAGE = ( 115 | f"Rate limit 
exceeded. This server only supports interactive use. To process large numbers of variants programmatically, " 116 | f"please install a local instance of the API server, or just run the prediction models directly. Attempts to query large " 117 | f"numbers of variants programmatically will result in loss of access to this API for an extended period of time. Contact " 118 | f"us at https://github.com/broadinstitute/SpliceAI-lookup/issues if you have any questions." 119 | ) 120 | 121 | 122 | def init_spliceai(genome_version, basic_or_comprehensive): 123 | 124 | if (genome_version, basic_or_comprehensive) not in SPLICEAI_ANNOTATOR: 125 | SPLICEAI_ANNOTATOR[(genome_version, basic_or_comprehensive)] = Annotator( 126 | FASTA_PATH[genome_version], 127 | SPLICEAI_ANNOTATION_PATHS[(genome_version, basic_or_comprehensive)] 128 | ) 129 | 130 | 131 | def init_transcript_annotations(genome_version, basic_or_comprehensive): 132 | if (genome_version, basic_or_comprehensive) in SHARED_TRANSCRIPT_ANNOTATIONS: 133 | return 134 | 135 | # init shared transcript annotations 136 | with gzip.open(SHARED_TRANSCRIPT_ANNOTATION_PATHS[(genome_version, basic_or_comprehensive)], "rt") as ta_f: 137 | SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive)] = json.load(ta_f) 138 | 139 | 140 | def error_response(error_message, source=None): 141 | response_json = {"error": str(error_message)} 142 | if source: 143 | response_json["source"] = source 144 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 145 | 146 | 147 | def parse_variant(variant_str): 148 | match = VARIANT_RE.match(variant_str) 149 | if not match: 150 | raise ValueError(f"Unable to parse variant: {variant_str}") 151 | 152 | return match['chrom'], int(match['pos']), match['ref'], match['alt'] 153 | 154 | 155 | #while True: 156 | # # https://groups.google.com/g/google-cloud-sql-discuss/c/mxsaf-YDrbA?pli=1 157 | # # https://cloud.google.com/sql/docs/postgres/flags#gcloud 158 | # 159 | # error_count = 0 160 | # try: 161 | # DATABASE_CONNECTION_POOL = SimpleConnectionPool( 162 | # minconn=1, 163 | # maxconn=5, 164 | # dbname="spliceai-lookup-db", 165 | # user="postgres", 166 | # password=os.environ.get("DB_PASSWORD"), 167 | # host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 168 | # port="5432", 169 | # connect_timeout=5, 170 | # ) 171 | # print(f"Successfully connected to database", flush=True) 172 | # break 173 | # except psycopg2.Error as e: 174 | # error_count += 1 175 | # time.sleep(2) 176 | # print(f"Error connecting to database: {e}", flush=True) 177 | # traceback.print_exc() 178 | # if error_count > 5: 179 | # print(f"Error connecting to database. 
Exiting...", flush=True) 180 | # sys.exit(1) 181 | 182 | 183 | @contextmanager 184 | def get_db_connection(): 185 | """Get a database connection""" 186 | #conn = DATABASE_CONNECTION_POOL.getconn() 187 | try: 188 | conn = psycopg2.connect( 189 | dbname="spliceai-lookup-db", 190 | user="postgres", 191 | password=os.environ.get("DB_PASSWORD"), 192 | host="/cloudsql/spliceai-lookup-412920:us-central1:spliceai-lookup-db", 193 | port="5432", 194 | connect_timeout=5, 195 | ) 196 | except Exception as e: 197 | print(f"ERROR: Unable to connect to SQL database: {e}") 198 | conn = None 199 | 200 | try: 201 | yield conn 202 | finally: 203 | if conn is not None: 204 | conn.close() 205 | #DATABASE_CONNECTION_POOL.putconn(conn) 206 | 207 | @contextmanager 208 | def get_db_cursor(conn): 209 | """Get a database cursor""" 210 | if conn is None: 211 | return 212 | 213 | cursor = conn.cursor() 214 | try: 215 | yield cursor 216 | conn.commit() 217 | finally: 218 | cursor.close() 219 | 220 | 221 | def run_sql(conn, sql_query, *params): 222 | if conn is None: 223 | return 224 | 225 | with get_db_cursor(conn) as cursor: 226 | cursor.execute(sql_query, *params) 227 | try: 228 | results = cursor.fetchall() 229 | except: 230 | results = [] 231 | return results 232 | 233 | 234 | #def does_table_exist(table_name): 235 | # results = run_sql(f"SELECT EXISTS (SELECT 1 AS result FROM pg_tables WHERE tablename=%s)", (table_name,)) 236 | # does_table_already_exist = results[0][0] 237 | # return does_table_already_exist 238 | 239 | #if not does_table_exist("cache"): 240 | # print("Creating cache table") 241 | # run_sql("""CREATE TABLE cache (key TEXT UNIQUE, value TEXT, counter INT, accessed TIMESTAMP DEFAULT now())""") 242 | # run_sql("""CREATE INDEX cache_index ON cache (key)""") 243 | 244 | #if not does_table_exist("log"): 245 | # print("Creating event_log table") 246 | # run_sql("""CREATE TABLE log (event_name TEXT, ip TEXT, logtime TIMESTAMP DEFAULT now(), duration REAL, variant TEXT, genome VARCHAR(10), bc VARCHAR(20), distance INT, mask INT4, details TEXT, variant_consequence TEXT)""") 247 | # run_sql("""CREATE INDEX idx_log_ip_logtime ON log USING btree (ip, logtime DESC)""") 248 | # run_sql("""CREATE INDEX idx_log_event_name ON log USING btree (event_name)""") 249 | 250 | #if not does_table_exist("restricted_ips"): 251 | # print("Creating restricted_ips table") 252 | # run_sql("""CREATE TABLE restricted_ips (ip TEXT UNIQUE, created TIMESTAMP DEFAULT now())""") 253 | # run_sql("""CREATE INDEX idx_restricted_ips_created ON restricted_ips USING btree (created)""") 254 | 255 | # Query to add ip to the restricted_ips table 256 | #run_sql("""INSERT INTO restricted_ips (ip) VALUES ('210.3.222.157')""") 257 | 258 | def is_user_on_whitelist(conn, user_ip): 259 | """Check if the user is on the whitelist""" 260 | if conn is None or not user_ip: 261 | return False 262 | 263 | if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", user_ip): 264 | return False 265 | 266 | rows = run_sql(conn, "SELECT COUNT(ip) FROM whitelist_ips WHERE ip=%s", (user_ip,)) 267 | return rows and int(rows[0][0]) > 0 268 | 269 | def exceeds_rate_limit(conn, user_ip, params): 270 | """Rate limit requests based on user ip address""" 271 | 272 | #""" 273 | #SELECT * FROM log WHERE event_name like '%computed' AND duration > 2 AND ip='210.3.222.157' AND logtime >= NOW() - INTERVAL '5 minutes' ; 274 | #SELECT ip, count(*) FROM log WHERE event_name like '%computed' AND duration > 2 AND logtime >= NOW() - INTERVAL '20 minutes' GROUP BY ip ORDER BY count 
DESC; 275 | #""" 276 | 277 | try: 278 | if conn is None: 279 | return False 280 | 281 | if is_user_on_whitelist(conn, params.get("ip")): 282 | return False 283 | 284 | # check if the user has exceeded the rate limit or is on the list of restricted IPs 285 | rows = run_sql(conn, "SELECT COUNT(ip) FROM restricted_ips WHERE ip=%s AND created >= NOW() - INTERVAL '1 weeks'", (user_ip,)) 286 | is_user_currently_blocked = rows and int(rows[0][0]) > 0 287 | if is_user_currently_blocked: 288 | return RATE_LIMIT_ERROR_MESSAGE 289 | 290 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name LIKE %s AND ip=%s AND logtime >= NOW() - INTERVAL '7 minutes'", ("%computed%", user_ip)) 291 | did_user_exceed_rate_limit = rows and int(rows[0][0]) >= 50 292 | if did_user_exceed_rate_limit and not is_user_on_whitelist(conn, user_ip): 293 | # the user has exceeded the rate limit: computing scores for 50 or more variants in the last 7 minutes 294 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name='rate_limit_exceeded' AND ip=%s AND logtime >= NOW() - INTERVAL '5 minutes'", (user_ip,)) 295 | user_hit_rate_limit_exceeded_recently = rows and int(rows[0][0]) > 0 296 | if not user_hit_rate_limit_exceeded_recently: 297 | # the user will receive at most one "rate_limit_exceeded" event every 5 minutes 298 | log(conn, f"rate_limit_exceeded", ip=user_ip) 299 | rows = run_sql(conn, "SELECT COUNT(ip) FROM log WHERE event_name='rate_limit_exceeded' AND ip=%s AND logtime >= NOW() - INTERVAL '1 days'", (user_ip,)) 300 | user_triggered_too_many_rate_limit_exceeded_errors_today = rows and int(rows[0][0]) >= 5 301 | if user_triggered_too_many_rate_limit_exceeded_errors_today: 302 | # the user has hit the limit of 5 or more "rate_limit_exceeded" events during the last 24 hours 303 | rows = run_sql(conn, "SELECT COUNT(ip) FROM restricted_ips WHERE ip=%s", (user_ip,)) 304 | need_to_delete_previous_restricted_ip_record = rows and int(rows[0][0]) > 0 305 | if need_to_delete_previous_restricted_ip_record: 306 | # delete the previous record 307 | run_sql(conn, "DELETE FROM restricted_ips WHERE ip=%s", (user_ip,)) 308 | 309 | # block the user's IP for 1 week 310 | run_sql(conn, "INSERT INTO restricted_ips (ip) VALUES (%s)", (user_ip,)) 311 | 312 | return RATE_LIMIT_ERROR_MESSAGE 313 | 314 | except Exception as e: 315 | print(f"Error while checking rate limit: {e}", flush=True) 316 | # print traceback 317 | traceback.print_exc() 318 | return False 319 | 320 | 321 | def get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive="basic"): 322 | return f"{tool_name}__{variant}__hg{genome_version}__d{distance}__m{mask}__{basic_or_comprehensive}" 323 | 324 | 325 | def get_splicing_scores_from_cache(conn, tool_name, variant, genome_version, distance, mask, basic_or_comprehensive="basic"): 326 | results = {} 327 | key = get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive) 328 | try: 329 | rows = run_sql(conn, f"SELECT value FROM cache WHERE key=%s", (key,)) 330 | if rows: 331 | results = json.loads(rows[0][0]) 332 | results["source"] += ":cache" 333 | except Exception as e: 334 | print(f"Cache error: {e}", flush=True) 335 | 336 | return results 337 | 338 | 339 | def add_splicing_scores_to_cache(conn, tool_name, variant, genome_version, distance, mask, basic_or_comprehensive, results): 340 | key = get_splicing_scores_cache_key(tool_name, variant, genome_version, distance, mask, basic_or_comprehensive) 341 | try: 342 | 
results_string = json.dumps(results) 343 | 344 | run_sql(conn, 345 | r"""INSERT INTO cache (key, value, counter, accessed) VALUES (%s, %s, 1, now()) """ + 346 | r"""ON CONFLICT (key) DO """ + 347 | r"""UPDATE SET key=%s, value=%s, counter=cache.counter+1, accessed=now()""", (key, results_string, key, results_string)) 348 | except Exception as e: 349 | print(f"Cache error: {e}", flush=True) 350 | 351 | 352 | def get_spliceai_scores(variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param): 353 | try: 354 | chrom, pos, ref, alt = parse_variant(variant) 355 | except ValueError as e: 356 | return { 357 | "variant": variant, 358 | "source": "spliceai", 359 | "error": str(e), 360 | } 361 | 362 | # generate error message if variant falls outside annotated exons or introns 363 | record = VariantRecord(chrom, pos, ref, alt) 364 | try: 365 | scores = get_delta_scores( 366 | record, 367 | SPLICEAI_ANNOTATOR[(genome_version, basic_or_comprehensive_param)], 368 | distance_param, 369 | mask_param) 370 | except Exception as e: 371 | print(f"ERROR while computing SpliceAI scores for {variant}: {e}") 372 | traceback.print_exc() 373 | return { 374 | "variant": variant, 375 | "source": "spliceai", 376 | "error": f"{type(e)}: {e}", 377 | } 378 | 379 | if not scores: 380 | return { 381 | "variant": variant, 382 | "source": "spliceai", 383 | "error": f"The SpliceAI model did not return any scores for {variant}. This may be because the variant does " 384 | f"not overlap any exons or introns defined by the GENCODE '{basic_or_comprehensive_param}' annotation.", 385 | } 386 | 387 | #scores = [s[s.index("|")+1:] for s in scores] # drop allele field 388 | 389 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 390 | all_non_zero_scores = None 391 | all_non_zero_scores_strand = None 392 | all_non_zero_scores_transcript_id = None 393 | all_non_zero_scores_transcript_priority = -1 394 | max_delta_score_sum = 0 395 | for i, transcript_scores in enumerate(scores): 396 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 397 | continue 398 | 399 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 400 | 401 | # get json annotations for this transcript 402 | transcript_annotations = SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive_param)].get(transcript_id_without_version) 403 | if transcript_annotations is None: 404 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 405 | 406 | # add the extra transcript annotations from the json file to the transcript scores dict 407 | transcript_scores.update(transcript_annotations) 408 | 409 | # decide whether to use ALL_NON_ZERO_SCORES from this transcript 410 | current_transcript_priority = TRANSCRIPT_PRIORITY_ORDER[transcript_annotations["t_priority"]] 411 | current_delta_score_sum = sum(abs(float(transcript_scores[key])) for key in ("DS_AG", "DS_AL", "DS_DG", "DS_DL")) 412 | if current_transcript_priority > all_non_zero_scores_transcript_priority: 413 | max_delta_score_sum = current_delta_score_sum 414 | all_non_zero_scores_transcript_priority = current_transcript_priority 415 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 416 | all_non_zero_scores_strand = transcript_scores["t_strand"] 417 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 418 | 419 | elif current_transcript_priority == all_non_zero_scores_transcript_priority and current_delta_score_sum > 
max_delta_score_sum: 420 | # select the one with the highest delta score sum 421 | max_delta_score_sum = current_delta_score_sum 422 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 423 | all_non_zero_scores_strand = transcript_scores["t_strand"] 424 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 425 | 426 | for redundant_key in "ALLELE", "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 427 | del transcript_scores[redundant_key] 428 | 429 | return { 430 | "variant": variant, 431 | "genomeVersion": genome_version, 432 | "chrom": chrom, 433 | "pos": pos, 434 | "ref": ref, 435 | "alt": alt, 436 | "distance": distance_param, 437 | "mask": mask_param, 438 | "scores": scores, 439 | "source": "spliceai:model", 440 | "allNonZeroScores": all_non_zero_scores, 441 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 442 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 443 | } 444 | 445 | 446 | def get_pangolin_scores(variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param): 447 | if genome_version not in ("37", "38"): 448 | raise ValueError(f"Invalid genome_version: {mask_param}") 449 | 450 | if mask_param not in ("True", "False"): 451 | raise ValueError(f"Invalid mask_param: {mask_param}") 452 | 453 | if basic_or_comprehensive_param not in ("basic", "comprehensive"): 454 | raise ValueError(f"Invalid basic_or_comprehensive_param: {basic_or_comprehensive_param}") 455 | 456 | try: 457 | chrom, pos, ref, alt = parse_variant(variant) 458 | except ValueError as e: 459 | print(f"ERROR while parsing variant {variant}: {e}") 460 | traceback.print_exc() 461 | 462 | return { 463 | "variant": variant, 464 | "source": "pangolin", 465 | "error": str(e), 466 | } 467 | 468 | if len(ref) > 1 and len(alt) > 1: 469 | return { 470 | "variant": variant, 471 | "source": "pangolin", 472 | "error": f"Pangolin does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}", 473 | } 474 | 475 | class PangolinArgs: 476 | reference_file = FASTA_PATH[genome_version] 477 | distance = distance_param 478 | mask = mask_param 479 | score_cutoff = None 480 | score_exons = "False" 481 | 482 | pangolin_models = [] 483 | 484 | for i in 0, 2, 4, 6: 485 | for j in 1, 2, 3: 486 | model = Pangolin(L, W, AR) 487 | if torch.cuda.is_available(): 488 | model.cuda() 489 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i))) 490 | else: 491 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) 492 | model.load_state_dict(weights) 493 | model.eval() 494 | pangolin_models.append(model) 495 | 496 | features_db = gffutils.FeatureDB(PANGOLIN_ANNOTATION_PATHS[(GENOME_VERSION, basic_or_comprehensive_param)]) 497 | scores = process_variant_using_pangolin( 498 | 0, chrom, int(pos), ref, alt, features_db, pangolin_models, PangolinArgs) 499 | 500 | if not scores: 501 | return { 502 | "variant": variant, 503 | "source": "pangolin", 504 | "error": f"Pangolin was unable to compute scores for this variant", 505 | } 506 | 507 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 508 | all_non_zero_scores = None 509 | all_non_zero_scores_strand = None 510 | all_non_zero_scores_transcript_id = None 511 | max_delta_score_sum = 0 512 | for i, transcript_scores in enumerate(scores): 513 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 514 | continue 515 | 516 | transcript_id_without_version = 
transcript_scores.get("NAME", "").split(".")[0] 517 | 518 | # get json annotations for this transcript 519 | transcript_annotations = SHARED_TRANSCRIPT_ANNOTATIONS[(genome_version, basic_or_comprehensive_param)].get(transcript_id_without_version) 520 | if transcript_annotations is None: 521 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 522 | 523 | # add the extra transcript annotations from the json file to the transcript scores dict 524 | transcript_scores.update(transcript_annotations) 525 | 526 | # decide whether to use ALL_NON_ZERO_SCORES from this gene 527 | current_delta_score_sum = sum(abs(float(s.get("SG_ALT", 0)) - float(s.get("SG_REF", 0))) 528 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 529 | current_delta_score_sum += sum(abs(float(s.get("SL_ALT", 0)) - float(s.get("SL_REF", 0))) 530 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 531 | 532 | # return all_non_zero_scores for the transcript or gene with the highest delta score sum 533 | if current_delta_score_sum > max_delta_score_sum: 534 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 535 | all_non_zero_scores_strand = transcript_scores["STRAND"] 536 | all_non_zero_scores_transcript_id = transcript_scores["NAME"] 537 | max_delta_score_sum = current_delta_score_sum 538 | 539 | for redundant_key in "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 540 | del transcript_scores[redundant_key] 541 | 542 | return { 543 | "variant": variant, 544 | "genomeVersion": genome_version, 545 | "chrom": chrom, 546 | "pos": pos, 547 | "ref": ref, 548 | "alt": alt, 549 | "distance": distance_param, 550 | "mask": mask_param, 551 | "scores": scores, 552 | "source": "pangolin:model", 553 | "allNonZeroScores": all_non_zero_scores, 554 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 555 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 556 | } 557 | 558 | 559 | @app.route("/spliceai/", methods=['POST', 'GET']) 560 | def run_spliceai(): 561 | with get_db_connection() as conn: 562 | return run_splice_prediction_tool(conn, tool_name="spliceai") 563 | 564 | 565 | @app.route("/pangolin/", methods=['POST', 'GET']) 566 | def run_pangolin(): 567 | with get_db_connection() as conn: 568 | return run_splice_prediction_tool(conn, tool_name="pangolin") 569 | 570 | 571 | def run_splice_prediction_tool(conn, tool_name): 572 | """Handles API request for splice prediction 573 | 574 | Args: 575 | conn (psycopg2.connection): Database connection 576 | tool_name (str): "spliceai" or "pangolin" 577 | """ 578 | 579 | if tool_name != TOOL: 580 | return error_response(f"ERROR: This server is configured to run {TOOL} rather than {tool_name}.\n", source=tool_name) 581 | 582 | user_ip = get_user_ip(request) 583 | 584 | 585 | start_time = datetime.now() 586 | #logging_prefix = start_time.strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()} ip:{user_ip}" 587 | logging_prefix = f"t{os.getpid()} ip:{user_ip}" 588 | example_url = SPLICEAI_EXAMPLE_URL if tool_name == "spliceai" else PANGOLIN_EXAMPLE_URL 589 | 590 | # check params 591 | params = {} 592 | if request.values: 593 | params.update(request.values) 594 | 595 | if 'variant' not in params: 596 | params.update(request.get_json(force=True, silent=True) or {}) 597 | 598 | variant = params.get('variant', '') 599 | variant = variant.strip().strip("'").strip('"').strip(",") 600 | if not variant: 601 | return error_response(f'"variant" not specified.\n', source=tool_name) 602 | 603 | if not isinstance(variant, str): 604 | 
return error_response(f'"variant" value must be a string rather than a {type(variant)}.\n', source=tool_name) 605 | 606 | genome_version = params.get("hg") 607 | if not genome_version: 608 | return error_response(f'"hg" not specified. The URL must include an "hg" arg: hg=37 or hg=38. For example: {example_url}\n', source=tool_name) 609 | 610 | if genome_version not in ("37", "38"): 611 | return error_response(f'Invalid "hg" value: "{genome_version}". The value must be either "37" or "38". For example: {example_url}\n', source=tool_name) 612 | 613 | distance_param = params.get("distance", DEFAULT_DISTANCE) 614 | try: 615 | distance_param = int(distance_param) 616 | except Exception as e: 617 | return error_response(f'Invalid "distance": "{distance_param}". The value must be an integer.\n', source=tool_name) 618 | 619 | if distance_param > MAX_DISTANCE_LIMIT: 620 | return error_response(f'Invalid "distance": "{distance_param}". The value must be < {MAX_DISTANCE_LIMIT}.\n', source=tool_name) 621 | 622 | mask_param = params.get("mask", str(DEFAULT_MASK)) 623 | if mask_param not in ("0", "1"): 624 | return error_response(f'Invalid "mask" value: "{mask_param}". The value must be either "0" or "1". For example: {example_url}\n', source=tool_name) 625 | 626 | basic_or_comprehensive_param = params.get("bc", "basic") 627 | if basic_or_comprehensive_param not in ("basic", "comprehensive"): 628 | return error_response(f'Invalid "bc" value: "{basic_or_comprehensive_param}". The value must be either "basic" or "comprehensive". For example: {example_url}\n', source=tool_name) 629 | 630 | variant_consequence = params.get("variant_consequence") 631 | 632 | force = params.get("force") # ie. don't use cache 633 | 634 | print(f"{logging_prefix}: ======================", flush=True) 635 | print(f"{logging_prefix}: {variant} tool={tool_name} hg={genome_version}, distance={distance_param}, mask={mask_param}, bc={basic_or_comprehensive_param}", flush=True) 636 | 637 | if tool_name == "spliceai": 638 | init_spliceai(genome_version, basic_or_comprehensive_param) 639 | 640 | init_transcript_annotations(genome_version, basic_or_comprehensive_param) 641 | 642 | # check cache before processing the variant 643 | results = {} 644 | if not force: 645 | results = get_splicing_scores_from_cache(conn, tool_name, variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param) 646 | 647 | duration = (datetime.now() - start_time).total_seconds() 648 | if results: 649 | log(conn, f"{tool_name}:from-cache", ip=user_ip, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 650 | else: 651 | error_message = exceeds_rate_limit(conn, user_ip, params) 652 | if error_message: 653 | print(f"{logging_prefix}: {user_ip}: response: {error_message}", flush=True) 654 | return error_response(error_message, source=tool_name) 655 | 656 | try: 657 | if tool_name == "spliceai": 658 | results = get_spliceai_scores(variant, genome_version, distance_param, int(mask_param), basic_or_comprehensive_param) 659 | elif tool_name == "pangolin": 660 | pangolin_mask_param = "True" if mask_param == "1" else "False" 661 | results = get_pangolin_scores(variant, genome_version, distance_param, pangolin_mask_param, basic_or_comprehensive_param) 662 | else: 663 | raise ValueError(f"Invalid tool_name: {tool_name}") 664 | except Exception as e: 665 | traceback.print_exc() 666 | return error_response(f"ERROR: {e}", source=tool_name) 667 | 668 | 
duration = (datetime.now() - start_time).total_seconds() 669 | log(conn, f"{tool_name}:computed", ip=user_ip, duration=duration, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 670 | 671 | if "error" not in results: 672 | add_splicing_scores_to_cache(conn, tool_name, variant, genome_version, distance_param, mask_param, basic_or_comprehensive_param, results) 673 | 674 | if "error" in results: 675 | log(conn, f"{tool_name}:error", ip=user_ip, variant=variant, genome=genome_version, distance=distance_param, mask=mask_param, details=results["error"], bc=basic_or_comprehensive_param, variant_consequence=variant_consequence) 676 | 677 | response_json = {} 678 | response_json.update(params) # copy input params to output 679 | response_json.update(results) 680 | 681 | response_log_string = ", ".join([f"{k}: {v}" for k, v in response_json.items() if not k.startswith("allNonZeroScores")]) 682 | print(f"{logging_prefix}: {variant} response took {str(datetime.now() - start_time)}: {response_log_string}", flush=True) 683 | 684 | return Response(json.dumps(response_json), status=200, mimetype='application/json', headers=[ 685 | ('Access-Control-Allow-Origin', '*'), 686 | ]) 687 | 688 | 689 | def log(conn, event_name, ip=None, duration=None, variant=None, genome=None, distance=None, mask=None, bc=None, details=None, variant_consequence=None): 690 | """Utility method for logging an event""" 691 | 692 | try: 693 | if duration is not None: duration = float(duration) 694 | if distance is not None: distance = int(distance) 695 | if mask is not None: mask = int(mask) 696 | except Exception as e: 697 | print(f"Error parsing log params: {e}", flush=True) 698 | return 699 | 700 | try: 701 | run_sql(conn, 702 | r"INSERT INTO log (event_name, ip, duration, variant, genome, distance, mask, bc, details, variant_consequence) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", 703 | (event_name, ip, duration, variant, genome, distance, mask, bc, details, variant_consequence)) 704 | except Exception as e: 705 | print(f"Log error: {e}", flush=True) 706 | 707 | 708 | def get_user_ip(request): 709 | return request.environ.get("HTTP_X_FORWARDED_FOR") 710 | 711 | 712 | @app.route('/log//', strict_slashes=False) 713 | def log_event(name): 714 | 715 | if name != "show_igv": 716 | message = f"Log error: invalid event name: {name}" 717 | print(message, flush=True) 718 | return error_response(f"ERROR: {message}") 719 | 720 | # check params 721 | params = {} 722 | if request.values: 723 | params.update(request.values) 724 | if not params: 725 | params.update(request.get_json(force=True, silent=True) or {}) 726 | 727 | variant = params.get("variant") 728 | genome_version = params.get("hg") 729 | distance_param = params.get("distance") 730 | mask_param = params.get("mask") 731 | basic_or_comprehensive_param = params.get("bc") 732 | details = params.get("details") 733 | variant_consequence = params.get("variant_consequence") 734 | if details: 735 | details = str(details) 736 | details = details[:2000] 737 | 738 | user_ip = get_user_ip(request) 739 | logging_prefix = datetime.now().strftime("%m/%d/%Y %H:%M:%S") + f" {user_ip} t{os.getpid()}" 740 | print(f"{logging_prefix}: ======================", flush=True) 741 | print(f"{logging_prefix}: {variant} show igv with hg={genome_version}, distance={distance_param}, mask={mask_param}", flush=True) 742 | 743 | with get_db_connection() as conn: 744 | log(conn, 745 | name, 746 | 
ip=user_ip, 747 | variant=variant, 748 | genome=genome_version, 749 | distance=distance_param, 750 | mask=mask_param, 751 | bc=basic_or_comprehensive_param, 752 | details=details, 753 | variant_consequence=variant_consequence) 754 | 755 | return Response(json.dumps({"status": "Done"}), status=200, mimetype='application/json', headers=[ 756 | ('Access-Control-Allow-Origin', '*'), 757 | ]) 758 | 759 | 760 | @app.route('/', strict_slashes=False, defaults={'path': ''}) 761 | @app.route('/<path>/') 762 | def catch_all(path): 763 | return f"SpliceAI-lookup APIs: invalid endpoint {path}" 764 | 765 | 766 | if '__main__' == __name__ or os.environ.get('RUNNING_ON_GOOGLE_CLOUD_RUN'): 767 | app.run(debug=DEBUG, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) 768 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from datetime import datetime 3 | import json 4 | import markdown2 5 | import os 6 | import pandas as pd 7 | import re 8 | import socket 9 | import subprocess 10 | import traceback 11 | import tempfile 12 | import time 13 | 14 | # pangolin imports 15 | from pkg_resources import resource_filename 16 | from pangolin.model import torch, Pangolin, L, W, AR 17 | from pangolin.pangolin import process_variant as process_variant_using_pangolin 18 | import gffutils 19 | 20 | # flask imports 21 | from flask import Flask, request, Response, send_from_directory 22 | from flask_cors import CORS 23 | from flask_talisman import Talisman 24 | from intervaltree import IntervalTree, Interval 25 | from spliceai.utils import Annotator, get_delta_scores 26 | 27 | # pandas output options 28 | pd.options.display.float_format = "{:,.2f}".format 29 | pd.set_option('display.max_rows', None) 30 | pd.set_option('display.max_columns', None) 31 | pd.set_option('display.expand_frame_repr', False) 32 | pd.set_option('max_colwidth', None) 33 | 34 | 35 | app = Flask(__name__) 36 | 37 | CORS(app) 38 | 39 | DEBUG = False if socket.gethostname() == "spliceai-lookup" else True 40 | if not DEBUG: 41 | Talisman(app) 42 | 43 | 44 | RATE_LIMIT_WINDOW_SIZE_IN_MINUTES = 1 45 | RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE = { 46 | "spliceai:model": 6, 47 | "spliceai:total": 15, 48 | "pangolin:model": 6, 49 | "pangolin:total": 15, 50 | "liftover:total": 12, 51 | } 52 | 53 | RATE_LIMIT_COUNTER_WINDOW_SIZE_IN_DAYS = 3 54 | RATE_LIMIT_OUTLIER_IPS_PATH = os.path.abspath("rate_limit_outlier_ips.txt") 55 | 56 | def get_rate_limit_outlier_ips(): 57 | print(f"Reading rate limit outlier IPs: {RATE_LIMIT_OUTLIER_IPS_PATH}") 58 | if os.path.isfile(RATE_LIMIT_OUTLIER_IPS_PATH): 59 | with open(RATE_LIMIT_OUTLIER_IPS_PATH, "rt") as f: 60 | rate_limit_outlier_ips = [l.strip() for l in f] 61 | else: 62 | rate_limit_outlier_ips = [] 63 | 64 | print(f"Current list of rate limit outlier IPs: {rate_limit_outlier_ips}") 65 | return rate_limit_outlier_ips 66 | 67 | 68 | RATE_LIMIT_OUTLIER_IPS = get_rate_limit_outlier_ips() 69 | 70 | DISABLE_LOGGING_FOR_IPS = {f"63.143.42.{i}" for i in range(0, 256)} # ignore uptimerobot.com IPs 71 | 72 | 73 | HG19_FASTA_PATH = os.path.expanduser("~/hg19.fa") 74 | HG38_FASTA_PATH = os.path.expanduser("~/hg38.fa") 75 | T2T_FASTA_PATH = os.path.expanduser("~/chm13v2.0.fa") 76 | 77 | GENCODE_VERSION = "v44" 78 | SPLICEAI_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.txt.gz" 79 | SPLICEAI_GRCH38_ANNOTATIONS = 
f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.txt.gz" 80 | PANGOLIN_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.without_chr_prefix.db" 81 | PANGOLIN_GRCH38_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.db" 82 | TRANSCRIPT_GRCH37_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}lift37.basic.annotation.transcript_annotations.json" 83 | TRANSCRIPT_GRCH38_ANNOTATIONS = f"./annotations/gencode.{GENCODE_VERSION}.basic.annotation.transcript_annotations.json" 84 | 85 | UCSC_LIFTOVER_TOOL = "UCSC liftover tool" 86 | BCFTOOLS_LIFTOVER_TOOL = "bcftools liftover plugin" 87 | 88 | PANGOLIN_MODELS = [] 89 | for i in 0, 2, 4, 6: 90 | for j in 1, 2, 3: 91 | model = Pangolin(L, W, AR) 92 | if torch.cuda.is_available(): 93 | model.cuda() 94 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i))) 95 | else: 96 | weights = torch.load(resource_filename("pangolin", "models/final.%s.%s.3.v2" % (j, i)), map_location=torch.device('cpu')) 97 | 98 | model.load_state_dict(weights) 99 | model.eval() 100 | PANGOLIN_MODELS.append(model) 101 | 102 | 103 | ANNOTATION_INTERVAL_TREES = { 104 | "37": defaultdict(IntervalTree), 105 | "38": defaultdict(IntervalTree), 106 | } 107 | 108 | for genome_version, annotation_path in ("37", SPLICEAI_GRCH37_ANNOTATIONS), ("38", SPLICEAI_GRCH38_ANNOTATIONS): 109 | print(f"Loading {annotation_path}", flush=True) 110 | df = pd.read_table(annotation_path, dtype={"TX_START": int, "TX_END": int}) 111 | for _, row in df.iterrows(): 112 | chrom = row["CHROM"].replace("chr", "") 113 | ANNOTATION_INTERVAL_TREES[genome_version][chrom].add(Interval(row["TX_START"], row["TX_END"] + 0.1, row["#NAME"])) 114 | 115 | SPLICEAI_ANNOTATOR = { 116 | "37": Annotator(HG19_FASTA_PATH, SPLICEAI_GRCH37_ANNOTATIONS), 117 | "38": Annotator(HG38_FASTA_PATH, SPLICEAI_GRCH38_ANNOTATIONS), 118 | } 119 | 120 | ta37_f = open(TRANSCRIPT_GRCH37_ANNOTATIONS, "rt") 121 | ta38_f = open(TRANSCRIPT_GRCH38_ANNOTATIONS, "rt") 122 | TRANSCRIPT_ANNOTATIONS = { 123 | "37": json.load(ta37_f), 124 | "38": json.load(ta38_f), 125 | } 126 | ta37_f.close() 127 | ta38_f.close() 128 | 129 | TRANSCRIPT_PRIORITY_ORDER = { 130 | "MS": 3, # MANE select transcript 131 | "MP": 2, # MANE plus clinical transcript 132 | "C": 1, # canonical transcript 133 | "N": 0 134 | } 135 | 136 | # check that json annotations exist for all transcripts in the SpliceAI annotations file 137 | for genome_version in "37", "38": 138 | json_transcript_ids = set(TRANSCRIPT_ANNOTATIONS[genome_version]) 139 | df = pd.read_table(SPLICEAI_GRCH37_ANNOTATIONS if genome_version == "37" else SPLICEAI_GRCH38_ANNOTATIONS) 140 | spliceai_annotation_transcript_ids = set(df["#NAME"].apply(lambda t: t.split(".")[0])) 141 | transcript_ids_without_annotations = spliceai_annotation_transcript_ids - json_transcript_ids 142 | if len(transcript_ids_without_annotations) > 0: 143 | raise ValueError(f"Missing {len(transcript_ids_without_annotations)} transcripts in {genome_version} annotations: {transcript_ids_without_annotations}") 144 | 145 | SPLICEAI_MAX_DISTANCE_LIMIT = 10000 146 | SPLICEAI_DEFAULT_DISTANCE = 500 # maximum distance between the variant and gained/lost splice site, defaults to 500 147 | SPLICEAI_DEFAULT_MASK = 0 # mask scores representing annotated acceptor/donor gain and unannotated acceptor/donor loss, defaults to 0 148 | 149 | SPLICEAI_EXAMPLE = f"/spliceai/?hg=38&distance=500&mask=0&variant=chr8-140300615-C-G" 150 | 151 | VARIANT_RE = re.compile( 152 
| "(chr)?(?P<chrom>[0-9XYMTt]{1,2})" 153 | "[-\s:]+" 154 | "(?P<pos>[0-9]{1,9})" 155 | "[-\s:]+" 156 | "(?P<ref>[ACGT]+)" 157 | "[-\s:>]+" 158 | "(?P<alt>[ACGT]+)" 159 | ) 160 | 161 | USE_REDIS = True 162 | 163 | if USE_REDIS: 164 | import redis 165 | REDIS = redis.Redis(host='localhost', port=6379, db=0) # in-memory cache server which may or may not be running 166 | else: 167 | REDIS = None 168 | 169 | 170 | def error_response(error_message, source=None): 171 | response_json = {"error": str(error_message)} 172 | if source: 173 | response_json["source"] = source 174 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 175 | 176 | 177 | REVERSE_COMPLEMENT_MAP = dict(zip("ACGTN", "TGCAN")) 178 | 179 | 180 | def reverse_complement(seq): 181 | return "".join([REVERSE_COMPLEMENT_MAP[n] for n in seq[::-1]]) 182 | 183 | 184 | def parse_variant(variant_str): 185 | match = VARIANT_RE.match(variant_str) 186 | if not match: 187 | raise ValueError(f"Unable to parse variant: {variant_str}") 188 | 189 | return match['chrom'], int(match['pos']), match['ref'], match['alt'] 190 | 191 | 192 | class VariantRecord: 193 | def __init__(self, chrom, pos, ref, alt): 194 | self.chrom = chrom 195 | self.pos = pos 196 | self.ref = ref 197 | self.alts = [alt] 198 | 199 | def __repr__(self): 200 | return f"{self.chrom}-{self.pos}-{self.ref}-{self.alts[0]}" 201 | 202 | 203 | def get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask): 204 | return f"{tool_name}__{variant}__hg{genome_version}__d{distance}__m{mask}" 205 | 206 | 207 | def get_splicing_scores_from_redis(tool_name, variant, genome_version, distance, mask): 208 | if REDIS is None: 209 | return None 210 | 211 | key = get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask) 212 | results = None 213 | try: 214 | results_string = REDIS.get(key) 215 | if results_string: 216 | results = json.loads(results_string) 217 | results["source"] += ":redis" 218 | except Exception as e: 219 | print(f"Redis error: {e}", flush=True) 220 | 221 | return results 222 | 223 | 224 | def add_splicing_scores_to_redis(tool_name, variant, genome_version, distance, mask, results): 225 | if REDIS is None: 226 | return 227 | 228 | key = get_splicing_scores_redis_key(tool_name, variant, genome_version, distance, mask) 229 | try: 230 | results_string = json.dumps(results) 231 | REDIS.set(key, results_string) 232 | except Exception as e: 233 | print(f"Redis error: {e}", flush=True) 234 | 235 | 236 | def exceeds_rate_limit(user_id, request_type): 237 | """Checks whether the given address has exceeded rate limits 238 | 239 | Args: 240 | user_id (str): unique user id 241 | request_type (str): type of rate limit - can be "spliceai:model", "spliceai:total", "pangolin:model", "pangolin:total", or "liftover:total" 242 | 243 | Return str: error message about exceeding the rate limit, or None if the rate limit was not exceeded 244 | """ 245 | if REDIS is None: 246 | return None 247 | 248 | if request_type not in RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE: 249 | raise ValueError(f"Invalid 'request_type' arg value: {request_type}") 250 | 251 | epoch_time = time.time() # seconds since 1970 252 | 253 | if epoch_time - int(REDIS.get("rate_limit_outlier_ips_update_time") or 0) > 120: # refresh at most every 2 minutes 254 | REDIS.set("rate_limit_outlier_ips_update_time", int(epoch_time)) 255 | global RATE_LIMIT_OUTLIER_IPS 256 | RATE_LIMIT_OUTLIER_IPS = get_rate_limit_outlier_ips() 257 | 258 | if user_id in RATE_LIMIT_OUTLIER_IPS: 259 | print(f"Rate limiting outlier list IP: {user_id}") 260 | 
max_requests = 1 261 | else: 262 | max_requests_per_minute = RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type] 263 | max_requests = RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * max_requests_per_minute 264 | 265 | try: 266 | # check number of requests from this user in the last (RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * 60) minutes 267 | redis_key_prefix = f"request {user_id} {request_type}" 268 | keys = REDIS.keys(f"{redis_key_prefix}*") 269 | if len(keys) >= max_requests: 270 | redis_hit_limit_counter_key = f"request {user_id} rate limit counter" 271 | redis_hit_limit_counter = REDIS.get(redis_hit_limit_counter_key) or 0 272 | redis_hit_limit_counter = int(redis_hit_limit_counter) + 1 273 | REDIS.set(redis_hit_limit_counter_key, redis_hit_limit_counter) 274 | REDIS.expire(redis_hit_limit_counter_key, RATE_LIMIT_COUNTER_WINDOW_SIZE_IN_DAYS * 24 * 60 * 60) 275 | 276 | if redis_hit_limit_counter > 200: 277 | error_message = ( 278 | f"ERROR: You have exceeded the rate limit {redis_hit_limit_counter} times so far " 279 | f"over the past few days. To prevent a single user from overwhelming the server and making it " 280 | f"unavailable to other users, this tool allows no more than " 281 | f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type]} computed requests per " 282 | f"minute per user. If you continue to exceed this limit, your IP address may be blocked." 283 | ) 284 | else: 285 | error_message = ( 286 | f"ERROR: Rate limit reached. To prevent a user from overwhelming the server and making it " 287 | f"unavailable to other users, this tool allows no more than " 288 | f"{RATE_LIMIT_REQUESTS_PER_USER_PER_MINUTE[request_type]} computed requests per minute per user." 289 | ) 290 | 291 | return error_message 292 | 293 | # record this request 294 | REDIS.set(f"{redis_key_prefix}: {epoch_time}", 1) 295 | REDIS.expire(f"{redis_key_prefix}: {epoch_time}", RATE_LIMIT_WINDOW_SIZE_IN_MINUTES * 60) 296 | except Exception as e: 297 | print(f"Redis error: {e}", flush=True) 298 | 299 | return None 300 | 301 | 302 | def get_spliceai_scores(variant, genome_version, distance_param, mask_param): 303 | try: 304 | chrom, pos, ref, alt = parse_variant(variant) 305 | except ValueError as e: 306 | return { 307 | "variant": variant, 308 | "source": "spliceai", 309 | "error": f"ERROR: {e}", 310 | } 311 | 312 | # generate error message if variant falls outside annotated exons or introns 313 | OTHER_GENOME_VERSION = {"37": "38", "38": "37"} 314 | chrom_without_chr = chrom.replace("chr", "") 315 | if not ANNOTATION_INTERVAL_TREES[genome_version][chrom_without_chr].at(pos): 316 | other_genome_version = OTHER_GENOME_VERSION[genome_version] 317 | other_genome_overlapping_intervals = ANNOTATION_INTERVAL_TREES[other_genome_version][chrom_without_chr].at(pos) 318 | if other_genome_overlapping_intervals: 319 | other_genome_genes = " and ".join(sorted(set([str(i.data) for i in other_genome_overlapping_intervals]))) 320 | return { 321 | "variant": variant, 322 | "source": "spliceai", 323 | "error": f"ERROR: In GRCh{genome_version}, {chrom}-{pos}-{ref}-{alt} falls outside all gencode exons and introns." 324 | f"SpliceAI only works for variants within known exons or introns. However, in GRCh{other_genome_version}, " 325 | f"{chrom}:{pos} falls within {other_genome_genes}, so perhaps GRCh{genome_version} is not the correct genome version?" 
326 | } 327 | else: 328 | return { 329 | "variant": variant, 330 | "source": "spliceai", 331 | "error": f"ERROR: {chrom}-{pos}-{ref}-{alt} falls outside all Gencode exons and introns on " 332 | f"GRCh{genome_version}. SpliceAI only works for variants that are within known exons or introns.", 333 | } 334 | 335 | """ 336 | NOTE: The reason SpliceAI currently works only for variants 337 | within annotated exons or introns is that, although the SpliceAI neural net takes any 338 | arbitrary nucleotide sequence as input, SpliceAI needs 1) the transcript strand 339 | to determine whether to reverse-complement the reference genome sequence before passing it 340 | to the neural net, and 2) transcript start and end positions to determine where to truncate 341 | the reference genome sequence. 342 | """ 343 | 344 | source = None 345 | scores = [] 346 | 347 | # run the SpliceAI model to compute the scores 348 | if not scores: 349 | error_message = exceeds_rate_limit(request.remote_addr, request_type="spliceai:model") 350 | if error_message: 351 | return { 352 | "variant": variant, 353 | "source": "spliceai", 354 | "error": error_message, 355 | } 356 | 357 | record = VariantRecord(chrom, pos, ref, alt) 358 | try: 359 | scores = get_delta_scores( 360 | record, 361 | SPLICEAI_ANNOTATOR[genome_version], 362 | distance_param, 363 | mask_param) 364 | source = "spliceai:model" 365 | except Exception as e: 366 | print(f"ERROR while computing SpliceAI scores for {variant}: {e}") 367 | traceback.print_exc() 368 | return { 369 | "variant": variant, 370 | "source": "spliceai", 371 | "error": f"ERROR: {type(e)}: {e}", 372 | } 373 | 374 | if not scores: 375 | return { 376 | "variant": variant, 377 | "source": "spliceai", 378 | "error": f"ERROR: The SpliceAI model did not return any scores for {variant}. 
This may be due to the " 379 | f"variant falling outside of all Gencode exons and introns.", 380 | } 381 | 382 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 383 | all_non_zero_scores = None 384 | all_non_zero_scores_strand = None 385 | all_non_zero_scores_transcript_id = None 386 | all_non_zero_scores_transcript_priority = -1 387 | max_delta_score_sum = 0 388 | for i, transcript_scores in enumerate(scores): 389 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 390 | continue 391 | 392 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 393 | 394 | # get json annotations for this transcript 395 | transcript_annotations = TRANSCRIPT_ANNOTATIONS[genome_version].get(transcript_id_without_version) 396 | if transcript_annotations is None: 397 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 398 | 399 | # add the extra transcript annotations from the json file to the transcript scores dict 400 | transcript_scores.update(transcript_annotations) 401 | 402 | current_transcript_priority = TRANSCRIPT_PRIORITY_ORDER[transcript_annotations["t_priority"]] 403 | current_delta_score_sum = sum(float(transcript_scores.get(key, 0)) for key in ("DP_AG", "DP_AL", "DP_DG", "DP_DL")) 404 | if current_transcript_priority > all_non_zero_scores_transcript_priority: 405 | all_non_zero_scores_transcript_priority = current_transcript_priority 406 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 407 | all_non_zero_scores_strand = transcript_scores["t_strand"] 408 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 409 | elif current_transcript_priority == all_non_zero_scores_transcript_priority and current_delta_score_sum > max_delta_score_sum: 410 | # select the one with the highest delta score sum 411 | max_delta_score_sum = current_delta_score_sum 412 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 413 | all_non_zero_scores_strand = transcript_scores["t_strand"] 414 | all_non_zero_scores_transcript_id = transcript_scores["t_id"] 415 | 416 | for redundant_key in "ALLELE", "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 417 | del transcript_scores[redundant_key] 418 | 419 | return { 420 | "variant": variant, 421 | "genomeVersion": genome_version, 422 | "chrom": chrom, 423 | "pos": pos, 424 | "ref": ref, 425 | "alt": alt, 426 | "distance": distance_param, 427 | "scores": scores, 428 | "source": source, 429 | 430 | "allNonZeroScores": all_non_zero_scores, 431 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 432 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 433 | } 434 | 435 | 436 | def get_pangolin_scores(variant, genome_version, distance_param, mask_param): 437 | if genome_version not in ("37", "38"): 438 | raise ValueError(f"Invalid genome_version: {genome_version}") 439 | 440 | if mask_param not in ("True", "False"): 441 | raise ValueError(f"Invalid mask_param: {mask_param}") 442 | 443 | try: 444 | chrom, pos, ref, alt = parse_variant(variant) 445 | except ValueError as e: 446 | print(f"ERROR while parsing variant {variant}: {e}") 447 | traceback.print_exc() 448 | 449 | return { 450 | "variant": variant, 451 | "source": "pangolin", 452 | "error": f"ERROR: {e}", 453 | } 454 | 455 | if len(ref) > 1 and len(alt) > 1: 456 | return { 457 | "variant": variant, 458 | "source": "pangolin", 459 | "error": f"ERROR: Pangolin does not currently support complex InDels like {chrom}-{pos}-{ref}-{alt}", 
460 | } 461 | 462 | error_message = exceeds_rate_limit(request.remote_addr, request_type="pangolin:model") 463 | if error_message: 464 | return { 465 | "variant": variant, 466 | "source": "pangolin", 467 | "error": error_message, 468 | } 469 | 470 | class PangolinArgs: 471 | reference_file = HG19_FASTA_PATH if genome_version == "37" else HG38_FASTA_PATH 472 | distance = distance_param 473 | mask = mask_param 474 | score_cutoff = None 475 | score_exons = "False" 476 | 477 | if genome_version == "37": 478 | pangolin_gene_db = gffutils.FeatureDB(PANGOLIN_GRCH37_ANNOTATIONS) 479 | else: 480 | pangolin_gene_db = gffutils.FeatureDB(PANGOLIN_GRCH38_ANNOTATIONS) 481 | 482 | scores = process_variant_using_pangolin( 483 | 0, chrom, int(pos), ref, alt, pangolin_gene_db, PANGOLIN_MODELS, PangolinArgs) 484 | 485 | if not scores: 486 | return { 487 | "variant": variant, 488 | "source": "pangolin", 489 | "error": f"ERROR: Pangolin was unable to compute scores for this variant", 490 | } 491 | 492 | # to reduce the response size, return all non-zero scores only for the canonial transcript (or the 1st transcript) 493 | all_non_zero_scores = None 494 | all_non_zero_scores_strand = None 495 | all_non_zero_scores_transcript_id = None 496 | max_delta_score_sum = 0 497 | for i, transcript_scores in enumerate(scores): 498 | if "ALL_NON_ZERO_SCORES" not in transcript_scores: 499 | continue 500 | 501 | transcript_id_without_version = transcript_scores.get("NAME", "").split(".")[0] 502 | 503 | # get json annotations for this transcript 504 | transcript_annotations = TRANSCRIPT_ANNOTATIONS[genome_version].get(transcript_id_without_version) 505 | if transcript_annotations is None: 506 | raise ValueError(f"Missing annotations for {transcript_id_without_version} in {genome_version} annotations") 507 | 508 | # add the extra transcript annotations from the json file to the transcript scores dict 509 | transcript_scores.update(transcript_annotations) 510 | 511 | # decide whether to use ALL_NON_ZERO_SCORES from this gene 512 | current_delta_score_sum = sum(abs(float(s.get("SG_ALT", 0)) - float(s.get("SG_REF", 0))) 513 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 514 | current_delta_score_sum += sum(abs(float(s.get("SL_ALT", 0)) - float(s.get("SL_REF", 0))) 515 | for s in transcript_scores["ALL_NON_ZERO_SCORES"]) 516 | 517 | # return all_non_zero_scores for the transcript or gene with the highest delta score sum 518 | if current_delta_score_sum > max_delta_score_sum: 519 | all_non_zero_scores = transcript_scores["ALL_NON_ZERO_SCORES"] 520 | all_non_zero_scores_strand = transcript_scores["STRAND"] 521 | all_non_zero_scores_transcript_id = transcript_scores["NAME"] 522 | max_delta_score_sum = current_delta_score_sum 523 | 524 | for redundant_key in "NAME", "STRAND", "ALL_NON_ZERO_SCORES": 525 | del transcript_scores[redundant_key] 526 | 527 | return { 528 | "variant": variant, 529 | "genomeVersion": genome_version, 530 | "chrom": chrom, 531 | "pos": pos, 532 | "ref": ref, 533 | "alt": alt, 534 | "distance": distance_param, 535 | "scores": scores, 536 | "source": "pangolin", 537 | "allNonZeroScores": all_non_zero_scores, 538 | "allNonZeroScoresStrand": all_non_zero_scores_strand, 539 | "allNonZeroScoresTranscriptId": all_non_zero_scores_transcript_id, 540 | } 541 | 542 | 543 | @app.route("/spliceai/", methods=['POST', 'GET']) 544 | def run_spliceai(): 545 | return run_splice_prediction_tool(tool_name="spliceai") 546 | 547 | 548 | @app.route("/pangolin/", methods=['POST', 'GET']) 549 | def run_pangolin(): 550 | return 
run_splice_prediction_tool(tool_name="pangolin") 551 | 552 | 553 | def run_splice_prediction_tool(tool_name): 554 | """Handles API request for splice prediction 555 | 556 | Args: 557 | tool_name (str): "spliceai" or "pangolin" 558 | """ 559 | if tool_name not in ("spliceai", "pangolin"): 560 | raise ValueError(f"Invalid tool_name: {tool_name}") 561 | 562 | start_time = datetime.now() 563 | logging_prefix = start_time.strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()}" 564 | 565 | # check params 566 | params = {} 567 | if request.values: 568 | params.update(request.values) 569 | 570 | if 'variant' not in params: 571 | params.update(request.get_json(force=True, silent=True) or {}) 572 | 573 | error_message = exceeds_rate_limit(request.remote_addr, request_type=f"{tool_name}:total") 574 | if error_message: 575 | print(f"{logging_prefix}: {request.remote_addr}: response: {error_message}", flush=True) 576 | return error_response(error_message, source=tool_name) 577 | 578 | variant = params.get('variant', '') 579 | variant = variant.strip().strip("'").strip('"').strip(",") 580 | if not variant: 581 | return error_response(f'"variant" not specified. For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 582 | 583 | if not isinstance(variant, str): 584 | return error_response(f'"variant" value must be a string rather than a {type(variant)}.\n', source=tool_name) 585 | 586 | genome_version = params.get("hg") 587 | if not genome_version: 588 | return error_response(f'"hg" not specified. The URL must include an "hg" arg: hg=37 or hg=38. For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 589 | 590 | if genome_version not in ("37", "38"): 591 | return error_response(f'Invalid "hg" value: "{genome_version}". The value must be either "37" or "38". For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 592 | 593 | distance_param = params.get("distance", SPLICEAI_DEFAULT_DISTANCE) 594 | try: 595 | distance_param = int(distance_param) 596 | except Exception as e: 597 | return error_response(f'Invalid "distance": "{distance_param}". The value must be an integer.\n', source=tool_name) 598 | 599 | if distance_param > SPLICEAI_MAX_DISTANCE_LIMIT: 600 | return error_response(f'Invalid "distance": "{distance_param}". The value must be < {SPLICEAI_MAX_DISTANCE_LIMIT}.\n', source=tool_name) 601 | 602 | mask_param = params.get("mask", str(SPLICEAI_DEFAULT_MASK)) 603 | if mask_param not in ("0", "1"): 604 | return error_response(f'Invalid "mask" value: "{mask_param}". The value must be either "0" or "1". 
For example: {SPLICEAI_EXAMPLE}\n', source=tool_name) 605 | 606 | if request.remote_addr not in DISABLE_LOGGING_FOR_IPS: 607 | print(f"{logging_prefix}: {request.remote_addr}: ======================", flush=True) 608 | print(f"{logging_prefix}: {request.remote_addr}: {variant} processing with hg={genome_version}, " 609 | f"distance={distance_param}, mask={mask_param}", flush=True) 610 | 611 | # check REDIS cache before processing the variant 612 | results = get_splicing_scores_from_redis(tool_name, variant, genome_version, distance_param, mask_param) 613 | if not results: 614 | try: 615 | if tool_name == "spliceai": 616 | results = get_spliceai_scores(variant, genome_version, distance_param, int(mask_param)) 617 | elif tool_name == "pangolin": 618 | pangolin_mask_param = "True" if mask_param == "1" else "False" 619 | results = get_pangolin_scores(variant, genome_version, distance_param, pangolin_mask_param) 620 | else: 621 | raise ValueError(f"Invalid tool_name: {tool_name}") 622 | except Exception as e: 623 | traceback.print_exc() 624 | return error_response(f"ERROR: {e}", source=tool_name) 625 | 626 | if "error" not in results: 627 | add_splicing_scores_to_redis(tool_name, variant, genome_version, distance_param, mask_param, results) 628 | 629 | response_json = {} 630 | response_json.update(params) # copy input params to output 631 | response_json.update(results) 632 | 633 | duration = str(datetime.now() - start_time) 634 | response_json['duration'] = duration 635 | 636 | if request.remote_addr not in DISABLE_LOGGING_FOR_IPS: 637 | print(f"{logging_prefix}: {request.remote_addr}: {variant} took {duration}", flush=True) 638 | 639 | return Response(json.dumps(response_json), status=200, mimetype='application/json') 640 | 641 | 642 | LIFTOVER_EXAMPLE = f"/liftover/?hg=hg19-to-hg38&format=interval&chrom=chr8&start=140300615&end=140300620" 643 | 644 | CHAIN_FILE_PATHS = { 645 | "hg19-to-hg38": "hg19ToHg38.over.chain.gz", 646 | "hg38-to-hg19": "hg38ToHg19.over.chain.gz", 647 | "hg38-to-t2t": "hg38ToHs1.over.chain.gz", # replaced hg38-chm13v2.over.chain.gz based on advice from Giulio Genovese 648 | "t2t-to-hg38": "hs1ToHg38.over.chain.gz", # replaced chm13v2-hg38.over.chain.gz based on advice from Giulio Genovese 649 | } 650 | 651 | LIFTOVER_REFERENCE_PATHS = { 652 | "hg19-to-hg38": (HG19_FASTA_PATH, HG38_FASTA_PATH), 653 | "hg38-to-hg19": (HG38_FASTA_PATH, HG19_FASTA_PATH), 654 | "hg38-to-t2t": (HG38_FASTA_PATH, T2T_FASTA_PATH), 655 | "t2t-to-hg38": (T2T_FASTA_PATH, HG38_FASTA_PATH), 656 | } 657 | 658 | def run_variant_liftover_tool(hg, chrom, pos, ref, alt, verbose=False): 659 | if hg not in CHAIN_FILE_PATHS or hg not in LIFTOVER_REFERENCE_PATHS: 660 | raise ValueError(f"Unexpected hg arg value: {hg}") 661 | chain_file_path = CHAIN_FILE_PATHS[hg] 662 | source_fasta_path, destination_fasta_path = LIFTOVER_REFERENCE_PATHS[hg] 663 | 664 | with tempfile.NamedTemporaryFile(suffix=".vcf", mode="wt", encoding="UTF-8") as input_file, \ 665 | tempfile.NamedTemporaryFile(suffix=".vcf", mode="rt", encoding="UTF-8") as output_file: 666 | 667 | # command syntax: liftOver oldFile map.chain newFile unMapped 668 | if hg == "hg19-to-hg38": 669 | chrom = chrom.replace("chr", "") 670 | else: 671 | chrom = "chr" + chrom.replace("chr", "") 672 | 673 | input_file.write(f"""##fileformat=VCFv4.2 674 | ##contig=<ID={chrom}> 675 | #CHROM POS ID REF ALT QUAL FILTER INFO 676 | {chrom} {pos} . 
{ref} {alt} 60 .""") 677 | input_file.flush() 678 | command = ( 679 | f"cat {input_file.name} | " 680 | f"bcftools plugin liftover -- --src-fasta-ref {source_fasta_path} --fasta-ref {destination_fasta_path} --chain {chain_file_path} | " 681 | f"grep -v ^# > {output_file.name}" 682 | ) 683 | 684 | try: 685 | subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, encoding="UTF-8") 686 | results = output_file.read() 687 | 688 | if verbose: 689 | print(f"{BCFTOOLS_LIFTOVER_TOOL} {hg} liftover on {chrom}:{pos} {ref}>{alt} returned: {results}", flush=True) 690 | 691 | # example: chr8 140300616 . T G 60 . . 692 | 693 | result_fields = results.strip().split("\t") 694 | if len(result_fields) > 5: 695 | result_fields[1] = int(result_fields[1]) 696 | 697 | return { 698 | "hg": hg, 699 | "chrom": chrom, 700 | "start": int(pos) - 1, 701 | "end": pos, 702 | "output_chrom": result_fields[0], 703 | "output_pos": result_fields[1], 704 | "output_ref": result_fields[3], 705 | "output_alt": result_fields[4], 706 | "liftover_tool": BCFTOOLS_LIFTOVER_TOOL, 707 | #"output_strand": "-" if "SWAP=-1" in results else "+", 708 | } 709 | 710 | except Exception as e: 711 | variant = f"{hg} {chrom}:{pos} {ref}>{alt}" 712 | print(f"ERROR in {BCFTOOLS_LIFTOVER_TOOL} for {variant}: {e}") 713 | print("Falling back on UCSC liftover tool..") 714 | #traceback.print_exc() 715 | #raise ValueError(f"liftOver command failed for {variant}: {e}") 716 | 717 | # if bcftools liftover failed, fall back on running UCSC liftover 718 | chrom = "chr" + chrom.replace("chr", "") 719 | result = run_UCSC_liftover_tool(hg, chrom, int(pos)-1, pos, verbose=False) 720 | result["output_ref"] = ref 721 | result["output_alt"] = alt 722 | #if result["output_strand"] == "-": 723 | # result["output_ref"] = reverse_complement(result["output_ref"]) 724 | # result["output_alt"] = reverse_complement(result["output_alt"]) 725 | return result 726 | 727 | 728 | def run_UCSC_liftover_tool(hg, chrom, start, end, verbose=False): 729 | if hg not in CHAIN_FILE_PATHS: 730 | raise ValueError(f"Unexpected hg arg value: {hg}") 731 | chain_file_path = CHAIN_FILE_PATHS[hg] 732 | 733 | reason_liftover_failed = "" 734 | with tempfile.NamedTemporaryFile(suffix=".bed", mode="wt", encoding="UTF-8") as input_file, \ 735 | tempfile.NamedTemporaryFile(suffix=".bed", mode="rt", encoding="UTF-8") as output_file, \ 736 | tempfile.NamedTemporaryFile(suffix=".bed", mode="rt", encoding="UTF-8") as unmapped_output_file: 737 | 738 | # command syntax: liftOver oldFile map.chain newFile unMapped 739 | chrom = "chr" + chrom.replace("chr", "") 740 | input_file.write("\t".join(map(str, [chrom, start, end, ".", "0", "+"])) + "\n") 741 | input_file.flush() 742 | command = f"liftOver {input_file.name} {chain_file_path} {output_file.name} {unmapped_output_file.name}" 743 | 744 | try: 745 | subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, encoding="UTF-8") 746 | results = output_file.read() 747 | if verbose: 748 | print(f"{UCSC_LIFTOVER_TOOL} {hg} liftover on {chrom}:{start}-{end} returned: {results}", flush=True) 749 | 750 | result_fields = results.strip().split("\t") 751 | if len(result_fields) > 5: 752 | result_fields[1] = int(result_fields[1]) 753 | result_fields[2] = int(result_fields[2]) 754 | 755 | return { 756 | "hg": hg, 757 | "chrom": chrom, 758 | "pos": int(start) + 1, 759 | "start": start, 760 | "end": end, 761 | "output_chrom": result_fields[0], 762 | "output_pos": int(result_fields[1]) + 1, 763 | "output_start": result_fields[1], 764 | 
"output_end": result_fields[2], 765 | "output_strand": result_fields[5], 766 | "liftover_tool": UCSC_LIFTOVER_TOOL, 767 | } 768 | else: 769 | reason_liftover_failed = unmapped_output_file.readline().replace("#", "").strip() 770 | 771 | except Exception as e: 772 | variant = f"{hg} {chrom}:{start}-{end}" 773 | print(f"ERROR during liftover for {variant}: {e}") 774 | traceback.print_exc() 775 | raise ValueError(f"liftOver command failed for {variant}: {e}") 776 | 777 | if reason_liftover_failed: 778 | raise ValueError(f"{hg} liftover failed for {chrom}:{start}-{end} {reason_liftover_failed}") 779 | else: 780 | raise ValueError(f"{hg} liftover failed for {chrom}:{start}-{end} for unknown reasons") 781 | 782 | 783 | def get_liftover_from_redis(key): 784 | if REDIS is None: 785 | return None 786 | 787 | results = None 788 | try: 789 | results_string = REDIS.get(key) 790 | if results_string: 791 | results = json.loads(results_string) 792 | except Exception as e: 793 | print(f"Redis error: {e}", flush=True) 794 | 795 | return results 796 | 797 | 798 | def add_liftover_to_redis(key, result): 799 | if REDIS is None: 800 | return 801 | 802 | try: 803 | results_string = json.dumps(result) 804 | REDIS.set(key, results_string) 805 | except Exception as e: 806 | print(f"Redis error: {e}", flush=True) 807 | 808 | 809 | @app.route("/liftover/", methods=['POST', 'GET']) 810 | def run_liftover(): 811 | logging_prefix = datetime.now().strftime("%m/%d/%Y %H:%M:%S") + f" t{os.getpid()}" 812 | 813 | # check params 814 | params = {} 815 | if request.values: 816 | params.update(request.values) 817 | 818 | if "format" not in params: 819 | params.update(request.get_json(force=True, silent=True) or {}) 820 | 821 | error_message = exceeds_rate_limit(request.remote_addr, request_type="liftover:total") 822 | if error_message: 823 | print(f"{logging_prefix}: {request.remote_addr}: response: {error_message}", flush=True) 824 | return error_response(error_message) 825 | 826 | VALID_HG_VALUES = set(CHAIN_FILE_PATHS.keys()) 827 | hg = params.get("hg") 828 | if not hg or hg not in VALID_HG_VALUES: 829 | return error_response(f'"hg" param error. It should be set to {" or ".join(VALID_HG_VALUES)}. For example: {LIFTOVER_EXAMPLE}\n') 830 | 831 | VALID_FORMAT_VALUES = ("interval", "variant", "position") 832 | format = params.get("format", "") 833 | if not format or format not in VALID_FORMAT_VALUES: 834 | return error_response(f'"format" param error. It should be set to {" or ".join(VALID_FORMAT_VALUES)}. 
For example: {LIFTOVER_EXAMPLE}\n') 835 | 836 | chrom = params.get("chrom") 837 | if not chrom: 838 | return error_response(f'"chrom" param not specified') 839 | 840 | if format == "interval": 841 | for key in "start", "end": 842 | if not params.get(key): 843 | return error_response(f'"{key}" param not specified') 844 | start = params.get("start") 845 | end = params.get("end") 846 | redis_key = f"{hg}:{chrom}:{start}:{end}" 847 | variant_log_string = f"{start}-{end}" 848 | 849 | elif format == "position": 850 | try: 851 | pos = int(params["pos"]) 852 | except Exception as e: 853 | return error_response(f'"pos" param error: {e}') 854 | 855 | start = pos - 1 856 | end = pos 857 | redis_key = f"{hg}:{chrom}:{pos}" 858 | variant_log_string = f"{pos} " 859 | elif format == "variant": 860 | for key in "pos", "ref", "alt": 861 | if not params.get(key): 862 | return error_response(f'"{key}" param not specified') 863 | pos = params.get("pos") 864 | ref = params.get("ref") 865 | alt = params.get("alt") 866 | redis_key = f"{hg}:{chrom}:{pos}:{ref}:{alt}" 867 | variant_log_string = f"{pos} {ref}>{alt}" 868 | 869 | verbose = request.remote_addr not in DISABLE_LOGGING_FOR_IPS 870 | if verbose: 871 | print(f"{logging_prefix}: {request.remote_addr}: ======================", flush=True) 872 | print(f"{logging_prefix}: {request.remote_addr}: {hg} liftover {format}: {chrom}:{variant_log_string}", flush=True) 873 | 874 | # check REDIS cache before processing the variant 875 | result = get_liftover_from_redis(redis_key) 876 | if result and verbose: 877 | print(f"{hg} liftover on {variant_log_string} got results from cache: {result}", flush=True) 878 | 879 | if not result: 880 | try: 881 | if format == "variant": 882 | result = run_variant_liftover_tool(hg, chrom, pos, ref, alt, verbose=verbose) 883 | else: 884 | result = run_UCSC_liftover_tool(hg, chrom, start, end, verbose=verbose) 885 | except Exception as e: 886 | return error_response(str(e)) 887 | 888 | add_liftover_to_redis(redis_key, result) 889 | 890 | result.update(params) 891 | 892 | return Response(json.dumps(result), mimetype='application/json') 893 | 894 | # share static files from the annotations folder to support local installs 895 | @app.route('/annotations/', strict_slashes=False, defaults={'path': ''}) 896 | @app.route('/annotations/<path>') 897 | def send_annotations(path): 898 | if os.path.isfile(os.path.join("annotations", path)): 899 | return send_from_directory('annotations', path) 900 | 901 | # return an html table of available annotation files 902 | html = "<html><head><title>SpliceAI-lookup: Annotation Files</title></head><body>" 903 | html += "<table>" 904 | html += "<tr><th>./annotation files</th><th>last updated</th></tr>" 905 | for filename in os.listdir("annotations"): 906 | html += f"<tr><td>{filename}</td>" 907 | last_modified = datetime.fromtimestamp(os.path.getmtime(os.path.join('annotations', filename))) 908 | html += f"<td>{last_modified.strftime('%Y-%m-%d %H:%M:%S')}</td></tr>" 909 | html += "</table></body></html>"
910 | 911 | return Response(html, mimetype='text/html') 912 | 913 | 914 | 915 | @app.route('/', strict_slashes=False, defaults={'path': ''}) 916 | @app.route('/<path>/') 917 | def catch_all(path): 918 | if not path: 919 | path = "index.html" 920 | 921 | if path in {"index.html", "igv.min.js"}: 922 | with open(path, "rt") as f: 923 | html = f.read() 924 | return Response(html, mimetype='text/html') 925 | elif path == "favicon.ico": 926 | return send_from_directory('', 'favicon.ico') 927 | else: 928 | with open("README.md") as f: 929 | return markdown2.markdown(f.read()) 930 | 931 | 932 | print("Initialization completed.", flush=True) 933 | 934 | if __name__ == "__main__": 935 | app.run(debug=DEBUG, host='0.0.0.0', port=int(os.environ.get('PORT', 8080))) 936 | --------------------------------------------------------------------------------