├── data ├── accessions_no_params.csv ├── accessions_large_n2.csv ├── gcp │ ├── star_indices_hg.csv │ ├── barcodes_n2.csv │ ├── barcodes.csv │ └── star_indices.csv ├── accessions_no-read2.csv ├── accessions_small_n2.csv ├── star_indices_hg.csv ├── accessions_small_n3.csv ├── accessions_org_n2.csv ├── barcodes_n2.csv ├── accessions_problems.csv ├── accessions_small_n10.csv ├── barcodes.csv ├── accessions_small_n2_params.csv ├── accessions_var.csv ├── star_indices.csv └── accessions_all-org.csv ├── envs ├── read_qc.yml ├── star.yml ├── download.yml └── tiledb.yml ├── .gitignore ├── scripts ├── gcp-upload │ ├── conda-env.yml │ ├── config │ │ ├── utils.config │ │ ├── process.config │ │ └── profiles.config │ ├── nextflow.config │ ├── bin │ │ ├── agg-obs-metadata.py │ │ └── db-to-parquet.py │ ├── README.md │ └── main.nf ├── tiledb-loader │ ├── conda-env.yml │ ├── config │ │ ├── utils.config │ │ ├── process.config │ │ └── profiles.config │ ├── nextflow.config │ ├── main.nf │ ├── README.md │ └── bin │ │ ├── mtx-to-h5ad.py │ │ ├── h5ad-to-db.py │ │ └── find-mtx.py ├── gcp-loader-tahoe100.py ├── gcp-find-soft-delete.py ├── tiledb-loader-tahoe100.py ├── gcp2chimera.py ├── search-cloud-run-job-logs.py ├── purge-srx.py ├── extract-from-result-files.py └── acc2srr.py ├── docker ├── sc-recounter-run │ ├── environment.yml │ ├── entrypoint.sh │ ├── Dockerfile │ ├── README.md │ └── cleanup.py ├── README.md ├── sc-recounter-star │ ├── README.md │ └── Dockerfile └── sc-recounter-download │ ├── README.md │ └── Dockerfile ├── config ├── utils.config ├── process.config └── profiles.config ├── lib ├── utils.nf ├── download.groovy ├── star_params.groovy └── utils.groovy ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── LICENSE ├── workflows ├── read_qc.nf ├── db_acc.nf ├── reads.nf └── download.nf ├── bin ├── csv-merge.py ├── subsample.py ├── upload-final-star-params.py ├── format-star-params.py ├── star-summary.py ├── sra-stat.py ├── parallel-fastq-dump.py ├── get-db-accessions.py └── prefetch.py ├── main.nf ├── nextflow.config └── README.md /data/accessions_no_params.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX19162973,SRR23215162,human -------------------------------------------------------------------------------- /data/accessions_large_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | sample1,SRR13711613,human 3 | sample2,SRR13960234,human -------------------------------------------------------------------------------- /data/gcp/star_indices_hg.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,gs://arc-ctc-references/STAR/star_refData_2020_hg38/ -------------------------------------------------------------------------------- /data/accessions_no-read2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | no_read2_s1,SRR25778815,human 3 | no_read2_s2,ERR11746860,human -------------------------------------------------------------------------------- /data/accessions_small_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX24914804,SRR13112659,human 3 | SRX24914805,SRR13112660,human -------------------------------------------------------------------------------- /data/star_indices_hg.csv: 
-------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 3 | -------------------------------------------------------------------------------- /data/accessions_small_n3.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX24914804,SRR13112659,human 3 | SRX24914805,SRR13112660,human 4 | SRX20274301,SRR24488917,mouse -------------------------------------------------------------------------------- /envs/read_qc.yml: -------------------------------------------------------------------------------- 1 | name: read_qc 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - seqkit=2.8 -------------------------------------------------------------------------------- /data/accessions_org_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,entrez_id,tech_10x,organism 2 | ERX10987225,ERR11583756,34046074,3_prime_gex,human 3 | ERX10987225,ERR11583807,34046074,3_prime_gex,human 4 | -------------------------------------------------------------------------------- /data/gcp/barcodes_n2.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,gs://arc-ctc-references/cellranger/barcodes/737K-august-2016.txt 3 | 737K-arc-v1,16,12,gs://arc-ctc-references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /data/barcodes_n2.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,/common_datasets/external/references/cellranger/barcodes/3M-february-2018.txt -------------------------------------------------------------------------------- /envs/star.yml: -------------------------------------------------------------------------------- 1 | name: star 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - star=2.7 9 | - psycopg2-binary=2.9 10 | - pypika=0.48 11 | - python-dotenv=1.0 12 | - google-cloud-secret-manager=2.22 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ._* 2 | *~ 3 | *.log 4 | .env 5 | .gcp/ 6 | tmp/ 7 | TMP/ 8 | logs/ 9 | work/ 10 | results/ 11 | archive/ 12 | star_ref/ 13 | ignore/ 14 | .nextflow/ 15 | .nextflow* 16 | screenlog.* 17 | *.pyc 18 | SRAgent/ 19 | scripts/db_utils.py 20 | notebooks/tiledb/ 21 | docker/sc-recounter-run/db_utils.py -------------------------------------------------------------------------------- /envs/download.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - seqkit=2.8 9 | - sra-tools=3.1 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /envs/tiledb.yml: 
-------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - libtiledbsoma=1.15 9 | - scanpy=1.10 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /data/accessions_problems.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX23875714,SRR28265595,human 3 | SRX23875714,SRR28265593,human 4 | SRX23875714,SRR28265594,human 5 | SRX24608680,SRR29084389,mouse 6 | SRX23103788,SRR27431357,human 7 | SRX21883929,SRR26171850,mouse 8 | ERX11662357,ERR12251650,human 9 | ERX11662357,ERR12252000,human -------------------------------------------------------------------------------- /data/accessions_small_n10.csv: -------------------------------------------------------------------------------- 1 | sample,accession 2 | SRX9556570,SRR13112659 3 | SRX9556570,SRR13112660 4 | SRX9556569,SRR13112650 5 | SRX9556569,SRR13112649 6 | SRX9556572,SRR13112685 7 | SRX9556572,SRR13112687 8 | SRX9556544,SRR13112280 9 | SRX9556544,SRR13112285 10 | SRX9556547,SRR13112329 11 | SRX9556558,SRR13112490 -------------------------------------------------------------------------------- /scripts/gcp-upload/conda-env.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - pyarrow=19.0.1 9 | - scanpy=1.10 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - pandas=2.2 8 | - psycopg2-binary=2.9 9 | - pypika=0.48 10 | - python-dotenv=1.0 11 | - google-cloud-secret-manager=2.22 12 | - google-cloud-storage=2.19.0 13 | - nextflow=24.10 14 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/conda-env.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - scanpy=1.10 9 | - psycopg2-binary=2.9 10 | - pypika=0.48 11 | - python-dotenv=1.0 12 | - google-cloud-secret-manager=2.22 13 | - tiledb==2.27.0 14 | - tiledbsoma-py==1.15.4 15 | -------------------------------------------------------------------------------- /config/utils.config: -------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "scRecounter" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Nextflow pipeline for re-processing public single-cell data" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | -------------------------------------------------------------------------------- /scripts/gcp-upload/config/utils.config: 
-------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "gcp-loader" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Load data onto GCP" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | mail { 17 | smtp.host = "chimera-admin" 18 | smtp.port = 25 19 | } 20 | -------------------------------------------------------------------------------- /lib/utils.nf: -------------------------------------------------------------------------------- 1 | process SRA_STAT { 2 | label "download_env" 3 | errorStrategy { task.attempt <= maxRetries ? 'retry' : 'ignore' } 4 | disk 10.GB 5 | 6 | input: 7 | tuple val(sample), val(accession), val(metadata) 8 | 9 | output: 10 | tuple val(sample), val(accession), path("sra-stat.csv") 11 | 12 | script: 13 | """ 14 | sra-stat.py ${accession} 15 | """ 16 | 17 | stub: 18 | """ 19 | touch sra-stat.csv 20 | """ 21 | } -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/utils.config: -------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "tiledb-loader" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Load data into tiledb-soma database" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | mail { 17 | smtp.host = "chimera-admin" 18 | smtp.port = 25 19 | } 20 | -------------------------------------------------------------------------------- /data/gcp/barcodes.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,gs://arc-ctc-references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-february-2018.txt 4 | 3M-5pgex-jan-2023,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-5pgex-jan-2023.txt 5 | 3M-3pgex-may-2023,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-3pgex-may-2023.txt 6 | 737K-arc-v1,16,12,gs://arc-ctc-references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Custom docker containers for the scRecounter pipeline 2 | ===================================================== 3 | 4 | This directory contains the Dockerfiles for the custom docker containers used in the scRecounter pipeline. 5 | The containers are built on top of the official [micromamba docker images](https://hub.docker.com/r/mambaorg/micromamba) 6 | and contain the necessary dependencies for the pipeline to run. 7 | 8 | See the README files in each subdirectory for more information on the individual containers.
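For reference, the main pipeline selects these images per process via labels; the snippet below is an abridged excerpt of `config/process.config` from this repository, showing how each label maps to its container image and conda env file:

```groovy
process {
    // processes labeled "download_env" run in the sc-recounter-download image
    withLabel:download_env {
        container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0"
        conda     = "envs/download.yml"
    }
    // processes labeled "star_env" run in the sc-recounter-star image
    withLabel:star_env {
        container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-star/sc-recounter-star:0.1.0"
        conda     = "envs/star.yml"
    }
}
```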
-------------------------------------------------------------------------------- /data/barcodes.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,/common_datasets/external/references/cellranger/barcodes/3M-february-2018.txt 4 | 3M-5pgex-jan-2023,16,12,/common_datasets/external/references/cellranger/barcodes/3M-5pgex-jan-2023.txt 5 | 3M-3pgex-may-2023,16,12,/common_datasets/external/references/cellranger/barcodes/3M-3pgex-may-2023.txt 6 | 737K-arc-v1,16,12,/common_datasets/external/references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /scripts/gcp-upload/config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | conda = "conda-env.yml" 7 | 8 | cpus = 1 9 | memory = 2.GB 10 | time = 1.h 11 | 12 | withLabel:process_low { 13 | cpus = 2 14 | memory = { 8.GB * task.attempt } 15 | time = { 2.h * task.attempt } 16 | } 17 | withLabel:process_high { 18 | cpus = { 4 * task.attempt } 19 | memory = { 16.GB * task.attempt } 20 | time = { 2.h * task.attempt } 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature request]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[Bug]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Provide a code example and any sample input data (e.g. an H5AD) as an attachment to reproduce this behavior. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment** 23 | Paste in the output of `pip list` or `conda list` 24 | 25 | **Additional context** 26 | Add any other context about the problem here. 
27 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | conda = "conda-env.yml" 7 | 8 | cpus = 1 9 | memory = 2.GB 10 | time = 1.h 11 | 12 | withLabel:process_low { 13 | cpus = 2 14 | memory = { 8.GB * task.attempt } 15 | time = { 2.h * task.attempt } 16 | } 17 | withLabel:process_medium { 18 | cpus = { 8 * task.attempt } 19 | memory = { 160.GB + 96.GB * task.attempt } 20 | } 21 | withLabel:process_high { 22 | cpus = { 8 * task.attempt } 23 | memory = { 256.GB + 96.GB * task.attempt } 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /docker/sc-recounter-star/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-star container 2 | =========================== 3 | 4 | # Build and push to GCP Container Registry 5 | 6 | Env vars 7 | 8 | ```bash 9 | IMG_NAME=sc-recounter-star 10 | IMG_VERSION=0.1.0 11 | REGION="us-east1" 12 | PROJECT="c-tc-429521" 13 | ``` 14 | 15 | Build 16 | 17 | > from the base directory of the repository 18 | 19 | ```bash 20 | docker build \ 21 | --file docker/${IMG_NAME}/Dockerfile \ 22 | --build-arg CONDA_ENV_YAML=envs/star.yml \ 23 | --platform linux/amd64 \ 24 | --tag ${IMG_NAME}:${IMG_VERSION} \ 25 | . 26 | ``` 27 | 28 | Push 29 | 30 | ```bash 31 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 32 | ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 33 | && docker push ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/sc-recounter-download/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-download container 2 | =============================== 3 | 4 | # Build and push to GCP Container Registry 5 | 6 | Env vars 7 | 8 | ```bash 9 | IMG_NAME=sc-recounter-download 10 | IMG_VERSION=0.1.0 11 | REGION="us-east1" 12 | PROJECT="c-tc-429521" 13 | ``` 14 | 15 | Build 16 | 17 | > from the base directory of the repository 18 | 19 | ```bash 20 | docker build \ 21 | --file docker/${IMG_NAME}/Dockerfile \ 22 | --build-arg CONDA_ENV_YAML=envs/download.yml \ 23 | --platform linux/amd64 \ 24 | --tag ${IMG_NAME}:${IMG_VERSION} \ 25 | . 
26 | ``` 27 | 28 | Push 29 | 30 | ```bash 31 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 32 | ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 33 | && docker push ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/sc-recounter-star/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set user to root for installation (already root by default, but kept for clarity) 8 | USER root 9 | 10 | # Install OS-level packages 11 | RUN apt-get update -y \ 12 | && apt-get install -y build-essential procps curl \ 13 | && apt-get clean \ 14 | && apt-get purge \ 15 | && rm -rf /var/lib/apt/lists/* /tmp/* 16 | 17 | # Copy environment file 18 | ARG CONDA_ENV_YAML 19 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${CONDA_ENV_YAML} /tmp/environment.yml 20 | 21 | # Install the environment using micromamba 22 | RUN micromamba create -f /tmp/environment.yml \ 23 | && micromamba clean --all --yes \ 24 | && rm -rf /opt/conda/pkgs/* 25 | 26 | # Activate the environment by default 27 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 28 | ENV ENV_NAME=star -------------------------------------------------------------------------------- /docker/sc-recounter-download/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set user to root for installation (already root by default, but kept for clarity) 8 | USER root 9 | 10 | # Install OS-level packages 11 | RUN apt-get update -y \ 12 | && apt-get install -y build-essential procps curl \ 13 | && apt-get clean \ 14 | && apt-get purge \ 15 | && rm -rf /var/lib/apt/lists/* /tmp/* 16 | 17 | # Copy environment file 18 | ARG CONDA_ENV_YAML 19 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${CONDA_ENV_YAML} /tmp/environment.yml 20 | 21 | # Install the environment using micromamba 22 | RUN micromamba create -f /tmp/environment.yml \ 23 | && micromamba clean --all --yes \ 24 | && rm -rf /opt/conda/pkgs/* 25 | 26 | # Activate the environment by default 27 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 28 | ENV ENV_NAME=download -------------------------------------------------------------------------------- /data/accessions_small_n2_params.csv: -------------------------------------------------------------------------------- 1 | sample,fastq_1,fastq_2,barcodes_file,star_index,cell_barcode_length,umi_length,strand 2 | sample2,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/3f/fb5cc4ef344c6f077a941d7712250a/reads/SRR13112660_1.fastq,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/3f/fb5cc4ef344c6f077a941d7712250a/reads/SRR13112660_2.fastq,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38,16,10,Forward 3 | sample1,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/c9/c22d90b85c440f595fec26fa52ac75/reads/SRR13112659_1.fastq,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/c9/c22d90b85c440f595fec26fa52ac75/reads/SRR13112659_2.fastq,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38,16,10,Forward 
-------------------------------------------------------------------------------- /docker/sc-recounter-run/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create run name 4 | RUN_NAME="SCRECOUNTER_$(date +"%Y-%m-%d_%H-%M-%S")" 5 | 6 | # Set the profile list from the command line arguments 7 | PROFILE_LIST=$(IFS=,; echo "$*") 8 | 9 | # Activate the micromamba environment and run the pipeline 10 | micromamba run -n sc-recounter-run \ 11 | nextflow run main.nf \ 12 | -profile $PROFILE_LIST \ 13 | -name $RUN_NAME \ 14 | -work-dir "gs://arc-ctc-nextflow/scRecounter/prod/work/${RUN_NAME}" \ 15 | --output_dir "gs://arc-ctc-screcounter/prod3/${RUN_NAME}" \ 16 | -ansi-log false "$@" 17 | 18 | # Delete output directory if only nf-report and nf-trace 19 | export GCP_SQL_DB_HOST="35.243.133.29" 20 | export GCP_SQL_DB_NAME="sragent-prod" 21 | export GCP_SQL_DB_USERNAME="postgres" 22 | micromamba run -n sc-recounter-run \ 23 | python cleanup.py \ 24 | "gs://arc-ctc-nextflow/scRecounter/prod/work/${RUN_NAME}" \ 25 | "gs://arc-ctc-screcounter/prod3/${RUN_NAME}" 26 | -------------------------------------------------------------------------------- /data/accessions_var.csv: -------------------------------------------------------------------------------- 1 | query_accession,sample,accession,experiment,sra_study,bioproject,spots,spots_with_mates,avgLength,size_MB 2 | SRP256479,SRS6484446,SRR11549939,SRX8119841,SRP256479,PRJNA625518,17221838,0,76,445 3 | SRP256479,SRS6484447,SRR11549940,SRX8119842,SRP256479,PRJNA625518,18368896,0,76,475 4 | SRP256479,SRS6484396,SRR11550035,SRX8119791,SRP256479,PRJNA625518,13056540,13056540,152,681 5 | SRP256479,SRS6484399,SRR11550038,SRX8119794,SRP256479,PRJNA625518,15239694,15239694,152,910 6 | SRP256479,SRS6484441,SRR11550087,SRX8119836,SRP256479,PRJNA625518,101386661,101386661,92,2891 7 | SRP256479,SRS6484442,SRR11550088,SRX8119837,SRP256479,PRJNA625518,108258059,108258059,92,3068 8 | SRP256479,SRS7053374,SRR12280849,SRX8784719,SRP256479,PRJNA625518,15053071,0,76,418 9 | SRP256479,SRS7053381,SRR12280856,SRX8784726,SRP256479,PRJNA625518,20917672,0,76,578 10 | SRP256479,SRS6484394,SRR11550033,SRX8119789,SRP256479,PRJNA625518,15656259,15656259,152,821 11 | SRP256479,SRS6484397,SRR11550036,SRX8119792,SRP256479,PRJNA625518,15255046,15255046,152,786 -------------------------------------------------------------------------------- /scripts/tiledb-loader/nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | input_dir = "" 5 | db_uri = "" 6 | log_dir = "logs" 7 | feature_type = "GeneFull_Ex50pAS" 8 | mtx_batch_size = 200 9 | h5ad_batch_size = 2 10 | missing_metadata = "skip" 11 | max_datasets = 10000 12 | } 13 | 14 | 15 | //-- Extra configs --// 16 | includeConfig "config/process.config" 17 | includeConfig "config/profiles.config" 18 | 19 | 20 | //-- Functions --// 21 | // Remove trailing forward slashes in a string 22 | def fmtPath(path_str) { 23 | return path_str.replaceAll(/\/+$/, '') 24 | } 25 | 26 | // Create the work directory 27 | def getWorkDir() { 28 | def userGroup = "id -gn".execute().text.trim() 29 | def userName = "whoami".execute().text.trim() 30 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/tiledb-loader" 31 | return workDir 32 | } 33 | 34 | def getCondaCacheDir() { 35 | def userName = "whoami".execute().text.trim() 36 | cacheDir = 
"/home/$userName/nextflow/conda-cache/tiledb-loader" 37 | return cacheDir 38 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Arc Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /workflows/read_qc.nf: -------------------------------------------------------------------------------- 1 | // Read QC workflow for fastq files 2 | workflow READ_QC_WF{ 3 | take: 4 | ch_fastq 5 | 6 | main: 7 | // Flatten the channel to process each read file separately 8 | ch_fastq_flat = ch_fastq.flatMap { sample, fastq_1, fastq_2 -> 9 | [ [sample, "R1", fastq_1], [sample, "R2", fastq_2] ] 10 | } 11 | 12 | // Run seqkit stats 13 | SEQKIT_STATS(ch_fastq_flat) 14 | .collectFile( 15 | name: "seqkit-stats.tsv", 16 | storeDir: file(params.output_dir) / "read_qc", 17 | newLine: false, keepHeader: true 18 | ) 19 | } 20 | 21 | // Run `seqkit stats` on fastq files 22 | process SEQKIT_STATS { 23 | container "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 24 | conda "envs/read_qc.yml" 25 | label "process_low" 26 | 27 | input: 28 | tuple val(sample), val(read), path("${sample}_${read}.fastq") 29 | 30 | output: 31 | path "${sample}_${read}.tsv" 32 | 33 | script: 34 | """ 35 | seqkit -j $task.cpus stats -a -T ${sample}_${read}.fastq > ${sample}_${read}.tsv 36 | """ 37 | 38 | stub: 39 | """ 40 | touch ${sample}_${read}.tsv 41 | """ 42 | } 43 | -------------------------------------------------------------------------------- /lib/download.groovy: -------------------------------------------------------------------------------- 1 | def readAccessions(accessions_input){ 2 | // Read the input CSV file with the sample names and SRA accessions 3 | ch_acc = accessions_input 4 | .splitCsv(header: true, sep: ",") 5 | .map { row -> 6 | def req_columns = ["sample", "accession"] 7 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 8 | if (miss_columns) { 9 | error "Missing columns in the input CSV file: ${miss_columns}" 10 | } 11 | // remove special characters from the sample name 12 | row.sample = row.sample.replaceAll("\\s", "_") 13 | def result = [row.sample, row.accession] 14 | // add optional, metadata columns 15 | def metadata = [:] 16 | ["organism", "tech_10x"].each { col -> 17 | 
metadata[col] = row.containsKey(col) ? row[col] : "" 18 | } 19 | result << metadata 20 | return result 21 | } 22 | 23 | // print srx values 24 | ch_acc 25 | .map{ sample, accession, metadata -> sample } 26 | .distinct() 27 | .collect() 28 | .map{ it.join(',') } 29 | .view{ "SRX accessions: ${it}" } 30 | 31 | return ch_acc 32 | } 33 | 34 | -------------------------------------------------------------------------------- /config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | cpus = 1 7 | memory = 2.GB 8 | time = 1.h 9 | 10 | withLabel:download_env { 11 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 12 | conda = "envs/download.yml" 13 | } 14 | withLabel:read_env { 15 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 16 | conda = "envs/read_qc.yml" 17 | } 18 | withLabel:star_env { 19 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-star/sc-recounter-star:0.1.0" 20 | conda = "envs/star.yml" 21 | } 22 | 23 | withLabel:process_low { 24 | cpus = 4 25 | memory = { 4.GB * task.attempt } 26 | time = { 4.h * task.attempt } 27 | } 28 | withLabel:process_medium { 29 | cpus = 8 30 | memory = { 36.GB * task.attempt } 31 | time = { 6.h * task.attempt } 32 | } 33 | withLabel:process_high { 34 | cpus = 8 35 | memory = { 72.GB * task.attempt } 36 | time = { 10.h * task.attempt } 37 | maxRetries = 3 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scripts/gcp-upload/nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | input_dir = "/processed_datasets/scRecount/scRecounter/prod3" 5 | output_dir = "gs://arc-ctc-scbasecamp/2025-02-25/" 6 | log_dir = "tmp/logs" 7 | feature_type = "GeneFull_Ex50pAS" 8 | missing_metadata = "error" 9 | tissue_categories = "data/2025-02-20_tissue_categories.csv" 10 | max_datasets = 0 11 | organisms = "" 12 | redo_processed = false 13 | update_db = true 14 | db_host = "35.243.133.29" 15 | db_name = "sragent-prod" 16 | db_username = "postgres" 17 | } 18 | 19 | 20 | //-- Extra configs --// 21 | includeConfig "config/process.config" 22 | includeConfig "config/profiles.config" 23 | 24 | //-- Functions --// 25 | // Remove trailing forward slashes in a string 26 | def fmtPath(path_str) { 27 | return path_str.replaceAll(/\/+$/, '') 28 | } 29 | 30 | // Create the work directory 31 | def getWorkDir() { 32 | def userGroup = "id -gn".execute().text.trim() 33 | def userName = "whoami".execute().text.trim() 34 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/gcp-loader" 35 | return workDir 36 | } 37 | 38 | def getCondaCacheDir() { 39 | def userName = "whoami".execute().text.trim() 40 | cacheDir = "/home/$userName/nextflow/conda-cache/gcp-loader" 41 | return cacheDir 42 | } -------------------------------------------------------------------------------- /workflows/db_acc.nf: -------------------------------------------------------------------------------- 1 | include { saveAsLog } from '../lib/utils.groovy' 2 | include { readAccessions } from '../lib/download.groovy' 3 | 4 | workflow DB_ACC_WF { 5 | main: 6 | // obtain accessions from the database 7 | ch_accessions = GET_DB_ACCESSIONS() 8 | 
ch_accessions.csv.ifEmpty { println 'No accessions found in the scRecounter database' } 9 | 10 | emit: 11 | ch_accessions.csv 12 | } 13 | 14 | // Save accessions csv 15 | def saveAsFinalAcc(filename) { 16 | if (filename.endsWith(".csv")){ 17 | filename = filename.tokenize("/").last() 18 | return "${filename}" 19 | } 20 | return null 21 | } 22 | 23 | process GET_DB_ACCESSIONS { 24 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "*.csv" 25 | publishDir file(params.output_dir), mode: "copy", overwrite: true, saveAs: { filename -> saveAsLog(filename) } 26 | label "download_env" 27 | disk 10.GB 28 | 29 | output: 30 | path "accessions.csv", emit: "csv" 31 | path "${task.process}.log", emit: "log" 32 | 33 | script: 34 | """ 35 | export GCP_SQL_DB_HOST="${params.db_host}" 36 | export GCP_SQL_DB_NAME="${params.db_name}" 37 | export GCP_SQL_DB_USERNAME="${params.db_username}" 38 | 39 | get-db-accessions.py \\ 40 | --organisms "${params.organisms}" \\ 41 | --max-srx ${params.max_samples} \\ 42 | 2>&1 | tee ${task.process}.log 43 | """ 44 | } 45 | 46 | -------------------------------------------------------------------------------- /workflows/reads.nf: -------------------------------------------------------------------------------- 1 | // Input workflow for processing paired-end reads 2 | workflow READS_WF{ 3 | main: 4 | // load csv and extract accessions 5 | ch_reads = Channel 6 | .fromPath(params.reads, checkIfExists: true) 7 | .splitCsv(header: true, sep: ",") 8 | .map { row -> 9 | def req_columns = ["sample", "fastq_1", "fastq_2"] 10 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 11 | if (miss_columns) { 12 | error "Missing columns in the input CSV file: ${miss_columns}" 13 | } 14 | return [row.sample, file(row.fastq_1), file(row.fastq_2)] 15 | }.groupTuple() 16 | .map { sample, fastq_1, fastq_2 -> 17 | return [sample, fastq_1.flatten(), fastq_2.flatten()] 18 | } 19 | 20 | // merge reads by sample 21 | MERGE_READS(ch_reads) 22 | 23 | emit: 24 | fastq = MERGE_READS.out 25 | } 26 | 27 | // Merge reads by sample; account for any differences in compression; check sequence formatting 28 | process MERGE_READS { 29 | conda "envs/read_qc.yml" 30 | 31 | input: 32 | tuple val(sample), path("*_read1.fq.gz"), path("*_read2.fq.gz") 33 | 34 | output: 35 | tuple val(sample), path("${sample}_R1.fq"), path("${sample}_R2.fq") 36 | 37 | script: 38 | """ 39 | seqkit seq *_read1.fq.gz > ${sample}_R1.fq 40 | seqkit seq *_read2.fq.gz > ${sample}_R2.fq 41 | """ 42 | 43 | stub: 44 | """ 45 | touch ${sample}_R1.fq ${sample}_R2.fq 46 | """ 47 | } 48 | -------------------------------------------------------------------------------- /bin/csv-merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import argparse 7 | import logging 8 | import pandas as pd 9 | 10 | # logging 11 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 12 | 13 | # argparse 14 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 15 | argparse.RawDescriptionHelpFormatter): 16 | pass 17 | 18 | desc = 'Merge csv files' 19 | epi = """DESCRIPTION: 20 | Merge multiple csv files into a single table. 
21 | """ 22 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 23 | formatter_class=CustomFormatter) 24 | parser.add_argument('csv_files', type=str, nargs='+', 25 | help='CSV files') 26 | parser.add_argument('--sample', type=str, default=None, 27 | help='Sample name') 28 | parser.add_argument('--outfile', type=str, default='merged.csv', 29 | help='Output file') 30 | 31 | # functions 32 | def main(args): 33 | # read in files 34 | tables = [pd.read_csv(f) for f in args.csv_files] 35 | # merge 36 | df = pd.concat(tables, ignore_index=True) 37 | # add sample name, if provided 38 | if args.sample: 39 | # add sample name 40 | df['sample'] = args.sample 41 | # reorder columns 42 | cols = ['sample'] + [c for c in df.columns if c != 'sample'] 43 | df = df[cols] 44 | # write 45 | df.to_csv(args.outfile, index=False) 46 | logging.info(f'Output written to: {args.outfile}') 47 | 48 | ## script main 49 | if __name__ == '__main__': 50 | args = parser.parse_args() 51 | main(args) -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | // Subworkflows 2 | include { DB_ACC_WF } from './workflows/db_acc.nf' 3 | include { STAR_PARAMS_WF } from './workflows/star_params.nf' 4 | include { STAR_FULL_WF } from './workflows/star_full.nf' 5 | include { SRA_STAT } from './lib/utils.nf' 6 | // util functions 7 | include { readAccessions; addStats; } from './lib/utils.groovy' 8 | 9 | // Main workflow 10 | workflow { 11 | if (params.accessions == "" || params.accessions == true) { 12 | // Obtain accessions from SRA 13 | println "No accessions provided. Accessions will be obtained from SRA." 14 | ch_accessions = DB_ACC_WF() 15 | } else { 16 | // Use the provided accessions 17 | println "Using provided accessions." 18 | ch_accessions = Channel.fromPath(params.accessions, checkIfExists: true) 19 | } 20 | 21 | // read accessions file 22 | ch_accessions = readAccessions(ch_accessions) 23 | 24 | // run sra-stat on accessions 25 | ch_sra_stat = SRA_STAT(ch_accessions) 26 | ch_accessions = addStats(ch_accessions, ch_sra_stat) 27 | 28 | // filter out any accessions with max SRA file size greater than the user-specified size 29 | ch_accessions = ch_accessions.filter { it[3] <= params.max_sra_size } 30 | 31 | // determine best STAR parameters on a subset of reads 32 | ch_star_params = STAR_PARAMS_WF(ch_accessions, ch_sra_stat) 33 | 34 | // run STAR on all reads with selected parameters 35 | if (! params.define){ 36 | STAR_FULL_WF(ch_accessions, ch_star_params) 37 | } 38 | } 39 | 40 | // On complete 41 | workflow.onComplete { 42 | println "Pipeline completed at: $workflow.complete" 43 | println "Execution status: ${ workflow.success ? 
'OK' : 'failed' }" 44 | } 45 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set working directory 8 | WORKDIR /app 9 | 10 | # Set user to root for installation 11 | USER root 12 | 13 | # Install OS-level packages (if needed) 14 | RUN apt-get update -y \ 15 | && apt-get install -y build-essential procps curl \ 16 | && apt-get clean \ 17 | && apt-get purge \ 18 | && rm -rf /var/lib/apt/lists/* /tmp/* 19 | 20 | # variables 21 | ARG BASE_DIR="docker/sc-recounter-run" 22 | 23 | # Copy environment file into container 24 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${BASE_DIR}/environment.yml /tmp/environment.yml 25 | 26 | # Create conda environment with micromamba 27 | RUN micromamba create -n sc-recounter-run -f /tmp/environment.yml --quiet \ 28 | && micromamba clean --all --yes \ 29 | && rm -rf /opt/conda/pkgs/* 30 | 31 | # Activate environment by default 32 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 33 | ENV ENV_NAME=sc-recounter-run 34 | 35 | # Copy Nextflow pipeline and the Python runner script 36 | COPY --chown=$MAMBA_USER:$MAMBA_USER main.nf nextflow.config . 37 | COPY --chown=$MAMBA_USER:$MAMBA_USER bin/ ./bin/ 38 | COPY --chown=$MAMBA_USER:$MAMBA_USER config/ ./config/ 39 | COPY --chown=$MAMBA_USER:$MAMBA_USER data/ ./data/ 40 | COPY --chown=$MAMBA_USER:$MAMBA_USER lib/ ./lib/ 41 | COPY --chown=$MAMBA_USER:$MAMBA_USER workflows/ ./workflows/ 42 | 43 | # Copy runner scripts 44 | COPY bin/db_utils.py ${BASE_DIR}/entrypoint.sh ${BASE_DIR}/cleanup.py ./ 45 | 46 | # Create a directory for the mamba cache 47 | RUN mkdir -p /.cache/mamba/ /app/.nextflow/ /scratch/ \ 48 | && chmod -R ugo+rwx /.cache/mamba/ /app/.nextflow/ /scratch/ 49 | 50 | # Set the NXF_HOME environment variable 51 | ENV NXF_HOME=/app/.nextflow 52 | 53 | # Set user to mamba 54 | ENTRYPOINT ["bash", "entrypoint.sh"] -------------------------------------------------------------------------------- /data/gcp/star_indices.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,gs://arc-ctc-references/STAR/star_refData_2020_hg38/ 3 | mouse,gs://arc-ctc-references/STAR/star_refData_2020_mm10/ 4 | Macaca_mulatta,gs://arc-ctc-references/STAR/star_refData_2019_mmul10/MMUL-10_scRecount/ 5 | Anopheles_gambiae,gs://arc-ctc-references/STAR/Anopheles_gambiae/AgamP4/ 6 | Arabidopsis_thaliana,gs://arc-ctc-references/STAR/Arabidopsis_thaliana/TAIR10/ 7 | Bos_taurus,gs://arc-ctc-references/STAR/Bos_taurus/ARS-UCD1.3/ 8 | Caenorhabditis_elegans,gs://arc-ctc-references/STAR/Caenorhabditis_elegans/WBcel235/ 9 | Callithrix_jacchus,gs://arc-ctc-references/STAR/Callithrix_jacchus/mCalJac1.pat.X/ 10 | Canis_lupus_familiaris,gs://arc-ctc-references/STAR/Canis_lupus_familiaris/ROS_Cfam_1.0/ 11 | Danio_rerio,gs://arc-ctc-references/STAR/Danio_rerio/GRCz11/ 12 | Drosophila_melanogaster,gs://arc-ctc-references/STAR/Drosophila_melanogaster/BDGP6.46/ 13 | Equus_caballus,gs://arc-ctc-references/STAR/Equus_caballus/EquCab3.0/ 14 | Gallus_gallus,gs://arc-ctc-references/STAR/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/ 15 | Gorilla_gorilla,gs://arc-ctc-references/STAR/Gorilla_gorilla/gorGor4/ 16 | Heterocephalus_glaber,gs://arc-ctc-references/STAR/Heterocephalus_glaber/Naked_mole-rat_maternal/ 17 | 
Oryctolagus_cuniculus,gs://arc-ctc-references/STAR/Oryctolagus_cuniculus/OryCun2.0/ 18 | Oryza_sativa,gs://arc-ctc-references/STAR/Oryza_sativa/IRGSP-1.0/ 19 | Ovis_aries,gs://arc-ctc-references/STAR/Ovis_aries/ARS-UI_Ramb_v2.0/ 20 | Pan_troglodytes,gs://arc-ctc-references/STAR/Pan_troglodytes/Pan_tro_3.0/ 21 | Rattus_norvegicus,gs://arc-ctc-references/STAR/Rattus_norvegicus/mRatBN7.2/ 22 | Saccharomyces_cerevisiae,gs://arc-ctc-references/STAR/Saccharomyces_cerevisiae/R64-1-1/ 23 | Schistosoma_mansoni,gs://arc-ctc-references/STAR/Schistosoma_mansoni/Smansoni_v7/ 24 | Solanum_lycopersicum,gs://arc-ctc-references/STAR/Solanum_lycopersicum/SL3.0/ 25 | Sus_scrofa,gs://arc-ctc-references/STAR/Sus_scrofa/Sscrofa11.1/ 26 | Xenopus_tropicalis,gs://arc-ctc-references/STAR/Xenopus_tropicalis/UCB_Xtro_10.0/ 27 | Zea_mays,gs://arc-ctc-references/STAR/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/ 28 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/main.nf: -------------------------------------------------------------------------------- 1 | workflow { 2 | // find target MTX files to add to the database 3 | FIND_MTX() 4 | 5 | // list target MTX files 6 | mtx_files = FIND_MTX.out.csv 7 | .splitCsv( header: true ) 8 | .map { row -> 9 | tuple( row["batch"], row["srx"], row["matrix_path"] ) 10 | } 11 | .groupTuple() 12 | 13 | // aggregate mtx files as h5ad 14 | MTX_TO_H5AD( mtx_files ) 15 | 16 | // add the h5ad files to the database 17 | H5AD_TO_DB( MTX_TO_H5AD.out.h5ad.buffer( size: params.h5ad_batch_size, remainder: true ) ) 18 | } 19 | 20 | process H5AD_TO_DB { 21 | publishDir file(params.log_dir), mode: "copy", overwrite: true 22 | label "process_medium" 23 | maxForks 1 24 | 25 | input: 26 | path "?.h5ad" 27 | 28 | output: 29 | path "h5ad_to_db.log", emit: log 30 | 31 | script: 32 | """ 33 | h5ad-to-db.py \\ 34 | --threads ${task.cpus} \\ 35 | --db-uri ${params.db_uri} \\ 36 | *.h5ad 2>&1 | tee h5ad_to_db.log 37 | """ 38 | } 39 | 40 | process MTX_TO_H5AD { 41 | publishDir file(params.log_dir) , mode: "copy", overwrite: true, pattern: "*.log" 42 | label "process_high" 43 | maxForks 4 44 | 45 | input: 46 | tuple val(batch), val(srx), val(mtx_path) 47 | 48 | output: 49 | path "data.h5ad", emit: h5ad 50 | path "mtx_to_h5ad_batch-${batch}.log", emit: log 51 | 52 | script: 53 | """ 54 | mtx-to-h5ad.py \\ 55 | --threads ${task.cpus} \\ 56 | --missing-metadata "${params.missing_metadata}" \\ 57 | --srx "$srx" \\ 58 | --path "$mtx_path" \\ 59 | 2>&1 | tee mtx_to_h5ad_batch-${batch}.log 60 | """ 61 | } 62 | 63 | process FIND_MTX { 64 | publishDir file(params.log_dir), mode: "copy", overwrite: true, pattern: "*.log" 65 | label "process_low" 66 | 67 | output: 68 | path "mtx_files.csv", emit: csv 69 | path "find_mtx.log", emit: log 70 | 71 | script: 72 | """ 73 | find-mtx.py \\ 74 | --feature-type ${params.feature_type} \\ 75 | --max-datasets ${params.max_datasets} \\ 76 | --batch-size ${params.mtx_batch_size} \\ 77 | --db-uri ${params.db_uri} \\ 78 | ${params.input_dir} \\ 79 | 2>&1 | tee find_mtx.log 80 | """ 81 | } -------------------------------------------------------------------------------- /scripts/gcp-loader-tahoe100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os, gc, sys, argparse, tempfile 3 | from glob import glob 4 | import scanpy as sc 5 | import gcsfs 6 | 7 | def main(input_dir, output_dir, temp_dir): 8 | h5ad_files = sorted(glob(os.path.join(input_dir, '*.h5ad.gz'))) 9 | if not 
h5ad_files: 10 | print("No .h5ad.gz files found in the input directory.") 11 | sys.exit(1) 12 | else: 13 | print(f"Found {len(h5ad_files)} .h5ad.gz files.", file=sys.stderr) 14 | 15 | to_keep = [ 16 | "sample", "gene_count", "tscp_count", "mread_count", "drugname_drugconc", 17 | "drug", "cell_line", "sublibrary", "BARCODE", "pcnt_mito", "S_score", 18 | "G2M_score", "phase", "pass_filter", "cell_name" 19 | ] 20 | 21 | fs = gcsfs.GCSFileSystem() 22 | os.makedirs(temp_dir, exist_ok=True) 23 | 24 | for infile in h5ad_files: 25 | print(f"Reading {infile}...", file=sys.stderr) 26 | adata = sc.read_h5ad(infile) 27 | adata.obs = adata.obs[to_keep] 28 | adata.obs['plate'] = os.path.basename(infile).split('_')[0] 29 | 30 | print(f"Writing temporary file...", file=sys.stderr) 31 | tmp_name = os.path.join(temp_dir, os.path.basename(infile)) 32 | adata.write_h5ad(tmp_name, compression='gzip') 33 | 34 | out_path = os.path.join(output_dir, os.path.basename(infile)) 35 | print(f"Uploading to {output_dir}...", file=sys.stderr) 36 | fs.put(tmp_name, out_path) 37 | 38 | print("Deleting anndata object and temporary file...", file=sys.stderr) 39 | del adata 40 | gc.collect() 41 | os.remove(tmp_name) 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description="Process and upload h5ad files to GCP.") 45 | parser.add_argument('-i', '--input', required=True, help="Input directory containing h5ad.gz files") 46 | parser.add_argument('-o', '--output', required=True, help="Output GCP directory") 47 | parser.add_argument('-t', '--temp', required=True, help="Temporary directory") 48 | args = parser.parse_args() 49 | main(args.input, args.output, args.temp) 50 | 51 | # example 52 | # ./gcp-loader-tahoe100.py -t /scratch/multiomics/nickyoungblut/gcp-loader/ -i /processed_datasets/scRecount/tahoe -o gs://arc-ctc-tahoe100/2025-02-25/h5ad -------------------------------------------------------------------------------- /workflows/download.nf: -------------------------------------------------------------------------------- 1 | include { readAccessions; } from '../lib/download.groovy' 2 | include { joinReads; addStats; } from '../lib/utils.nf' 3 | include { SRA_STAT } from '../lib/utils.nf' 4 | 5 | workflow DOWNLOAD_WF { 6 | take: 7 | ch_accessions 8 | 9 | main: 10 | // Run prefetch & fastq-dump 11 | ch_fqdump = FASTQ_DUMP(ch_accessions) 12 | 13 | /// Merge logs 14 | FQDUMP_LOG_MERGE(ch_fqdump.log.collect()) 15 | 16 | // Join R1 and R2 channels, which will filter out empty R2 records 17 | ch_fastq = joinReads(ch_fqdump.R1, ch_fqdump.R2) 18 | 19 | emit: 20 | fastq = ch_fastq 21 | } 22 | 23 | process FQDUMP_LOG_MERGE { 24 | publishDir file(params.output_dir) / "logs", mode: "copy", overwrite: true 25 | label "download_env" 26 | 27 | input: 28 | path "*_log.csv" 29 | 30 | output: 31 | path "fq-dump_summary.csv" 32 | 33 | script: 34 | """ 35 | csv-merge.py --outfile fq-dump_summary.csv *_log.csv 36 | """ 37 | 38 | stub: 39 | """ 40 | touch fq-dump_summary.csv 41 | """ 42 | } 43 | 44 | process FASTQ_DUMP { 45 | label "download_env" 46 | 47 | input: 48 | tuple val(sample), val(accession), val(metadata), val(sra_file_size_gb) 49 | 50 | output: 51 | tuple val(sample), val(accession), val(metadata), path("reads/read_1.fastq"), emit: "R1" 52 | tuple val(sample), val(accession), val(metadata), path("reads/read_2.fastq"), emit: "R2", optional: true 53 | path "reads/fq-dump_log.csv", emit: "log" 54 | 55 | script: 56 | """ 57 | export GCP_SQL_DB_HOST="${params.db_host}" 58 | export 
GCP_SQL_DB_NAME="${params.db_name}" 59 | export GCP_SQL_DB_USERNAME="${params.db_username}" 60 | 61 | fq-dump.py \\ 62 | --sample ${sample} \\ 63 | --accession ${accession} \\ 64 | --threads ${task.cpus} \\ 65 | --bufsize 10MB \\ 66 | --curcache 50MB \\ 67 | --mem 5GB \\ 68 | --temp TMP_FILES \\ 69 | --min-read-length ${params.min_read_len} \\ 70 | --maxSpotId ${params.max_spots} \\ 71 | --outdir reads \\ 72 | ${accession} 73 | 74 | # remove the temporary files 75 | rm -rf TMP_FILES 76 | """ 77 | 78 | stub: 79 | """ 80 | mkdir -p reads 81 | touch reads/${accession}_1.fastq reads/${accession}_2.fastq 82 | """ 83 | } 84 | -------------------------------------------------------------------------------- /data/star_indices.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 3 | mouse,/large_storage/goodarzilab/public/scRecount/genomes/star2.7.11_refData_2020_mm10 4 | Macaca_mulatta,/large_storage/goodarzilab/public/scRecount/genomes/MMUL-10_scRecount/MMUL-10_scRecount 5 | Anopheles_gambiae,/scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/star 6 | Arabidopsis_thaliana,/scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/star 7 | Bos_taurus,/scratch/multiomics/nickyoungblut/star_refs/Bos_taurus/star 8 | Caenorhabditis_elegans,/scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/star 9 | Callithrix_jacchus,/scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/star 10 | Canis_lupus_familiaris,/scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/star 11 | Danio_rerio,/scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/star 12 | Drosophila_melanogaster,/scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/star 13 | Equus_caballus,/scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/star 14 | Gallus_gallus,/scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/star 15 | Heterocephalus_glaber,/scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/star 16 | Oryctolagus_cuniculus,/scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/star 17 | Oryza_sativa,/scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/star 18 | Ovis_aries,/scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/star 19 | Pan_troglodytes,/scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/star 20 | Gorilla_gorilla,/scratch/multiomics/nickyoungblut/star_refs/Gorilla_gorilla/star 21 | Rattus_norvegicus,/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/star 22 | Saccharomyces_cerevisiae,/scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/star 23 | Schistosoma_mansoni,/scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/star 24 | Solanum_lycopersicum,/scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/star 25 | Sus_scrofa,/scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/star 26 | Troglodytes_gorilla,/scratch/multiomics/nickyoungblut/star_refs/Troglodytes_gorilla/star 27 | Xenopus_tropicalis,/scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/star 28 | Zea_mays,/scratch/multiomics/nickyoungblut/star_refs/Zea_mays/star -------------------------------------------------------------------------------- /scripts/gcp-upload/bin/agg-obs-metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 
6 | import argparse 7 | from uuid import uuid4 8 | from pathlib import Path 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | from pypika import Query, Table, Criterion 13 | ## package 14 | from db_utils import db_connect 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # classes 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # functions 24 | def parse_arguments() -> argparse.Namespace: 25 | """ 26 | Parse command-line arguments. 27 | """ 28 | desc = 'Publish database results as parquet files.' 29 | epi = """DESCRIPTION: 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument( 33 | 'csv_files', type=str, help="csv files", nargs='+' 34 | ) 35 | parser.add_argument( 36 | '--feature-type', default='GeneFull_Ex50pAS', 37 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto'], 38 | help='Feature type to process' 39 | ) 40 | return parser.parse_args() 41 | 42 | def merge_csv_files(csv_files: List[str], feature_type: str): 43 | """ 44 | Load all CSV files into memory and write them together 45 | """ 46 | outdir = os.path.join("metadata_TMP", feature_type) 47 | os.makedirs(outdir, exist_ok=True) 48 | outfile = os.path.join(outdir, f"{uuid4()}.csv.gz") 49 | 50 | # Load all dataframes into a list 51 | data = [] 52 | for csv_file in csv_files: 53 | logging.info(f"Processing {csv_file}...") 54 | df = pd.read_csv(csv_file) 55 | data.append(df) 56 | 57 | # Concatenate all dataframes and write to file 58 | pd.concat(data, axis=0, ignore_index=True).to_csv(outfile, index=False, compression='gzip') 59 | logging.info(f"Saved merged csv to {outfile}") 60 | 61 | def main(): 62 | """Main function to run the TileDB loader workflow.""" 63 | args = parse_arguments() 64 | 65 | # merge csv files 66 | merge_csv_files(args.csv_files, args.feature_type) 67 | 68 | if __name__ == "__main__": 69 | #from dotenv import load_dotenv 70 | #load_dotenv(override=True) 71 | main() -------------------------------------------------------------------------------- /scripts/gcp-upload/config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = "tmp/work" 22 | process { 23 | errorStrategy = "terminate" 24 | maxRetries = 0 25 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 26 | } 27 | } 28 | slurm { 29 | workDir = getWorkDir() 30 | conda.cacheDir = getCondaCacheDir() 31 | cleanup = true 32 | executor.queueSize = 300 33 | process { 34 | executor = "slurm" 35 | queue = "cpu_batch" 36 | errorStrategy = "retry" 37 | maxRetries = 3 38 | resourceLimits = [ cpus: 24, memory: 900.GB, time: 72.h ] 39 | } 40 | } 41 | dev { 42 | params { 43 | input_dir = "/processed_datasets/scRecount/scRecounter/prod3" 44 | log_dir = "tmp/logs" 45 | output_dir = 
"gs://arc-ctc-nextflow/gcp-loader/output/" 46 | max_datasets = 3 47 | db_name = "sragent-prod" 48 | update_db = false 49 | } 50 | } 51 | report { 52 | report { 53 | enabled = true 54 | overwrite = true 55 | file = "${params.log_dir}/nf-report/${params.timestamp}.html" 56 | } 57 | } 58 | trace { 59 | trace { 60 | enabled = true 61 | overwrite = true 62 | file = "${params.log_dir}/nf-trace/${params.timestamp}.txt" 63 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 64 | } 65 | } 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = "tmp/work" 22 | process { 23 | errorStrategy = "terminate" 24 | maxRetries = 0 25 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 26 | } 27 | } 28 | slurm { 29 | workDir = getWorkDir() 30 | conda.cacheDir = getCondaCacheDir() 31 | cleanup = true 32 | executor.queueSize = 30 33 | process { 34 | executor = "slurm" 35 | queue = "cpu_batch" 36 | errorStrategy = "retry" // "terminate" 37 | maxRetries = 2 38 | resourceLimits = [ cpus: 24, memory: 900.GB, time: 72.h ] 39 | } 40 | } 41 | dev { 42 | params { 43 | input_dir = "/processed_datasets/scRecount/scRecounter/tmp/prod_tmp" 44 | db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod_tmp" 45 | mtx_batch_size = 8 46 | h5ad_batch_size = 4 47 | max_datasets = 50 48 | } 49 | } 50 | report { 51 | report { 52 | enabled = true 53 | overwrite = true 54 | file = "${params.log_dir}/nf-report/${params.timestamp}.html" 55 | } 56 | } 57 | trace { 58 | trace { 59 | enabled = true 60 | overwrite = true 61 | file = "${params.log_dir}/nf-trace/${params.timestamp}.txt" 62 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 63 | } 64 | } 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /bin/subsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import gzip 8 | import argparse 9 | import logging 10 | from time import sleep 11 | from subprocess import Popen, PIPE 12 | 13 | 14 | # logging 15 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Subsample reads' 23 | epi = """DESCRIPTION: 24 | Subsample reads from a fastq file. 25 | Just taking the first N reads from the head of the file. 26 | gzip input fastq files are supported. 
27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('fastq_file', type=str, nargs='+', 31 | help='file(s) to subsample') 32 | parser.add_argument('--num-seqs', type=int, default=100000, 33 | help='Number of sequences to subsample') 34 | parser.add_argument('--out-file', type=str, default='subsampled.fastq', 35 | help='Output file') 36 | 37 | # functions 38 | def subsample(infile: str, num_seqs: int, outF, is_gzip: bool=False) -> None: 39 | # use gzip if file is gzipped 40 | if is_gzip: 41 | _open = gzip.open 42 | else: 43 | _open = open 44 | # subsample 45 | with _open(infile, 'r') as inF: 46 | for ii,line in enumerate(inF, 1): 47 | # decode if gzip 48 | if is_gzip: 49 | line = line.decode('utf-8') 50 | # write 51 | outF.write(line) 52 | if ii / 4 >= num_seqs: 53 | return None 54 | 55 | def main(args): 56 | # divide num_seqs by number of files 57 | num_files = len(args.fastq_file) 58 | num_seqs = int(args.num_seqs / num_files) 59 | 60 | # loop through each file 61 | with open(args.out_file, 'w') as outF: 62 | for i, infile in enumerate(args.fastq_file, 1): 63 | logging.info(f'Processing file {i}/{num_files}: {infile}') 64 | # subsample 65 | try: 66 | subsample(infile, num_seqs, outF) 67 | except UnicodeDecodeError: 68 | subsample(infile, num_seqs, outF, is_gzip=True) 69 | 70 | # status 71 | logging.info(f'Output written to: {args.out_file}') 72 | 73 | ## script main 74 | if __name__ == '__main__': 75 | args = parser.parse_args() 76 | main(args) -------------------------------------------------------------------------------- /scripts/tiledb-loader/README.md: -------------------------------------------------------------------------------- 1 | tiledb loader 2 | ============= 3 | 4 | A simple Nextflow pipeline for efficiently loading single-cell data into a TileDB-SOMA database. 
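
Under the hood, loading an h5ad file into the TileDB-SOMA database follows the standard `tiledbsoma.io` append pattern (register the incoming AnnData against the experiment, resize, then ingest), the same calls used by this repo's loader scripts. A minimal sketch is shown below; the paths and file names are illustrative only, and the `RNA` measurement and `obs_id`/`var_id` field names mirror the repo's loader code rather than defining an official interface (see the workflow outline that follows for where this step fits).

```python
import scanpy as sc
import tiledbsoma
import tiledbsoma.io

db_uri = "/path/to/tiledb-soma"          # existing TileDB-SOMA experiment (illustrative path)
adata = sc.read_h5ad("SRX123456.h5ad")   # h5ad produced by the MTX -> h5ad step (illustrative file)

# register the new AnnData against the experiment
rd = tiledbsoma.io.register_anndatas(
    db_uri, [adata],
    measurement_name="RNA",
    obs_field_name="obs_id",
    var_field_name="var_id",
)

# grow the experiment to fit the new obs/var, then ingest
with tiledbsoma.Experiment.open(db_uri) as exp:
    tiledbsoma.io.resize_experiment(exp.uri, nobs=rd.get_obs_shape(), nvars=rd.get_var_shapes())
tiledbsoma.io.from_anndata(db_uri, adata, measurement_name="RNA", registration_mapping=rd)
```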
5 | 6 | Workflow: 7 | * Find new datasets (SRX accessions) 8 | * For each batch of datasets: 9 | * Convert MTX to h5ad 10 | * Load h5ad into TileDB-SOMA database 11 | 12 | 13 | 14 | # Dev 15 | 16 | Local run 17 | 18 | ```bash 19 | nextflow run main.nf -profile conda,vm,dev -resume 20 | ``` 21 | 22 | Slurm run 23 | 24 | ```bash 25 | nextflow run main.nf -profile conda,slurm,dev -resume 26 | ``` 27 | 28 | ## Test prod 29 | 30 | ```bash 31 | nextflow run main.nf -profile conda,vm \ 32 | --max_datasets 8 \ 33 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp \ 34 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 35 | ``` 36 | 37 | ```bash 38 | nextflow run main.nf -profile conda,slurm \ 39 | --max_datasets 8 \ 40 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp \ 41 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 42 | ``` 43 | 44 | ## Test scale 45 | 46 | ```bash 47 | rm -rf /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp 48 | ``` 49 | 50 | ```bash 51 | nextflow run main.nf -profile conda,slurm,report,trace \ 52 | --max_datasets 100 \ 53 | --mtx_batch_size 4 \ 54 | --h5ad_batch_size 4 \ 55 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_4-4 \ 56 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 57 | ``` 58 | 59 | Time: 10m 15s 60 | 61 | ```bash 62 | nextflow run main.nf -profile conda,slurm,report,trace \ 63 | --max_datasets 100 \ 64 | --mtx_batch_size 20 \ 65 | --h5ad_batch_size 4 \ 66 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_20-4 \ 67 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 68 | ``` 69 | 70 | Time: 9m 55s (from-memory) 71 | Time: 11m (from-disk) 72 | 73 | ```bash 74 | nextflow run main.nf -profile conda,slurm,report,trace \ 75 | --max_datasets 100 \ 76 | --mtx_batch_size 50 \ 77 | --h5ad_batch_size 4 \ 78 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_50-4 \ 79 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 80 | ``` 81 | 82 | Time: 10m 25s 83 | 84 | 85 | ```bash 86 | nextflow run main.nf -profile conda,slurm,report,trace \ 87 | --max_datasets 100 \ 88 | --mtx_batch_size 25 \ 89 | --h5ad_batch_size 2 \ 90 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_25-2 \ 91 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 92 | ``` 93 | 94 | Time: `TODO` 95 | 96 | 97 | # Backups 98 | 99 | ```console 100 | ~/tmp/tiledb/db_bkup 101 | /large_storage/multiomics/projects/tiledb_bkup 102 | ``` -------------------------------------------------------------------------------- /scripts/gcp-upload/bin/db-to-parquet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | from uuid import uuid4 8 | from pathlib import Path 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | from pypika import Query, Table, Criterion 13 | ## package 14 | from db_utils import db_connect 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # classes 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # functions 24 | def parse_arguments() -> argparse.Namespace: 25 | """ 26 | Parse command-line arguments. 
27 | """ 28 | desc = 'Publish database results as parquet files.' 29 | epi = """DESCRIPTION: 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument( 33 | '--feature-type', default='GeneFull_Ex50pAS', 34 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto'], 35 | help='Feature type to process' 36 | ) 37 | return parser.parse_args() 38 | 39 | 40 | def load_scbasecamp_metadata(feature_type: str) -> pd.DataFrame: 41 | """ 42 | Load metadata from scBasecamp database. 43 | Args: 44 | feature_type: Feature type to filter on. 45 | Returns: 46 | DataFrame with metadata. 47 | """ 48 | logging.info("Obtaining scbasecamp metadata...") 49 | 50 | # get metadata from scRecounter postgresql database 51 | srx_metadata = Table("scbasecamp_metadata") 52 | stmt = ( 53 | Query 54 | .from_(srx_metadata) 55 | .select("*") 56 | .where(srx_metadata.feature_type == feature_type) 57 | ) 58 | with db_connect() as conn: 59 | metadata = pd.read_sql(str(stmt), conn) 60 | return metadata.drop(columns=['created_at', 'updated_at']) 61 | 62 | def main(): 63 | """Main function to run the TileDB loader workflow.""" 64 | args = parse_arguments() 65 | 66 | # Load metadata 67 | metadata = load_scbasecamp_metadata(feature_type=args.feature_type) 68 | 69 | ## split by organism and save to parquet 70 | for organism, df in metadata.groupby('organism'): 71 | logging.info(f"Processing metadata for {organism}...") 72 | organism_str = organism.replace(" ", "_") 73 | # create directory 74 | out_dir = Path("metadata") / Path(args.feature_type) / Path(organism_str) 75 | out_dir.mkdir(parents=True, exist_ok=True) 76 | # write to parquet 77 | outfile = out_dir / 'sample_metadata.parquet.gz' 78 | df.to_parquet(outfile, index=False, compression='gzip') 79 | logging.info(f"Saved metadata for {organism} to {outfile}") 80 | 81 | if __name__ == "__main__": 82 | main() -------------------------------------------------------------------------------- /data/accessions_all-org.csv: -------------------------------------------------------------------------------- 1 | sample,accession,entrez_id,organism 2 | SRX20288331,SRR24503416,27709479,Anopheles_gambiae 3 | SRX19498702,SRR23613944,26779668,Arabidopsis_thaliana 4 | SRX19992927,SRR24196182,26767425,Bos_taurus 5 | SRX27335695,SRR31980676,36879050,Caenorhabditis_elegans 6 | SRX23995681,SRR28390731,32301733,Callithrix_jacchus 7 | SRX24164706,SRR28565449,32475612,Canis_lupus_familiaris 8 | SRX26737879,SRR31364130,36125471,Danio_rerio 9 | ERX5671941,ERR6032665,15014737,Drosophila_melanogaster 10 | SRX26348969,SRR30946435,35575331,Equus_caballus 11 | SRX21269173,SRR25539693,28706133,Gallus_gallus 12 | ERX6700420,ERR7133201,17777602,Gorilla_gorilla 13 | ERX10138362,ERR10669350,30605311,Heterocephalus_glaber 14 | ERX9511946,ERR9970924,24063361,Oryctolagus_cuniculus 15 | SRX22683047,SRR26990030,30698812,Oryza_sativa 16 | SRX16872040,SRR20852785,23639073,Ovis_aries 17 | ERX3512851,ERR3491595,9149889,Pan_troglodytes 18 | SRX26473837,SRR31090734,34046074,Rattus_norvegicus 19 | ERX4639423,ERR4769579,12367094,Schistosoma_mansoni 20 | SRX15090985,SRR19019104,21568481,Solanum_lycopersicum 21 | SRX23732262,SRR28085873,32021971,Sus_scrofa 22 | ERX5927607,ERR6295317,15346129,Xenopus_tropicalis 23 | SRX19052019,SRR23099821,26222706,Zea_mays 24 | SRX19498738,SRR23614060,26779704,Arabidopsis_thaliana 25 | SRX17163012,SRR21151334,23968863,Bos_taurus 26 | SRX27335698,SRR31980670,36879053,Caenorhabditis_elegans 27 
| ERX9648260,ERR10111034,29618243,Callithrix_jacchus 28 | SRX17820910,SRR21831760,24747069,Canis_lupus_familiaris 29 | SRX22195232,SRR26491369,30150470,Danio_rerio 30 | SRX15014603,SRR18937045,21480053,Drosophila_melanogaster 31 | SRX26348970,SRR30946434,35575332,Equus_caballus 32 | SRX22821312,SRR27139512,30863257,Gallus_gallus 33 | ERX6700421,ERR7133209,17777603,Gorilla_gorilla 34 | ERX10213964,ERR10763189,30604755,Heterocephalus_glaber 35 | ERX9511931,ERR9970495,24063321,Oryctolagus_cuniculus 36 | SRX22985906,SRR27308612,31059128,Oryza_sativa 37 | SRX19482704,SRR23597689,26761938,Ovis_aries 38 | SRX18295180,SRR22321952,25305686,Pan_troglodytes 39 | SRX21325315,SRR25597644,28762465,Rattus_norvegicus 40 | ERX11749473,ERR12372788,31020827,Schistosoma_mansoni 41 | SRX15090986,SRR19019100,21568482,Solanum_lycopersicum 42 | SRX22722562,SRR27030753,30756300,Sus_scrofa 43 | SRX19052018,SRR23099822,26222705,Zea_mays 44 | SRX16110579,SRR20072545,22798499,Arabidopsis_thaliana 45 | SRX16110572,SRR20072552,22798492,Arabidopsis_thaliana 46 | SRX19885394,SRR24084458,27255941,Caenorhabditis_elegans 47 | SRX20684355,SRR24923839,28114027,Caenorhabditis_elegans 48 | SRX24172447,SRR28572865,32483421,Drosophila_melanogaster 49 | ERX5671946,ERR6032679,15014742,Drosophila_melanogaster 50 | SRX23498637,SRR27835303,31746997,Equus_caballus 51 | SRX23498644,SRR27835295,31747004,Equus_caballus 52 | SRX21646415,SRR25926818,29301782,Gallus_gallus 53 | SRX20765161,SRR25009824,28195890,Gallus_gallus 54 | ERX3512833,ERR3491577,9149737,Pan_troglodytes 55 | ERX3512988,ERR3491732,9149904,Pan_troglodytes -------------------------------------------------------------------------------- /bin/upload-final-star-params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | from db_utils import db_connect, db_upsert 12 | 13 | # logging 14 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 15 | 16 | # argparse 17 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 18 | argparse.RawDescriptionHelpFormatter): 19 | pass 20 | 21 | desc = 'Upload final STAR parameters to the scRecounter database' 22 | epi = """DESCRIPTION: 23 | Upload final STAR parameters to the scRecounter database. 
24 | """ 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 26 | formatter_class=CustomFormatter) 27 | parser.add_argument('--sample', type=str, default=None, 28 | help='Sample name') 29 | parser.add_argument('--barcodes', type=str, default=None, 30 | help='Barcodes file path') 31 | parser.add_argument('--star-index', type=str, default=None, 32 | help='STAR index path') 33 | parser.add_argument('--cell-barcode-length', type=int, default=None, 34 | help='Cell barcode length') 35 | parser.add_argument('--umi-length', type=int, default=None, 36 | help='UMI length') 37 | parser.add_argument('--strand', type=str, default=None, 38 | help='Strandness') 39 | parser.add_argument('--outfile', type=str, default="star_params.csv", 40 | help='Output file path') 41 | 42 | # functions 43 | def main(args): 44 | # set pandas display optionqs 45 | pd.set_option('display.max_columns', 30) 46 | pd.set_option('display.width', 300) 47 | 48 | # create dataframe 49 | df = pd.DataFrame({ 50 | 'sample': [args.sample], 51 | 'barcodes': [os.path.basename(args.barcodes)], 52 | 'star_index': [os.path.basename(args.star_index.rstrip("/"))], 53 | 'cell_barcode_length': [args.cell_barcode_length], 54 | 'umi_length': [args.umi_length], 55 | 'strand': [args.strand] 56 | }) 57 | 58 | # write to file 59 | if os.path.exists(args.outfile): 60 | os.remove(args.outfile) 61 | df.to_csv(args.outfile, index=False) 62 | 63 | # upload to the scRecounter database 64 | with db_connect() as conn: 65 | db_upsert(df, "screcounter_star_params", conn) 66 | 67 | # update screcounter log 68 | log_df = pd.DataFrame({ 69 | "sample": [args.sample], 70 | "accession": [""], 71 | "process": ["STAR save params"], 72 | "step": ["Final"], 73 | "status": ["Success"], 74 | "message": ["STAR final parameters saved to database"], 75 | }) 76 | with db_connect() as conn: 77 | db_upsert(log_df, "screcounter_log", conn) 78 | 79 | 80 | ## script main 81 | if __name__ == '__main__': 82 | args = parser.parse_args() 83 | main(args) -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | accessions = "" // CSV of accessions to download 5 | barcodes = "data/barcodes.csv" // CSV listing barcode files 6 | star_indices = "data/star_indices.csv" // CSV listing STAR indices 7 | output_dir = "results" // Output directory location 8 | max_samples = 3 // Max number of samples to process, if no accessions are provided 9 | max_accessions = 1 // Max number of accessions per sample to use for STAR parameter determination 10 | max_spots = 1000000 // Max number of spots (read-pairs) for STAR param assessment 11 | fallback_max_spots = 200000000 // Max number of spots (read-pairs) if fasterq-dump fails 12 | min_read_len = 26 // Minimum read length for R1 & R2 (shorter read files will be ignored) 13 | max_sra_size = 300 // Max SRA file size in GB (determined via sra-stat); all larger will be filtered 14 | organisms = "human,mouse" // Organisms to process if pulling from the scRecounter SQL database 15 | define = false // Just define the STAR parameters for each sample 16 | fasterq_tmp = "TEMP" // Temporary directory for fasterq-dump 17 | db_host = "35.243.133.29" // scRecounter SQL database host (GCP_SQL_DB_HOST) 18 | db_name = "sragent-prod" // scRecounter SQL database name (GCP_SQL_DB_NAME) 19 | db_username = "postgres" // scRecounter SQL database username (GCP_SQL_DB_USERNAME) 20 | 
} 21 | 22 | 23 | //-- Extra configs --// 24 | includeConfig "config/process.config" 25 | includeConfig "config/profiles.config" 26 | 27 | //-- Functions --// 28 | // Remove trailing forward slashes in a string 29 | def fmtPath(path_str) { 30 | return path_str.replaceAll(/\/+$/, '') 31 | } 32 | 33 | // Limit to the max resources of the available machine 34 | def check_max(obj, type){ 35 | if(type == 'memory'){ 36 | if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1){ 37 | return params.max_memory as nextflow.util.MemoryUnit 38 | } 39 | } else if(type == 'time'){ 40 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1){ 41 | return params.max_time as nextflow.util.Duration 42 | } 43 | } else if(type == 'cpus'){ 44 | if (obj > params.max_cpus as int){ 45 | return params.max_cpus as int 46 | } 47 | } 48 | return obj 49 | } 50 | 51 | def getWorkDir() { 52 | def userGroup = "id -gn".execute().text.trim() 53 | def userName = "whoami".execute().text.trim() 54 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/scRecounter" 55 | return workDir 56 | } 57 | 58 | def getCondaCacheDir() { 59 | def userName = "whoami".execute().text.trim() 60 | cacheDir = "/home/$userName/nextflow/conda-cache/scRecounter" 61 | return cacheDir 62 | } -------------------------------------------------------------------------------- /scripts/gcp-upload/README.md: -------------------------------------------------------------------------------- 1 | gcp-loader 2 | ========== 3 | 4 | A simple Nextflow pipeline for efficiently loading single-cell data as h5ad files onto GCP 5 | 6 | 7 | # Dev 8 | 9 | Local run 10 | 11 | ```bash 12 | nextflow run main.nf -profile conda,vm,dev --feature_type GeneFull -resume 13 | ``` 14 | 15 | Slurm run 16 | 17 | ```bash 18 | nextflow run main.nf -profile conda,slurm,dev -resume 19 | ``` 20 | 21 | ## prod 22 | 23 | ### GeneFull_Ex50pAS 24 | 25 | ### CZI 26 | 27 | ```bash 28 | nextflow run main.nf \ 29 | -profile conda,slurm \ 30 | --feature_type GeneFull_Ex50pAS \ 31 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 32 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 33 | ``` 34 | 35 | ### SRA 36 | 37 | ```bash 38 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull_Ex50pAS 39 | ``` 40 | 41 | ### Clean up 42 | 43 | ```bash 44 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 45 | ``` 46 | 47 | ### Velocyto 48 | 49 | ### CZI 50 | 51 | ```bash 52 | nextflow run main.nf \ 53 | -profile conda,slurm \ 54 | --feature_type Velocyto \ 55 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 56 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 57 | ``` 58 | 59 | ### SRA 60 | 61 | ```bash 62 | nextflow run main.nf -profile conda,slurm --feature_type Velocyto 63 | ``` 64 | 65 | **HERE** 66 | 67 | ### Clean up 68 | 69 | ```bash 70 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 71 | ``` 72 | 73 | ### Gene ==> REDO 74 | 75 | ### CZI 76 | 77 | ```bash 78 | nextflow run main.nf \ 79 | -profile conda,slurm \ 80 | --feature_type Gene \ 81 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 82 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 83 | ``` 84 | 85 | ### SRA 86 | 87 | ```bash 88 | nextflow run main.nf -profile conda,slurm --feature_type Gene 89 | ``` 90 | 91 | ### Clean up 92 | 93 | ```bash 94 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 95 | ``` 96 | 97 | ### GeneFull 98 | 99 | ### CZI 100 | 101 | ```bash 102 | 
nextflow run main.nf \ 103 | -profile conda,slurm \ 104 | --feature_type GeneFull \ 105 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 106 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 107 | ``` 108 | 109 | ### SRA 110 | 111 | ```bash 112 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull 113 | ``` 114 | 115 | ### Clean up 116 | 117 | ```bash 118 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 119 | ``` 120 | 121 | ### GeneFull_ExonOverIntron 122 | 123 | ### CZI 124 | 125 | ```bash 126 | nextflow run main.nf \ 127 | -profile conda,slurm \ 128 | --feature_type GeneFull_ExonOverIntron \ 129 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 130 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 131 | ``` 132 | 133 | ### SRA 134 | 135 | ```bash 136 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull_ExonOverIntron 137 | ``` 138 | 139 | ### Clean up 140 | 141 | ```bash 142 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 143 | ``` 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-run 2 | ================ 3 | 4 | The container for running the pipeline on GCP Cloud Run Jobs. 5 | 6 | ## Setup 7 | 8 | Env vars 9 | 10 | ```bash 11 | IMG_NAME=sc-recounter-run 12 | IMG_VERSION=0.1.9 13 | REGION="us-east1" 14 | GCP_PROJECT_ID="c-tc-429521" 15 | SERVICE_ACCOUNT_EMAIL="nick-nextflow@c-tc-429521.iam.gserviceaccount.com" 16 | SERVICE_ACCOUNT_JSON="c-tc-429521-6f6f5b8ccd93.json" 17 | ``` 18 | 19 | ### Docker 20 | 21 | Build 22 | 23 | > from the base directory of the repository 24 | 25 | ```bash 26 | docker build \ 27 | --file docker/${IMG_NAME}/Dockerfile \ 28 | --build-arg CONDA_ENV_YAML=docker/${IMG_NAME}/environment.yml \ 29 | --platform linux/amd64 \ 30 | --tag ${IMG_NAME}:${IMG_VERSION} \ 31 | . 
32 | ``` 33 | 34 | Run the image (`-help`) 35 | 36 | ```bash 37 | docker run -it --rm \ 38 | -u $(id -u):$(id -g) \ 39 | -v ${PWD}:/data \ 40 | -v ${HOME}/.gcp/:/.gcp \ 41 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 42 | --platform linux/amd64 \ 43 | ${IMG_NAME}:${IMG_VERSION} \ 44 | -help 45 | ``` 46 | 47 | Run the image (`-profile`) 48 | 49 | ```bash 50 | docker run -it --rm \ 51 | -u $(id -u):$(id -g) \ 52 | -v ${PWD}:/data \ 53 | -v ${HOME}/.gcp/:/.gcp \ 54 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 55 | --platform linux/amd64 \ 56 | ${IMG_NAME}:${IMG_VERSION} \ 57 | -profile docker,gcp,gcp_dev,dev,no_acc_dev 58 | ``` 59 | 60 | Run with bash entrypoint 61 | 62 | ```bash 63 | docker run -it --rm \ 64 | -u $(id -u):$(id -g) \ 65 | -v ${PWD}:/data \ 66 | -v ${HOME}/.gcp/:/.gcp \ 67 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 68 | --entrypoint /bin/bash \ 69 | --platform linux/amd64 \ 70 | ${IMG_NAME}:${IMG_VERSION} 71 | ``` 72 | 73 | ### GCP Artifact Registry 74 | 75 | Create (if needed) 76 | 77 | ```bash 78 | DESCRIPTION="Run the scRecounter nextflow pipeline" 79 | gcloud artifacts repositories create ${IMG_NAME} \ 80 | --repository-format=docker \ 81 | --project=${GCP_PROJECT_ID} \ 82 | --location=${REGION} \ 83 | --description="${DESCRIPTION}" \ 84 | --async 85 | ``` 86 | 87 | Push 88 | 89 | ```bash 90 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 91 | ${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 92 | && docker push ${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 93 | ``` 94 | 95 | ### GCP Cloud Run Jobs 96 | 97 | Create/update the job 98 | 99 | ```bash 100 | JOB_NAME="${IMG_NAME}" 101 | gcloud beta run jobs update ${JOB_NAME} \ 102 | --service-account=${SERVICE_ACCOUNT_EMAIL} \ 103 | --project=${GCP_PROJECT_ID} \ 104 | --region=${REGION} \ 105 | --image=${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 106 | --set-env-vars=TZ=America/Los_Angeles \ 107 | --cpu=2 \ 108 | --memory=2Gi \ 109 | --task-timeout=4320m \ 110 | --max-retries=0 \ 111 | --args="docker","gcp","report","trace" 112 | ``` 113 | 114 | Non-human/mouse genomes 115 | 116 | ```bash 117 | JOB_NAME="${IMG_NAME}-all-org" 118 | gcloud beta run jobs update ${JOB_NAME} \ 119 | --service-account=${SERVICE_ACCOUNT_EMAIL} \ 120 | --project=${GCP_PROJECT_ID} \ 121 | --region=${REGION} \ 122 | --image=${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 123 | --set-env-vars=TZ=America/Los_Angeles \ 124 | --cpu=2 \ 125 | --memory=2Gi \ 126 | --task-timeout=4320m \ 127 | --max-retries=0 \ 128 | --args="docker","gcp","all_org","report","trace" 129 | ``` 130 | 131 | -------------------------------------------------------------------------------- /bin/format-star-params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | 12 | # logging 13 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 14 | 15 | # argparse 16 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 17 | argparse.RawDescriptionHelpFormatter): 18 | pass 19 | 20 | desc = 'Set STAR parameters for each sample.' 
21 | epi = """DESCRIPTION: 22 | The script reads the STAR summary CSV file and determines 23 | the STAR parameters, based on the number of valid barcodes 24 | for each parameter set among the test STAR runs. 25 | """ 26 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 27 | formatter_class=CustomFormatter) 28 | parser.add_argument('star_summary_csv', type=str, 29 | help='Path to the STAR summary CSV file') 30 | parser.add_argument('--sample', type=str, default=None, 31 | help='Sample name') 32 | parser.add_argument('--accession', type=str, default=None, 33 | help='Accession name') 34 | parser.add_argument('--strand', type=str, default=None, 35 | help='Strandness') 36 | parser.add_argument('--barcodes-name', type=str, default=None, 37 | help='Barcodes name') 38 | parser.add_argument('--barcodes-file', type=str, default=None, 39 | help='Barcodes file path') 40 | parser.add_argument('--cell-barcode-length', type=int, default=None, 41 | help='Cell barcode length') 42 | parser.add_argument('--umi-length', type=int, default=None, 43 | help='UMI length') 44 | parser.add_argument('--organism', type=str, default=None, 45 | help='Organism') 46 | parser.add_argument('--star-index', type=str, default=None, 47 | help='STAR index path') 48 | parser.add_argument('--outfile', type=str, default="star_params.csv", 49 | help='Output file path') 50 | 51 | # functions 52 | def main(args): 53 | # set pandas display optionqs 54 | pd.set_option('display.max_columns', 30) 55 | pd.set_option('display.width', 300) 56 | 57 | # create param table 58 | star_params = { 59 | "sample" : args.sample, 60 | "accession" : args.accession, 61 | "strand" : args.strand, 62 | "barcodes_name" : args.barcodes_name, 63 | "barcodes_file" : args.barcodes_file, 64 | "cell_barcode_length" : args.cell_barcode_length, 65 | "umi_length" : args.umi_length, 66 | "organism" : args.organism, 67 | "star_index" : args.star_index 68 | } 69 | # convert to dataframe 70 | star_params = pd.DataFrame([star_params]) 71 | 72 | # print to stderr 73 | #print("#-- raw dataframe --#", file=sys.stderr) 74 | #star_params.to_csv(sys.stderr, index=False) 75 | 76 | # read star summary 77 | star_summary = pd.read_csv(args.star_summary_csv, header=None) 78 | star_summary["sample"] = args.sample 79 | star_summary["accession"] = args.accession 80 | # pivot 81 | star_summary = star_summary.pivot(index=["sample", "accession"], columns=0, values=1) 82 | star_summary["sample"] = star_summary.index.get_level_values('sample') 83 | star_summary["accession"] = star_summary.index.get_level_values('accession') 84 | star_summary.reset_index(drop=True, inplace=True) 85 | 86 | # merge dataframes on sample and accession 87 | star_params = star_params.merge(star_summary, on=["sample", "accession"], how="inner") 88 | 89 | # print to stderr 90 | #print("#-- final dataframe --#", file=sys.stderr) 91 | #star_params.to_csv(sys.stderr, index=False) 92 | 93 | # write to file 94 | star_params.to_csv(args.outfile, index=False) 95 | 96 | 97 | ## script main 98 | if __name__ == '__main__': 99 | args = parser.parse_args() 100 | main(args) -------------------------------------------------------------------------------- /bin/star-summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | from typing import List, Dict, Any, Tuple 11 | import pandas as pd 12 | from 
db_utils import db_connect, db_upsert 13 | 14 | # logging 15 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Summarize STAR summary files' 23 | epi = """DESCRIPTION: 24 | Summarize STAR summary files into a single table. The summary files 25 | are generated by the STAR aligner and contain information about the 26 | alignment statistics. The script reads in all summary files and 27 | concatenates them into a single table. The table is then written to 28 | a file and upserted into the database. 29 | """ 30 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 31 | formatter_class=CustomFormatter) 32 | parser.add_argument('summary_csv', type=str, nargs='+', 33 | help='STAR summary csv file(s)') 34 | parser.add_argument('--sample', type=str, default="", 35 | help='Sample name') 36 | parser.add_argument('--outfile', type=str, default="Summary.csv", 37 | help='Output file') 38 | 39 | # functions 40 | def main(args): 41 | # set pandas display optionqs 42 | pd.set_option('display.max_columns', 50) 43 | pd.set_option('display.max_rows', 100) 44 | pd.set_option('display.width', 300) 45 | 46 | # read in all summary csv files and concatenate 47 | df = [] 48 | regex = re.compile(r"_summary.csv$") 49 | for infile in args.summary_csv: 50 | x = pd.read_csv(infile, header=None) 51 | x.columns = ["category", "value"] 52 | x["feature"] = regex.sub("", os.path.basename(infile)) 53 | df.append(x) 54 | df = pd.concat(df) 55 | 56 | # status 57 | logging.info(f"Number of rows in the raw table: {df.shape[0]}") 58 | 59 | # format category 60 | for x in ["Gene", "GeneFull", "GeneFull_Ex50pAS", "GeneFull_ExonOverIntron", "Velocyto"]: 61 | regex = re.compile(f" {x} ") 62 | df["category"] = df["category"].apply(lambda x: regex.sub(" feature ", x)) 63 | 64 | # pivot table 65 | df = df.pivot(index='feature', columns='category', values='value').reset_index() 66 | 67 | # format columns: no spaces and lowercase 68 | df.columns = df.columns.str.replace(r'\W', '_', regex=True).str.lower() 69 | 70 | # coerce columns to numeric 71 | for col in df.columns.to_list(): 72 | if col != "feature": 73 | df[col] = pd.to_numeric(df[col], errors='coerce') 74 | 75 | # float columns to integer 76 | cols_to_convert = ["estimated_number_of_cells", "number_of_reads", "umis_in_cells"] 77 | for col in cols_to_convert: 78 | if col in df.columns: 79 | df[col] = df[col].fillna(0).replace([float('inf'), -float('inf')], 0).astype(int) 80 | 81 | # add sample name 82 | df["sample"] = args.sample 83 | 84 | # status 85 | logging.info(f"Number of rows after formattings: {df.shape[0]}") 86 | 87 | # upsert results to database 88 | logging.info("Updating screcounter_star_results...") 89 | with db_connect() as conn: 90 | db_upsert(df, "screcounter_star_results", conn) 91 | 92 | # write output table 93 | outdir = os.path.dirname(args.outfile) 94 | if outdir != "": 95 | os.makedirs(outdir, exist_ok=True) 96 | df.to_csv(args.outfile, index=False) 97 | 98 | # update screcounter log 99 | logging.info("Updating screcounter_log...") 100 | log_df = pd.DataFrame({ 101 | "sample": [args.sample], 102 | "accession": [""], 103 | "process": ["STAR-full"], 104 | "step": ["Final"], 105 | "status": ["Success"], 106 | "message": ["STAR summary table generated"] 107 | }) 108 | with db_connect() as conn: 109 | db_upsert(log_df, "screcounter_log", conn) 110 | 111 | 112 | ## script 
main 113 | if __name__ == '__main__': 114 | args = parser.parse_args() 115 | main(args) 116 | 117 | -------------------------------------------------------------------------------- /scripts/gcp-find-soft-delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | from datetime import datetime 7 | import pandas as pd 8 | from google.cloud import storage 9 | 10 | 11 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 12 | pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | """ 16 | Parse command-line arguments. 17 | Returns: 18 | argparse.Namespace containing arguments. 19 | """ 20 | desc = 'List all files in a bucket that are designated as soft-delete' 21 | epi = """DESCRIPTION: 22 | 23 | """ 24 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 25 | parser.add_argument('gcs_bucket', type=str, 26 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/)') 27 | # parser.add_argument('--min-date-time', type=str, default='2025-01-13_00-00-00', 28 | # help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 29 | # parser.add_argument('--max-date-time', type=str, default='2025-01-15_00-00-00', 30 | # help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 31 | return parser.parse_args() 32 | 33 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 34 | """ 35 | Parse a GCP bucket path. 36 | Args: 37 | gs_path: GCP bucket path starting with gs:// 38 | Returns: 39 | A tuple of (bucket_name, prefix). 40 | """ 41 | if not gs_path.startswith("gs://"): 42 | raise ValueError("Path must start with 'gs://'") 43 | parts = gs_path[5:].split("/", 1) 44 | bucket_name = parts[0] 45 | prefix = parts[1] if len(parts) > 1 else "" 46 | return bucket_name, prefix.rstrip("/") + "/" 47 | 48 | def list_soft_deleted_files(bucket: storage.Bucket) -> List[Dict[str, str]]: 49 | """ 50 | List all files in a GCP bucket that are designated as soft-deleted 51 | Args: 52 | bucket: A GCP bucket object. 53 | Returns: 54 | A list of dictionaries containing the name and generation of soft-deleted files. 55 | """ 56 | # List all blobs in the bucket 57 | blobs = bucket.list_blobs(versions=True) 58 | 59 | # Dictionary to track the latest generation of each object 60 | latest_generations = {} 61 | 62 | # First pass: determine the latest generation for each object 63 | print("First pass: determine the latest generation for each object", file=sys.stderr) 64 | for blob in blobs: 65 | if blob.name not in latest_generations: 66 | latest_generations[blob.name] = blob.generation 67 | else: 68 | latest_generations[blob.name] = max(latest_generations[blob.name], blob.generation) 69 | 70 | ## status 71 | print(f"Num blobs: {len(latest_generations)}", file=sys.stderr) 72 | 73 | # Second pass: collect non-current versions 74 | print("Second pass: collect non-current versions", file=sys.stderr) 75 | soft_deleted_files = [] 76 | blobs = bucket.list_blobs(versions=True) 77 | for blob in blobs: 78 | try: 79 | if blob.generation < latest_generations[blob.name]: 80 | soft_deleted_files.append({"name": blob.name, "generation": blob.generation}) 81 | except KeyError: 82 | pass 83 | 84 | return soft_deleted_files 85 | 86 | def main(args: argparse.Namespace) -> None: 87 | """ 88 | Main function that: 89 | 1. 
Lists all files in a GCP bucket that are designated as soft-deleted 90 | 91 | Args: 92 | args: An argparse.Namespace holding command-line arguments. 93 | """ 94 | # Format arg date/time strings 95 | #min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 96 | #max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 97 | 98 | # Parse GCP bucket path 99 | bucket_name, path_prefix = parse_gs_path(args.gcs_bucket) 100 | 101 | # Initialize GCP client and bucket 102 | client = storage.Client() 103 | bucket = client.bucket(bucket_name) 104 | 105 | # list soft-deleted files 106 | soft_del_files = list_soft_deleted_files(bucket) 107 | print(soft_del_files) 108 | 109 | 110 | if __name__ == "__main__": 111 | from dotenv import load_dotenv 112 | load_dotenv() 113 | args = parse_args() 114 | main(args) -------------------------------------------------------------------------------- /bin/sra-stat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from time import sleep 10 | from shutil import which 11 | from typing import Dict, List 12 | from subprocess import Popen, PIPE 13 | import xml.etree.ElementTree as ET 14 | import pandas as pd 15 | 16 | # logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # argparse 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 21 | argparse.RawDescriptionHelpFormatter): 22 | pass 23 | 24 | desc = 'Run sra-tools sra-stat' 25 | epi = """DESCRIPTION: 26 | Run sra-tools sra-stat with handling of errors and formatting of the output 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('accession', type=str, help='SRA accession') 31 | parser.add_argument('--tries', type=int, default=5, 32 | help='Number of tries to download') 33 | parser.add_argument('--outfile', type=str, default='sra-stat.csv', 34 | help='Output file') 35 | 36 | # functions 37 | def run_cmd(cmd: str) -> tuple: 38 | """ 39 | Run sub-command and return returncode, output, and error. 40 | Args: 41 | cmd: Command to run 42 | Returns: 43 | tuple: (returncode, output, error) 44 | """ 45 | logging.info(f'Running: {cmd}') 46 | p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) 47 | output, err = p.communicate() 48 | return p.returncode, output, err 49 | 50 | def run_sra_stat(accession: str, tries: int=5) -> pd.DataFrame: 51 | """ 52 | Run prefetch with error handling. 53 | Args: 54 | accession: SRA accession 55 | tries: Number of tries 56 | Returns: 57 | 58 | """ 59 | cmd = f'sra-stat --xml --quick {accession}' 60 | for i in range(tries): 61 | logging.info(f'Attempt: {i+1}/{tries}') 62 | rc,output,err = run_cmd(cmd) 63 | if rc == 0: 64 | return output 65 | else: 66 | logging.error('Download failed') 67 | logging.error(err) 68 | # sleep prior to next attempt 69 | sleep_time = 10 * (i + 1) 70 | logging.info(f'Sleeping for {sleep_time} seconds...') 71 | sleep(sleep_time) 72 | return None 73 | 74 | def parse_sra_stats(xml_string: str) -> Dict: 75 | """Parse SRA statistics XML and return key metrics. 
76 | 77 | Args: 78 | xml_string: XML string containing SRA run statistics 79 | 80 | Returns: 81 | Dictionary containing parsed statistics 82 | """ 83 | # Parse XML string 84 | root = ET.fromstring(xml_string) 85 | 86 | # Get run-level attributes 87 | stats = { 88 | 'accession': root.get('accession'), 89 | 'spot_count': int(root.get('spot_count')), 90 | 'base_count': int(root.get('base_count')) 91 | } 92 | 93 | # Get file size 94 | size_elem = root.find('Size') 95 | if size_elem is not None: 96 | file_size = int(size_elem.get('value')) 97 | file_size_units = size_elem.get('units') 98 | if file_size and file_size_units: 99 | # convert to Gb 100 | if file_size_units == 'bytes': 101 | file_size = file_size / 1e9 102 | elif file_size_units == 'kilobytes': 103 | file_size = file_size / 1e6 104 | elif file_size_units == 'megabytes': 105 | file_size = file_size / 1e3 106 | stats['file_size_gb'] = file_size 107 | else: 108 | stats['file_size_gb'] = 10 # default size (Gb), if no found 109 | 110 | # convert to pandas dataframe 111 | return pd.DataFrame(stats, index=[0]) 112 | 113 | def main(args): 114 | # check for prefetch in path 115 | for exe in ['sra-stat']: 116 | if not which(exe): 117 | logging.error(f'{exe} not found in PATH') 118 | sys.exit(1) 119 | 120 | # run sra-state 121 | data = run_sra_stat(args.accession, args.tries) 122 | if not data: 123 | logging.error('sra-stat failed') 124 | sys.exit(1) 125 | 126 | # parse sra-stat output 127 | stats = parse_sra_stats(data) 128 | 129 | # write to file 130 | stats.to_csv(args.outfile, index=False) 131 | logging.info(f'Output written to: {args.outfile}') 132 | 133 | ## script main 134 | if __name__ == '__main__': 135 | args = parser.parse_args() 136 | main(args) -------------------------------------------------------------------------------- /lib/star_params.groovy: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonSlurper 2 | 3 | def expandStarParams(ch_fastq, ch_star_params_json) { 4 | def processedSamples = [] 5 | 6 | // read the JSON file with the STAR parameters and join with the fastq channel 7 | ch_params = ch_fastq.join(ch_star_params_json, by: [0,1]) 8 | .map{ sample, accession, metadata, read1, read2, json_file -> 9 | processedSamples << [sample, accession] 10 | def params = new JsonSlurper().parseText(json_file.text) 11 | def barcodes_file = params.barcodes_file 12 | def star_index = params.star_index 13 | def cell_barcode_length = params.cell_barcode_length 14 | def umi_length = params.umi_length 15 | def strand = params.strand 16 | return [sample, accession, 17 | barcodes_file, star_index, 18 | cell_barcode_length, umi_length, strand] 19 | } 20 | 21 | // status on number of parameter combinations 22 | ch_params.ifEmpty{ 23 | println "WARNING: No valid parameter set found for the following samples:" 24 | processedSamples.each { sampleInfo -> 25 | println "- Sample: ${sampleInfo[0]}, Accession: ${sampleInfo[1]}" 26 | } 27 | } 28 | return ch_params 29 | } 30 | 31 | def makeParamSets(ch_subsample, ch_barcodes, ch_star_indices) { 32 | // pairwise combine the subsample, barcodes and star indices channels 33 | ch_params = ch_subsample 34 | .combine(Channel.of("Forward", "Reverse")) 35 | .combine(ch_barcodes) 36 | .combine(ch_star_indices) 37 | .map { sample, accession, metadata, r1, r2, strand, barcodes_name, cb_len, umi_len, barcodes_file, organism, star_index -> 38 | if (metadata["organism"] != "" & metadata["organism"] != organism) { 39 | return null 40 | } 41 | def params = [ 42 | 
sample: sample, 43 | accession: accession, 44 | strand: strand, 45 | barcodes_name: barcodes_name, 46 | cell_barcode_length: cb_len, 47 | umi_length: umi_len, 48 | barcodes_file: barcodes_file, 49 | organism: organism, 50 | star_index: star_index 51 | ] 52 | return [sample, accession, metadata, r1, r2, barcodes_file, star_index, params] 53 | } 54 | .filter { it != null } 55 | 56 | // status on number of parameter combinations 57 | ch_params 58 | .ifEmpty("No valid parameter set found") 59 | .count().view{ count -> "Param sets to test across all SRX: ${count}" } 60 | return ch_params 61 | } 62 | 63 | def validateRequiredColumns(row, required) { 64 | // check if all required columns are present in the input CSV file 65 | def missing = required.findAll { !row.containsKey(it) } 66 | if (missing) { 67 | error "Missing columns in the input CSV file: ${missing}" 68 | } 69 | } 70 | 71 | def loadBarcodes(params) { 72 | // load the barcodes from the input CSV file 73 | ch_barcodes = Channel 74 | .fromPath(params.barcodes, checkIfExists: true) 75 | .splitCsv(header: true) 76 | .map { row -> 77 | def req_columns = ["name", "cell_barcode_length", "umi_length", "file_path"] 78 | validateRequiredColumns(row, req_columns) 79 | // remove special characters 80 | row.name = row.name.replaceAll("\\s", "_") 81 | return [ 82 | row.name, 83 | row.cell_barcode_length.toInteger(), 84 | row.umi_length.toInteger(), 85 | row.file_path 86 | ] 87 | } 88 | // status on number of barcodes 89 | ch_barcodes 90 | .ifEmpty("No barcodes found in the input CSV file") 91 | .count().view{ count -> "Number of input barcodes: ${count}" } 92 | return ch_barcodes 93 | } 94 | 95 | def loadStarIndices(params) { 96 | // load the STAR indices from the input CSV file 97 | ch_indices = Channel 98 | .fromPath(params.star_indices, checkIfExists: true) 99 | .splitCsv(header: true) 100 | .map { row -> 101 | def req_columns = ["organism", "star_index"] 102 | validateRequiredColumns(row, req_columns) 103 | // remove special characters 104 | row.organism = row.organism.replaceAll("\\s", "_") 105 | return [row.organism, row.star_index] 106 | } 107 | // status on number of star indices 108 | ch_indices 109 | .ifEmpty("No star indices found in the input CSV file") 110 | .count().view{ count -> "Number of input star indices: ${count}" } 111 | return ch_indices 112 | } -------------------------------------------------------------------------------- /scripts/gcp-upload/main.nf: -------------------------------------------------------------------------------- 1 | workflow { 2 | // find target MTX files to add to the database 3 | FIND_MTX() 4 | 5 | // list target MTX files 6 | mtx_files = FIND_MTX.out.csv 7 | .splitCsv( header: true ) 8 | .map{ row -> 9 | tuple(row.srx, file(row.matrix_path), file(row.features_path), file(row.barcodes_path)) 10 | } 11 | 12 | // group Velocyto MTX files by SRX 13 | if( params.feature_type == "Velocyto"){ 14 | mtx_files = mtx_files.groupTuple().map{ group -> 15 | tuple(group[0], group[1], group[2][0], group[3][0]) 16 | } 17 | } 18 | 19 | // convert to h5ad and publish 20 | MTX_TO_H5AD( mtx_files, Channel.fromPath(params.tissue_categories) ) 21 | 22 | // write parquet after all MTX_TO_H5AD jobs complete 23 | if( params.update_db ){ 24 | DB_TO_PARQUET( MTX_TO_H5AD.out.h5ad.collect() ) 25 | } 26 | 27 | // aggregate obs metadata 28 | AGG_OBS_METADATA( MTX_TO_H5AD.out.csv.collate(100) ) 29 | } 30 | 31 | process AGG_OBS_METADATA { 32 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: 
"metadata_TMP/${params.feature_type}/*.csv.gz" 33 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 34 | label "process_low" 35 | 36 | input: 37 | path csv_files 38 | 39 | output: 40 | path "metadata_TMP/${params.feature_type}/*.csv.gz", emit: obs_meta 41 | path "agg-obs-metadata.log", emit: log 42 | 43 | script: 44 | """ 45 | agg-obs-metadata.py \\ 46 | --feature-type ${params.feature_type} \\ 47 | ${csv_files} 2>&1 | tee agg-obs-metadata.log 48 | """ 49 | } 50 | 51 | process DB_TO_PARQUET { 52 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "metadata/${params.feature_type}/*/sample_metadata.parquet.gz" 53 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 54 | label "process_low" 55 | 56 | input: 57 | path csv_files 58 | 59 | output: 60 | path "metadata/${params.feature_type}/*/sample_metadata.parquet.gz", emit: samp_meta 61 | path "db-to-parquet.log", emit: log 62 | 63 | script: 64 | """ 65 | export GCP_SQL_DB_HOST="${params.db_host}" 66 | export GCP_SQL_DB_NAME="${params.db_name}" 67 | export GCP_SQL_DB_USERNAME="${params.db_username}" 68 | 69 | db-to-parquet.py \\ 70 | --feature-type ${params.feature_type} \\ 71 | 2>&1 | tee db-to-parquet.log 72 | """ 73 | } 74 | 75 | process MTX_TO_H5AD { 76 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "h5ad/${params.feature_type}/*/*.h5ad.gz" 77 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 78 | errorStrategy { task.attempt <= maxRetries ? 'retry' : 'ignore' } 79 | label "process_low" 80 | maxForks 200 81 | 82 | input: 83 | tuple val(srx), path(mtx_path), path(features_path), path(barcodes_path) 84 | each path(tissue_categories) 85 | 86 | output: 87 | path "h5ad/${params.feature_type}/*/${srx}.h5ad.gz", emit: h5ad 88 | path "metadata/${srx}.csv.gz", emit: csv 89 | path "mtx-to-h5ad_${srx}.log", emit: log 90 | 91 | script: 92 | def update_db = params.update_db ? "--update-database" : "" 93 | """ 94 | export GCP_SQL_DB_HOST="${params.db_host}" 95 | export GCP_SQL_DB_NAME="${params.db_name}" 96 | export GCP_SQL_DB_USERNAME="${params.db_username}" 97 | 98 | mtx-to-h5ad.py ${update_db} \\ 99 | --feature-type ${params.feature_type} \\ 100 | --missing-metadata "${params.missing_metadata}" \\ 101 | --tissue-categories "${tissue_categories}" \\ 102 | --srx ${srx} \\ 103 | --matrix ${mtx_path} \\ 104 | --publish-path "${params.output_dir}" \\ 105 | 2>&1 | tee mtx-to-h5ad_${srx}.log 106 | """ 107 | } 108 | 109 | process FIND_MTX { 110 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 111 | label "process_low" 112 | 113 | output: 114 | path "mtx_files.csv", emit: csv 115 | path "find-mtx.log", emit: log 116 | 117 | script: 118 | def organisms = params.organisms != "" ? "--organisms \"${params.organisms}\"" : "" 119 | def redo_processed = params.redo_processed.toString() == "true" ? 
"--redo-processed" : "" 120 | """ 121 | export GCP_SQL_DB_HOST="${params.db_host}" 122 | export GCP_SQL_DB_NAME="${params.db_name}" 123 | export GCP_SQL_DB_USERNAME="${params.db_username}" 124 | 125 | find-mtx.py ${organisms} ${redo_processed} \\ 126 | --feature-type ${params.feature_type} \\ 127 | --max-datasets ${params.max_datasets} \\ 128 | ${params.input_dir} \\ 129 | 2>&1 | tee find-mtx.log 130 | """ 131 | } -------------------------------------------------------------------------------- /config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = getWorkDir() 22 | conda.cacheDir = getCondaCacheDir() 23 | process { 24 | errorStrategy = "terminate" 25 | maxRetries = 0 26 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 27 | } 28 | } 29 | slurm { 30 | executor.queueSize = 30 31 | process { 32 | executor = "slurm" 33 | queue = "cpu_batch" 34 | errorStrategy = "retry" // "terminate" 35 | maxRetries = 1 36 | resourceLimits = [ cpus: 24, memory: 800.GB, time: 72.h ] 37 | } 38 | } 39 | gcp { 40 | workDir = "gs://arc-ctc-nextflow/scRecounter/prod/work" 41 | fusion.enabled = false 42 | wave.enabled = false 43 | executor { 44 | queueSize = 200 45 | pollInterval = "15 sec" 46 | } 47 | params { 48 | barcodes = "data/gcp/barcodes.csv" 49 | star_indices = "data/gcp/star_indices.csv" 50 | fasterq_tmp = "/tmp/TEMP" 51 | } 52 | process { 53 | executor = "google-batch" 54 | errorStrategy = "retry" 55 | maxRetries = 2 56 | scratch = true 57 | resourceLimits = [ cpus: 36, memory: 700.GB, time: 120.h ] 58 | } 59 | google { 60 | project = "c-tc-429521" 61 | location = "us-east1" 62 | batch { 63 | serviceAccountEmail = "nick-nextflow@c-tc-429521.iam.gserviceaccount.com" 64 | spot = true 65 | maxSpotAttempts = 3 66 | bootDiskSize = 150.GB 67 | } 68 | storage { 69 | multiplier = 2.0 70 | } 71 | } 72 | } 73 | dev { 74 | params { 75 | min_read_len = 20 76 | db_name = "sragent-test" 77 | fallback_max_spots = 10000000 78 | } 79 | } 80 | vm_dev { 81 | params { 82 | barcodes = "data/barcodes_n2.csv" 83 | star_indices = "data/star_indices.csv" 84 | } 85 | } 86 | slurm_dev { 87 | params { 88 | barcodes = "data/barcodes_n2.csv" 89 | star_indices = "data/star_indices.csv" 90 | } 91 | } 92 | gcp_dev { 93 | workDir = "gs://arc-ctc-nextflow/scRecounter/dev/work/" 94 | params { 95 | //barcodes = "data/gcp/barcodes_n2.csv" 96 | barcodes = "data/gcp/barcodes.csv" 97 | star_indices = "data/gcp/star_indices.csv" 98 | } 99 | } 100 | acc_dev { 101 | params { 102 | accessions = "data/accessions_small_n2.csv" 103 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/acc-n2" 104 | } 105 | } 106 | acc_dev_problems { 107 | params { 108 | accessions = "data/accessions_problems.csv" 109 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/acc-problems" 110 | } 111 | } 112 | acc_all_org { 113 | params { 114 | accessions = "data/accessions_all-org.csv" 115 | star_indices = "data/star_indices_all-org.csv" 116 | output_dir = 
"/scratch/multiomics/nickyoungblut/scRecounter/acc_all-org" 117 | define = true 118 | } 119 | } 120 | no_acc_dev { 121 | params { 122 | accessions = "" 123 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/no-acc" 124 | } 125 | } 126 | all_org { 127 | params { 128 | organisms = "Macaca mulatta,Anopheles gambiae,Arabidopsis thaliana,Bos taurus,Caenorhabditis elegans,Callithrix jacchus,Canis lupus familiaris,Danio rerio,Drosophila melanogaster,Equus caballus,Gallus gallus,Gorilla gorilla,Heterocephalus glaber,Oryctolagus cuniculus,Oryza sativa,Ovis aries,Pan troglodytes,Rattus norvegicus,Saccharomyces cerevisiae,Schistosoma mansoni,Solanum lycopersicum,Sus scrofa,Xenopus tropicalis,Zea mays" 129 | } 130 | } 131 | report { 132 | report { 133 | enabled = true 134 | overwrite = true 135 | file = "${params.output_dir}/nf-report/${params.timestamp}.html" 136 | } 137 | } 138 | trace { 139 | trace { 140 | enabled = true 141 | overwrite = true 142 | file = "${params.output_dir}/nf-trace/${params.timestamp}.txt" 143 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 144 | } 145 | } 146 | } 147 | 148 | 149 | -------------------------------------------------------------------------------- /lib/utils.groovy: -------------------------------------------------------------------------------- 1 | def readStarParams(star_params_file){ 2 | // read the input CSV file and check if all required columns are present 3 | return Channel 4 | .fromPath(star_params_file, checkIfExists: true) 5 | .splitCsv(header: true, sep: ',') 6 | .map { row -> 7 | def req_columns = ["sample", "fastq_1", "fastq_2", "barcodes_file", "star_index", 8 | "cell_barcode_length", "umi_length", "strand"] 9 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 10 | if (miss_columns) { 11 | error "Missing columns in the input CSV file: ${miss_columns}" 12 | } 13 | // remove special characters from the sample name 14 | row.sample = row.sample.replaceAll("\\s", "_") 15 | return [row.sample, row.fastq_1, row.fastq_2, row.barcodes_file, row.star_index, 16 | row.cell_barcode_length, row.umi_length, row.strand] 17 | } 18 | } 19 | 20 | def readAccessions(accessions_input){ 21 | // read the input accessions CSV file and check if all required columns are present 22 | ch_acc = accessions_input 23 | .splitCsv(header: true, sep: ",") 24 | .map { row -> 25 | def req_columns = ["sample", "accession"] 26 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 27 | if (miss_columns) { 28 | error "Missing columns in the input CSV file: ${miss_columns}" 29 | } 30 | // remove special characters from the sample name 31 | row.sample = row.sample.replaceAll("\\s", "_") 32 | def result = [row.sample, row.accession] 33 | // add optional, metadata columns 34 | def metadata = [:] 35 | ["organism", "tech_10x"].each { col -> 36 | metadata[col] = row.containsKey(col) ? 
row[col].replaceAll("\\s", "_") : "" 37 | } 38 | result << metadata 39 | return result 40 | } 41 | 42 | // print srx values 43 | ch_acc 44 | .map{ sample, accession, metadata -> sample } 45 | .distinct() 46 | .collect() 47 | .map{ it.join(',') } 48 | .view{ "SRX accessions: ${it}" } 49 | 50 | return ch_acc 51 | } 52 | 53 | def addStats(ch_accessions, ch_sra_stat){ 54 | // add file size information to the accessions 55 | ch_stats = ch_sra_stat 56 | .map{ sample,acc,csv -> csv } 57 | .splitCsv(header: true, sep: ",") 58 | .map{ row -> [row.accession, row.file_size_gb.toDouble()] } 59 | .join(ch_sra_stat.map{ sample,acc,csv -> [acc, sample] }, by: [0]) 60 | .map{ acc, size, sample -> [sample, acc, size] } 61 | return ch_accessions.join(ch_stats, by: [0,1]) // sample, acc, metadata, size 62 | } 63 | 64 | def joinReads(ch_read1, ch_read2){ 65 | // extract metadata to prevent incorrect joining 66 | ch_metadata = ch_read1.map{ sample,accession,metadata,fastq -> [sample,accession,metadata] } 67 | 68 | // join the read1 and read2 channels 69 | return ch_read1 70 | .map{ sample,accession,metadata,fastq -> [sample,accession,fastq] } 71 | .join( 72 | ch_read2.map{ sample,accession,metadata,fastq -> [sample,accession,fastq] }, 73 | by: [0,1] 74 | ) 75 | .join( 76 | ch_metadata, by: [0,1] 77 | ) 78 | .map{ 79 | sample,accession,fastq1,fastq2,metadata -> [sample,accession,metadata,fastq1,fastq2] 80 | } 81 | } 82 | 83 | def saveAsLog(filename, sample=null, accession=null) { 84 | if (filename.endsWith(".log")) { 85 | def basename = filename.tokenize("/")[-1] 86 | def path = "logs" 87 | if (sample){ 88 | path = "${path}/${sample}" 89 | } 90 | if (accession) { 91 | path = "${path}/${accession}" 92 | } 93 | path = "${path}/${basename}" 94 | return path 95 | } 96 | return null 97 | } 98 | 99 | def subsampleByGroup(ch_accessions, max_per_group, seed) { 100 | ch_accessions 101 | .groupTuple() 102 | .map { samples, accessions, meta, sra_stat -> 103 | accessions = accessions.toList() 104 | meta = meta.toList() 105 | sra_stat = sra_stat.toList() 106 | 107 | if (accessions) { // Ensure lists are not empty 108 | def indices = (0..<accessions.size()).toList() 109 | Collections.shuffle(indices, new Random(seed)) 110 | 111 | // reorder each list with the shuffled indices 112 | def shuffledAcc = indices.collect { accessions[it] } 113 | def shuffledMeta = indices.collect { meta[it] } 114 | def shuffledStat = indices.collect { sra_stat[it] } 115 | 116 | // keep at most max_per_group accessions per sample 117 | def n = Math.min(max_per_group, shuffledAcc.size()) 118 | accessions = shuffledAcc.size() > 0 ? shuffledAcc[0..<n] : [] 119 | meta = shuffledMeta.size() > 0 ? shuffledMeta[0..<n] : [] 120 | sra_stat = shuffledStat.size() > 0 ? shuffledStat[0..<n] : [] 121 | } 122 | 123 | [samples, accessions, meta, sra_stat] 124 | } 125 | .flatMap { samples, accessions, meta, sra_stat -> 126 | def flattened = [] 127 | for (int i = 0; i < accessions.size(); i++) { 128 | flattened << [samples, accessions[i], meta[i], sra_stat[i]] 129 | } 130 | flattened 131 | } 132 | } -------------------------------------------------------------------------------- /scripts/tiledb-loader-tahoe100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import gc 6 | import sys 7 | import logging 8 | import argparse 9 | from glob import glob 10 | from typing import List, Set, Tuple, Optional 11 | ## 3rd party 12 | import pandas as pd 13 | import tiledbsoma 14 | import tiledbsoma.io 15 | import scanpy as sc 16 | 17 | # format logging 18 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 19 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 20 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 21 | logging.getLogger("tiledb").setLevel(logging.WARNING) 22 | 23 | # classes 24 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 25 | pass 26 | 27 | # functions 28 | def parse_arguments() -> argparse.Namespace: 29 | """ 30 | Parse command-line arguments. 31 | """ 32 | desc = 'Convert Tahoe-100 dataset to TileDB format.' 
33 | epi = """DESCRIPTION: 34 | Test example: 35 | ./scripts/tiledb-loader-tahoe.py --db-uri ~/dev/nextflow/scRecounter/tmp/tiledb/srx3/tiledb-soma ~/dev/nextflow/scRecounter/tmp/tiledb/srx3/ 36 | 37 | Production (scRecounter): 38 | ./scripts/tiledb-loader-tahoe.py --h5ad-ext h5ad.gz --db-uri /processed_datasets/scRecount/tahoe/tiledb-soma /processed_datasets/scRecount/tahoe/ 39 | """ 40 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 41 | parser.add_argument( 42 | 'base_dir', type=str, help='Base directory to search for input data files' 43 | ) 44 | parser.add_argument( 45 | '--db-uri', type=str, default="tiledb-soma", 46 | help='URI of existing TileDB database, or it will be created if it does not exist' 47 | ) 48 | parser.add_argument( 49 | '--h5ad-ext', type=str, default="h5ad", 50 | help='File extension (suffix) for h5ad files' 51 | ) 52 | parser.add_argument( 53 | '--max-datasets', type=int, default=None, 54 | help='Maximum number of datasets to process' 55 | ) 56 | return parser.parse_args() 57 | 58 | 59 | def find_matrix_files( 60 | base_dir: str, 61 | file_ext: str, 62 | max_datasets: Optional[int]=None 63 | ) -> List[tuple]: 64 | """ 65 | Recursively find matrix.mtx.gz files and extract SRX IDs. 66 | Args: 67 | base_dir: Base directory to search 68 | max_datasets: Maximum number of datasets to process 69 | Returns: 70 | List of tuples (matrix_path, srx_id) 71 | """ 72 | logging.info(f"Searching for new data files in {base_dir}...") 73 | h5ad_files = glob(f"{base_dir}/*{file_ext}") 74 | if max_datasets: 75 | h5ad_files = h5ad_files[:max_datasets] 76 | 77 | logging.info(f" Found {len(h5ad_files)} new data files to process.") 78 | return h5ad_files 79 | 80 | 81 | def append_to_database(db_uri: str, adata: sc.AnnData) -> None: 82 | """ 83 | Append an AnnData object to the TileDB database. 84 | Args: 85 | db_uri: URI of the TileDB database 86 | adata: AnnData object to append 87 | """ 88 | logging.info(" Appending data...") 89 | 90 | # Register AnnData objects 91 | rd = tiledbsoma.io.register_anndatas( 92 | db_uri, 93 | [adata], 94 | measurement_name="RNA", 95 | obs_field_name="obs_id", 96 | var_field_name="var_id", 97 | ) 98 | 99 | # Apply resize 100 | with tiledbsoma.Experiment.open(db_uri) as exp: 101 | tiledbsoma.io.resize_experiment( 102 | exp.uri, 103 | nobs=rd.get_obs_shape(), 104 | nvars=rd.get_var_shapes() 105 | ) 106 | 107 | # Ingest new data into the db 108 | tiledbsoma.io.from_anndata( 109 | db_uri, 110 | adata, 111 | measurement_name="RNA", 112 | registration_mapping=rd, 113 | ) 114 | 115 | def create_tiledb(db_uri: str, adata: sc.AnnData) -> None: 116 | """ 117 | Create a new tiledb database. 
118 | Args: 119 | db_uri: URI of the TileDB database 120 | adata: AnnData object to append 121 | """ 122 | logging.info(f" Creating new database...") 123 | tiledbsoma.io.from_anndata( 124 | db_uri, 125 | adata, 126 | measurement_name="RNA", 127 | ) 128 | 129 | def load_tiledb(h5ad_files: List[str], db_uri: str) -> None: 130 | """ 131 | Load all h5ad files into TileDB-SOMA database 132 | Args: 133 | h5ad_files: List of h5ad files to load 134 | db_uri: URI of the TileDB database 135 | """ 136 | for infile in h5ad_files: 137 | logging.info(f"Processing {infile}...") 138 | 139 | # load anndata object 140 | adata = sc.read_h5ad(infile) 141 | ## format obs and var 142 | if not "obs_id" in adata.obs.columns: 143 | adata.obs["obs_id"] = adata.obs.index 144 | if not "var_id" in adata.var.columns: 145 | adata.var["var_id"] = adata.var.index 146 | 147 | # add to database 148 | if not os.path.exists(db_uri): 149 | create_tiledb(db_uri, adata) 150 | else: 151 | append_to_database(db_uri, adata) 152 | 153 | # clear memory 154 | del adata 155 | gc.collect() 156 | 157 | def main(): 158 | """Main function to run the TileDB loader workflow.""" 159 | args = parse_arguments() 160 | 161 | # Find all matrix files and their corresponding SRX IDs 162 | h5ad_files = find_matrix_files( 163 | args.base_dir, 164 | args.h5ad_ext, 165 | max_datasets=args.max_datasets 166 | ) 167 | 168 | # Load data into memory and append to TileDB 169 | load_tiledb(h5ad_files, args.db_uri) 170 | 171 | 172 | if __name__ == "__main__": 173 | from dotenv import load_dotenv 174 | load_dotenv(override=True) 175 | main() 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /bin/parallel-fastq-dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys, os, shutil, tempfile, subprocess, argparse, logging 3 | 4 | __version__ = "0.6.7" 5 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG) 6 | 7 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 8 | 9 | desc = "parallel fastq-dump wrapper, extra args will be passed through" 10 | epi = """DESCRIPTION: 11 | Example: parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip 12 | """ 13 | 14 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 15 | parser.add_argument("-s","--sra-id", help="SRA id", action="append") 16 | parser.add_argument("-t","--threads", help="number of threads", default=1, type=int) 17 | parser.add_argument("-O","--outdir", help="output directory", default=".") 18 | parser.add_argument("-T","--tmpdir", help="temporary directory", default=None) 19 | parser.add_argument("-N","--minSpotId", help="Minimum spot id", default=1, type=int) 20 | parser.add_argument("-X","--maxSpotId", help="Maximum spot id", default=None, type=int) 21 | parser.add_argument("-V","--version", help="shows version", action="store_true", default=False) 22 | 23 | def pfd(args: argparse.Namespace, srr_id: str, extra_args: list[str]) -> None: 24 | """Parallel fastq-dump. 25 | Args: 26 | args: Parsed command-line arguments. 27 | srr_id: Identifier for the SRA run. 28 | extra_args: Additional arguments to pass to fastq-dump. 
29 | """ 30 | tmp_dir = tempfile.TemporaryDirectory(prefix="pfd_", dir=args.tmpdir) 31 | logging.info(f"tempdir: {tmp_dir.name}") 32 | n_spots = get_spot_count(srr_id) 33 | logging.info(f"{srr_id} spots: {n_spots}") 34 | start = max(args.minSpotId, 1) 35 | end = min(args.maxSpotId, n_spots) if args.maxSpotId is not None else n_spots 36 | blocks = split_blocks(start, end, args.threads) 37 | logging.info(f"blocks: {blocks}") 38 | ps = [] 39 | for i in range(args.threads): 40 | d = os.path.join(tmp_dir.name, str(i)) 41 | os.mkdir(d) 42 | cmd = ["fastq-dump","-N",str(blocks[i][0]),"-X",str(blocks[i][1]),"-O",d]+extra_args+[srr_id] 43 | logging.info(f"CMD: {' '.join(cmd)}") 44 | p = subprocess.Popen(cmd) 45 | ps.append(p) 46 | wfd = {} 47 | for i,p in enumerate(ps): 48 | exit_code = p.wait() 49 | if exit_code != 0: 50 | logging.warning(f"fastq-dump error! exit code: {exit_code}") 51 | sys.exit(1) 52 | tmp_path = os.path.join(tmp_dir.name, str(i)) 53 | for fo in os.listdir(tmp_path): 54 | if fo not in wfd: wfd[fo] = open(os.path.join(args.outdir, fo), "wb") 55 | with open(os.path.join(tmp_path, fo), "rb") as fd: 56 | shutil.copyfileobj(fd, wfd[fo]) 57 | os.remove(os.path.join(tmp_path, fo)) 58 | for fd in wfd.values(): fd.close() 59 | 60 | def split_blocks(start: int, end: int, n_pieces: int) -> list[list[int]]: 61 | """Split a range of spot IDs into smaller blocks. 62 | Args: 63 | start: The first spot ID. 64 | end: The last spot ID. 65 | n_pieces: Number of blocks to split into. 66 | Returns: 67 | A list of lists, where each sub-list is [block_start, block_end]. 68 | """ 69 | total = end - start + 1 70 | avg = total // n_pieces 71 | out = [] 72 | last = start 73 | for i in range(n_pieces): 74 | out.append([last, last + avg - 1]) 75 | last += avg 76 | if i == n_pieces - 1: out[i][1] += total % n_pieces 77 | return out 78 | 79 | def get_spot_count(sra_id: str) -> int: 80 | """Get spot count using sra-stat. 81 | Args: 82 | sra_id: Identifier for the SRA run. 83 | Returns: 84 | Total number of spots in the specified SRA. 85 | """ 86 | cmd = ["sra-stat","--meta","--quick",sra_id] 87 | logging.info(f"CMD: {' '.join(cmd)}") 88 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 89 | stdout, stderr = p.communicate() 90 | txt = stdout.decode().rstrip().split("\n") 91 | total = 0 92 | try: 93 | for l in txt: total += int(l.split("|")[2].split(":")[0]) 94 | except IndexError: 95 | msg = "sra-stat output parsing error!\n--sra-stat STDOUT--\n{}\n--sra-stat STDERR--\n{}" 96 | raise IndexError(msg.format("\n".join(txt), stderr.decode().rstrip())) 97 | return total 98 | 99 | def partition(f, l: list) -> tuple[list, list]: 100 | """Partition a list into two groups based on a predicate. 101 | Args: 102 | f: A function that returns True or False for a given element. 103 | l: The list to be partitioned. 104 | Returns: 105 | A tuple of two lists: (matching, not_matching). 106 | """ 107 | r = ([],[]) 108 | for i in l: r[0 if f(i) else 1].append(i) 109 | return r 110 | 111 | def is_sra_file(path: str) -> bool: 112 | """Check if a file path is potentially an SRA file. 113 | Args: 114 | path: File path. 115 | Returns: 116 | True if the file is recognized as SRA-related, otherwise False. 
117 | """ 118 | f = os.path.basename(path) 119 | if f.lower().endswith(".sra"): return True 120 | if any(x in f.upper() for x in ["SRR","ERR","DRR"]): return True 121 | return False 122 | 123 | def main() -> None: 124 | """Main entry point to parse arguments and run parallel fastq-dump.""" 125 | args, extra = parser.parse_known_args() 126 | if args.version: 127 | print(f"parallel-fastq-dump : {__version__}") 128 | subprocess.Popen(["fastq-dump","-V"]).wait() 129 | sys.exit(0) 130 | elif args.sra_id: 131 | extra_srrs, extra_args = partition(is_sra_file, extra) 132 | args.sra_id.extend(extra_srrs) 133 | logging.info(f"SRR ids: {args.sra_id}") 134 | logging.info(f"extra args: {extra_args}") 135 | if not os.path.isdir(args.outdir) and args.outdir != ".": 136 | os.makedirs(args.outdir) 137 | if args.tmpdir and not os.path.isdir(args.tmpdir) and args.tmpdir != ".": 138 | os.makedirs(args.tmpdir) 139 | for si in args.sra_id: pfd(args, si, extra_args) 140 | else: 141 | parser.print_help() 142 | sys.exit(1) 143 | 144 | if __name__ == "__main__": 145 | main() -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/mtx-to-h5ad.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | import concurrent.futures 8 | from pathlib import Path 9 | from itertools import chain, repeat 10 | from typing import List, Set, Tuple, Optional 11 | ## 3rd party 12 | import numpy as np 13 | import scipy.sparse 14 | import pandas as pd 15 | import tiledbsoma 16 | import tiledbsoma.io 17 | import scanpy as sc 18 | from pypika import Query, Table 19 | ## package 20 | from db_utils import db_connect 21 | 22 | # format logging 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 25 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 26 | logging.getLogger("tiledb").setLevel(logging.WARNING) 27 | 28 | # classes 29 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 30 | pass 31 | 32 | # functions 33 | def parse_arguments() -> argparse.Namespace: 34 | """ 35 | Parse command-line arguments. 36 | """ 37 | desc = 'Convert mtx files to h5ad.' 38 | epi = """DESCRIPTION: 39 | Convert mtx files to h5ad in parallel. 40 | """ 41 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 42 | parser.add_argument( 43 | '--srx', type=str, help="SRX accessions", required=True 44 | ) 45 | parser.add_argument( 46 | '--path', type=str, help="Path to matrix.mtx.gz files", required=True 47 | ) 48 | parser.add_argument( 49 | '--missing-metadata', type=str, default="error", 50 | choices=["error", "skip", "allow"], 51 | help="How do handle missing metadata?" 52 | ) 53 | parser.add_argument( 54 | '--threads', type=int, default=8, help="Number of threads to use" 55 | ) 56 | return parser.parse_args() 57 | 58 | 59 | def load_matrix_as_anndata( 60 | srx_id: str, 61 | matrix_path: str, 62 | missing_metadata: str="error", 63 | ) -> sc.AnnData: 64 | """ 65 | Load a matrix.mtx.gz file as an AnnData object. 
66 | Args: 67 | srx_id: SRX accession 68 | matrix_path: Path to matrix.mtx.gz file 69 | missing_metadata: How to handle missing metadata 70 | Returns: 71 | AnnData object 72 | """ 73 | # get metadata from scRecounter postgresql database 74 | srx_metadata = Table("srx_metadata") 75 | stmt = ( 76 | Query 77 | .from_(srx_metadata) 78 | .select( 79 | srx_metadata.lib_prep, 80 | srx_metadata.tech_10x, 81 | srx_metadata.organism, 82 | srx_metadata.tissue, 83 | srx_metadata.disease, 84 | srx_metadata.purturbation, 85 | srx_metadata.cell_line, # TODO: add cell_prep 86 | srx_metadata.czi_collection_id, 87 | srx_metadata.czi_collection_name, 88 | ) 89 | .where(srx_metadata.srx_accession == srx_id) 90 | ) 91 | metadata = None 92 | with db_connect() as conn: 93 | metadata = pd.read_sql(str(stmt), conn) 94 | 95 | ## if metadata is not found, return None 96 | if metadata is None or metadata.shape[0] == 0: 97 | if missing_metadata == "allow": 98 | logging.warning( 99 | f" Metadata not found for SRX accession {srx_id}, but `--missing-metadata allow` used" 100 | ) 101 | pass 102 | elif missing_metadata == "skip": 103 | logging.warning( 104 | f" Metadata not found for SRX accession {srx_id}, but `--missing-metadata skip` used" 105 | ) 106 | return None 107 | elif missing_metadata == "error": 108 | raise ValueError(f" Metadata not found for SRX accession {srx_id}") 109 | else: 110 | raise ValueError(f" Invalid value for `--missing-metadata`") 111 | if metadata.shape[0] > 1: 112 | raise ValueError(f"Multiple metadata entries found for SRX accession {srx_id}") 113 | 114 | # load count matrix 115 | adata = sc.read_10x_mtx( 116 | os.path.dirname(matrix_path), 117 | var_names="gene_ids", 118 | make_unique=True 119 | ) 120 | 121 | # calculate total counts 122 | if scipy.sparse.issparse(adata.X): 123 | adata.obs["gene_count"] = (adata.X > 0).sum(axis=1).A1 124 | adata.obs["umi_count"] = adata.X.sum(axis=1).A1 125 | else: 126 | adata.obs["gene_count"] = (adata.X > 0).sum(axis=1) 127 | adata.obs["umi_count"] = adata.X.sum(axis=1) 128 | adata.obs["barcode"] = adata.obs.index 129 | 130 | # append SRX to barcode to create a global-unique index 131 | adata.obs.index = adata.obs.index + f"_{srx_id}" 132 | 133 | # add metadata to adata 134 | adata.obs["SRX_accession"] = srx_id 135 | for col in metadata.columns: 136 | try: 137 | adata.obs[col] = str(metadata[col].values[0]) 138 | except IndexError: 139 | adata.obs[col] = None 140 | 141 | return adata 142 | 143 | def mtx_to_h5ad( 144 | matrix_files: str, 145 | missing_metadata: str="error", 146 | threads: int=8 147 | ) -> sc.AnnData: 148 | """ 149 | Convert a list of matrix.mtx.gz files to a single h5ad file. 
150 | """ 151 | logging.info("Loading mtx files to h5ad...") 152 | 153 | # paralle load mtx files 154 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 155 | adata = list(executor.map( 156 | lambda x: load_matrix_as_anndata( 157 | x[0], x[1], missing_metadata=missing_metadata 158 | ), 159 | matrix_files 160 | )) 161 | ## filter out empty objects 162 | adata = [a for a in adata if a is not None] 163 | 164 | ## concat 165 | adata = sc.concat(adata, join="outer") 166 | 167 | ## write to h5ad 168 | adata.write_h5ad(f"data.h5ad") 169 | logging.info(f"Saved h5ad file to data.h5ad") 170 | 171 | def parse_arg(arg: str) -> List[str]: 172 | """Parse a comma-separated argument into a list.""" 173 | return [x.strip() for x in arg.lstrip("[").rstrip("]").split(",")] 174 | 175 | def main(): 176 | """Main function to run the TileDB loader workflow.""" 177 | args = parse_arguments() 178 | 179 | # parse args 180 | mtx_files = list(zip(parse_arg(args.srx), parse_arg(args.path))) 181 | logging.info(f"mtx file count: {len(mtx_files)}") 182 | 183 | # create h5ad files 184 | mtx_to_h5ad( 185 | mtx_files, 186 | threads=args.threads, 187 | missing_metadata=args.missing_metadata 188 | ) 189 | 190 | if __name__ == "__main__": 191 | from dotenv import load_dotenv 192 | load_dotenv(override=True) 193 | main() -------------------------------------------------------------------------------- /bin/get-db-accessions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from typing import List 10 | ## 3rd party 11 | import pandas as pd 12 | import psycopg2 13 | import pandas as pd 14 | from pypika import Query, Table, Criterion 15 | from psycopg2.extras import execute_values 16 | from psycopg2.extensions import connection 17 | ## pipeline 18 | from db_utils import db_connect, db_upsert 19 | 20 | 21 | # logging 22 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 23 | 24 | # argparse 25 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 26 | argparse.RawDescriptionHelpFormatter): 27 | pass 28 | 29 | desc = 'Get SRA accessions from the scRecounter database' 30 | epi = """DESCRIPTION: 31 | Get SRA accessions from the scRecounter database. 32 | Write out the accessions csv table: 33 | - sample: SRX accession 34 | - accession: SRR accession 35 | - organism: organism name 36 | """ 37 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 38 | formatter_class=CustomFormatter) 39 | parser.add_argument('--max-srx', type=int, default=5, 40 | help='Max number of srx records to return') 41 | parser.add_argument('--database', type=str, default=["sra", "gds"], nargs="+", 42 | help='Only return records from these databases') 43 | parser.add_argument('--organisms', type=str, default="human,mouse", 44 | help='Organisms to filter by; comma-separated list') 45 | parser.add_argument('--outfile', type=str, default="accessions.csv", 46 | help='Output file name') 47 | 48 | # functions 49 | def db_get_unprocessed_records( 50 | conn: connection, 51 | process: str, 52 | database: List[str], 53 | max_srx: int=3, 54 | organisms: List[str] = ["human", "mouse"] 55 | ) -> pd.DataFrame: 56 | """ 57 | Get all suitable unprocessed SRX records, limiting by unique srx_accession values. 58 | Args: 59 | conn: Connection to the database. 60 | database: Name of the database to query. 
61 | max_srx: Maximum number of SRX records to return. 62 | Returns: 63 | dataframe of unprocessed SRX records. 64 | """ 65 | # init tables 66 | srx_metadata = Table("srx_metadata") 67 | srx_srr = Table("srx_srr") 68 | scr_log = Table("screcounter_log") 69 | 70 | # subquery to get srx_accessions 71 | ## find already-processed records in sc-recounter log 72 | nontarget_srx = ( 73 | Query 74 | .from_(scr_log) 75 | .select(scr_log.sample) 76 | .where( 77 | Criterion.all([ 78 | scr_log.process == process, 79 | scr_log.step == "Final", 80 | scr_log.status == "Success" 81 | ]) 82 | ) 83 | .distinct() 84 | ) 85 | 86 | # status 87 | num_nontarget = pd.read_sql(str(nontarget_srx), conn).shape[0] 88 | logging.info(f"No. of non-target records: {num_nontarget}") 89 | 90 | ## find unprocessed records 91 | target_srx = ( 92 | Query 93 | .from_(srx_metadata) 94 | .left_join(nontarget_srx) 95 | .on(srx_metadata.srx_accession == nontarget_srx.sample) 96 | .select(srx_metadata.srx_accession) 97 | .where( 98 | Criterion.all([ 99 | nontarget_srx.sample.isnull(), # filters out already processed records 100 | srx_metadata.database.isin(database), 101 | srx_metadata.srx_accession != "", 102 | (srx_metadata.srx_accession.like("SRX%") | srx_metadata.srx_accession.like("ERX%")), 103 | srx_metadata.is_illumina == "yes", 104 | srx_metadata.is_single_cell == "yes", 105 | srx_metadata.is_paired_end == "yes", 106 | srx_metadata.lib_prep == "10x_Genomics", 107 | srx_metadata.organism.isin(organisms), 108 | srx_metadata.czi_collection_id.isnull() | srx_metadata.czi_collection_id.isin(["", "NaN", "None"]), 109 | #~srx_metadata.tech_10x.isin(["other", "not_applicable"]) # TODO: comment to make the query more permissive 110 | ]) 111 | ) 112 | .distinct() 113 | .limit(max_srx) 114 | ) 115 | 116 | # status 117 | #df_target = pd.read_sql(str(target_srx), conn) 118 | #print(f"No. of target records: {df_target.shape[0]}") 119 | 120 | # main query to obtain the SRR for each SRX and then format the output 121 | stmt = ( 122 | Query 123 | .from_(srx_metadata) 124 | .inner_join(srx_srr) 125 | .on(srx_metadata.srx_accession == srx_srr.srx_accession) 126 | .where( 127 | srx_metadata.srx_accession.isin(target_srx) 128 | ) 129 | .select( 130 | srx_metadata.srx_accession.as_("sample"), 131 | srx_srr.srr_accession.as_("accession"), 132 | srx_metadata.organism.as_("organism"), 133 | srx_metadata.tech_10x.as_("tech_10x"), 134 | ) 135 | .distinct() 136 | ) 137 | 138 | # fetch as pandas dataframe 139 | return pd.read_sql(str(stmt), conn) 140 | 141 | def main(args): 142 | # parse organisms 143 | args.organisms = args.organisms.split(",") 144 | 145 | # set process name; used to determine which records have been processed 146 | process = "Get db accessions" 147 | 148 | # get unprocessed records 149 | with db_connect() as conn: 150 | df = db_get_unprocessed_records( 151 | conn, process, args.database, max_srx=args.max_srx, organisms=args.organisms 152 | ) 153 | 154 | # remove spaces from organism 155 | df["organism"] = df["organism"].str.replace(" ", "_") 156 | 157 | # log number of records 158 | num_unique_srx = df["sample"].nunique() 159 | logging.info(f"No. of target SRX accessions: {num_unique_srx}") 160 | num_unique_acc = df["accession"].nunique() 161 | logging.info(f"No. of target SRR accessions: {num_unique_acc}") 162 | srr_per_srx = df.groupby("sample")["accession"].count() 163 | logging.info(f"No. 
of target SRR per SRX: {srr_per_srx.to_dict()}") 164 | 165 | ## write out records 166 | df.to_csv(args.outfile, index=False) 167 | 168 | # write to log table in scRecounter database 169 | ## convert df 170 | df["process"] = process 171 | df["step"] = "Final" 172 | df["status"] = "Success" 173 | df["message"] = "Obtained database accession for processing" 174 | 175 | ## filter columns 176 | df = df[["sample", "accession", "process", "step", "status", "message"]] 177 | 178 | ## upsert log to database 179 | logging.info("Updating scRecounter log table...") 180 | with db_connect() as conn: 181 | db_upsert(df, "screcounter_log", conn) 182 | 183 | ## script main 184 | if __name__ == '__main__': 185 | args = parser.parse_args() 186 | main(args) 187 | 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scRecounter 2 | ========== 3 | 4 | A Nextflow pipeline to re-process single-cell RNA-seq data from the Sequence Read Archive. 5 | 6 | # Workflow 7 | 8 | * **User provides:** 9 | * A table of samples & associated accessions 10 | * Alternatively, the pipeline can pull accessions from the scRecounter SQL database 11 | * Associated files required: 12 | * A table of barcodes to use for cell barcode and UMI identification 13 | * A table of STAR index directories to use for mapping 14 | * **Pipeline:** 15 | * Load accessions from provided table or SQL database 16 | * For each accession: 17 | * Use `fastq-dump` to download a subset of reads as fastq files from the SRA 18 | * Determine the "best" STAR parameters by mapping the reads using various parameter combinations 19 | * Parameters: version of cell barcodes, cell barcode length, UMI length, strand, STAR reference index 20 | * The STAR parameters are selected based on the fraction of valid barcodes 21 | * Download all reads with `fasterq-dump` 22 | * If download fails, try again with `fastq-dump` using a max of `fallback_max_spots` reads (see `nextflow.config`). 23 | * Map the reads with STARsolo using the "best" STAR parameters 24 | 25 | # Manuscript 26 | 27 | **scBaseCamp: An AI agent-curated, uniformly processed, and continually expanding single cell data repository**. 28 | Nicholas D Youngblut, Christopher Carpenter, Jaanak Prashar, Chiara Ricci-Tam, Rajesh Ilango, Noam Teyssier, 29 | Silvana Konermann, Patrick Hsu, Alexander Dobin, David P Burke, Hani Goodarzi, Yusuf H Roohani. 30 | bioRxiv 2025.02.27.640494; doi: [https://doi.org/10.1101/2025.02.27.640494](https://doi.org/10.1101/2025.02.27.640494) 31 | 32 | # Installation 33 | 34 | ## Conda & mamba install 35 | 36 | `mamba` is needed to run the pipeline. 37 | It is a faster drop-in replacement for `conda`. 38 | `mamba` can be installed via `conda`. 39 | You can use `conda` instead of `mamba` if you prefer. 40 | 41 | ## Nextflow install 42 | 43 | It is easiest to install Nextflow using `mamba` (or `conda`). 44 | 45 | ```bash 46 | mamba create -n nextflow_env -c bioconda nextflow 47 | ``` 48 | 49 | Make sure to activate the environment before running the pipeline: 50 | 51 | ```bash 52 | mamba activate nextflow_env 53 | ``` 54 | 55 | All other dependencies will be installed by Nextflow. 
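To double-check the setup before moving on, you can activate the environment and print the Nextflow version (a minimal sanity check, not a required step):

```bash
# activate the environment created above
mamba activate nextflow_env

# confirm that Nextflow is available
nextflow -version
```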
56 | 57 | 58 | ## Pipeline install 59 | 60 | ### Clone the repo 61 | 62 | ```bash 63 | git clone https://github.com/ArcInstitute/scRecounter.git \ 64 | && cd scRecounter 65 | ``` 66 | 67 | ### Pipeline conda environments (if running locally) 68 | 69 | The pipeline uses conda environments to manage dependencies. 70 | Nextflow will automatically create the environments as long as `mamba` is installed. 71 | 72 | **Note:** it can take a while to create the environments, even with `mamba`. 73 | 74 | ### Pipeline Docker containers (if running on GCP) 75 | 76 | The pipeline defaults to using custom Docker containers hosted on Google Artifact Registry. 77 | 78 | You can build the Docker containers yourself. See [./docker/README.md](./docker/README.md) for details. 79 | Be sure to update the [profiles.config](./config/profiles.config) file to point to the new containers. 80 | 81 | # Usage 82 | 83 | ## Input 84 | 85 | ### Accessions table 86 | 87 | Lists the samples and their associated SRA experiment accessions. 88 | 89 | > This table is not required if the pipeline is pulling accessions from the scRecounter SQL database. 90 | To pull accessions from the database, do not provide `--accessions` via the command line. 91 | 92 | Example: 93 | 94 | | sample | accession | organism | 95 | |-------------|-------------|----------| 96 | | SRX22716300 | SRR27024456 | human | 97 | | SRX25994842 | SRR30571763 | mouse | 98 | 99 | > `organism` is optional. It will determine the STAR index to use for mapping. Otherwise all indexes will be used for parameter selection. 100 | 101 | ### Barcode table 102 | 103 | Lists all of the possible barcodes that will be used to determine the cell barcode and UMI for the samples. 104 | 105 | Example: 106 | 107 | | name | cell_barcode_length | umi_length | file_path | 108 | |------------------|---------------------|------------|--------------------------------------------------------------------------| 109 | | 737K-arc-v1 | 16 | 12 | /large_storage/goodarzilab/public/scRecount/genomes/737K-arc-v1.txt | 110 | | 737K-august-2016 | 16 | 12 | /large_storage/goodarzilab/public/scRecount/genomes/737K-august-2016.txt | 111 | | 3M-february-2018 | 16 | 10 | /large_storage/goodarzilab/public/scRecount/genomes/3M-february-2018.txt | 112 | 113 | 114 | ### STAR index table 115 | 116 | Lists the STAR index files that will be used to map the reads. 117 | 118 | Example: 119 | 120 | | Organism | Star Index Path | 121 | |----------|-----------------------------------------------------------------------------------| 122 | | human | /large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 | 123 | | mouse | /large_storage/goodarzilab/public/scRecount/genomes/star2.7.11_refData_2020_mm10 | 124 | 125 | 126 | > If `organism` is provided in the `Accessions` table, the STAR index will be selected based on the `organism` column. 127 | Thus, it reduces the number of parameter combinations that need to be tested. 
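Putting the three input tables together, a local test run that supplies them explicitly might look like the following sketch. The `--accessions`, `--barcodes`, and `--star_indices` parameter names mirror the params set in [profiles.config](./config/profiles.config), and the CSV paths below are the small example tables shipped in `data/`; adjust both to your own files:

```bash
nextflow run main.nf \
  -work-dir tmp/work \
  -profile conda,vm \
  --accessions data/accessions_small_n2.csv \
  --barcodes data/barcodes_n2.csv \
  --star_indices data/star_indices.csv \
  --output_dir results/
```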
128 | 129 | ## Nextflow run 130 | 131 | ### Test runs 132 | 133 | Local run with provided accessions: 134 | 135 | ```bash 136 | nextflow run main.nf \ 137 | -work-dir tmp/work \ 138 | -profile conda,trace,report,vm,vm_dev,dev,acc_dev 139 | ``` 140 | 141 | Local run with provided accessions (problematic datasets) 142 | 143 | ```bash 144 | nextflow run main.nf \ 145 | -work-dir tmp/work \ 146 | -profile conda,trace,report,vm,vm_dev,dev,acc_dev_problems 147 | ``` 148 | 149 | With conda, accessions pulled from scRecounter database: 150 | 151 | ```bash 152 | nextflow run main.nf \ 153 | -work-dir tmp/work \ 154 | -profile conda,trace,report,vm,vm_dev,dev,no_acc_dev 155 | ``` 156 | 157 | GCP run with provided accessions: 158 | 159 | ```bash 160 | nextflow run main.nf \ 161 | -profile docker,trace,report,gcp,gcp_dev,dev,acc_dev 162 | ``` 163 | 164 | GCP run with accessions pulled from scRecounter SQL database: 165 | 166 | ```bash 167 | nextflow run main.nf \ 168 | -profile docker,trace,report,gcp,gcp_dev,dev,no_acc_dev 169 | ``` 170 | 171 | ### Characterize datasets 172 | 173 | Use just a small subset of reads in the dataset to identify library prep method, species, etc. 174 | 175 | ```bash 176 | nextflow run /home/nickyoungblut/dev/nextflow/scRecounter/main.nf \ 177 | -work-dir gs://arc-ctc-nextflow/scRecounter/work \ 178 | -profile docker,gcp \ 179 | -ansi-log false \ 180 | --max_spots 100000 \ 181 | --output_dir gs://arc-ctc-nextflow/scRecounter/results/ \ 182 | --accessions TMP/SRX22716300.csv 183 | ``` 184 | 185 | ### Deploy to GCP Cloud Run 186 | 187 | See [./docker/sc-recounter-run/README.md](./docker/sc-recounter-run/README.md) for details. 188 | 189 | 190 | # Contributing 191 | 192 | Feel free to fork the repository and submit a pull request. 193 | However, the top priority is to keep SRAgent functioning 194 | for the ongoing scBaseCamp project. -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/h5ad-to-db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import gc 6 | import logging 7 | import argparse 8 | import concurrent.futures 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | import tiledbsoma 13 | import tiledbsoma.io 14 | import scanpy as sc 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 19 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 20 | logging.getLogger("tiledb").setLevel(logging.WARNING) 21 | 22 | # classes 23 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 24 | pass 25 | 26 | # functions 27 | def parse_arguments() -> argparse.Namespace: 28 | """ 29 | Parse command-line arguments. 30 | """ 31 | desc = 'Add scRNA-seq data to a TileDB database.' 32 | epi = """DESCRIPTION: 33 | """ 34 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 35 | parser.add_argument( 36 | 'h5ad_files', type=str, nargs="+", help='Path to the h5ad file(s) to load.' 37 | ) 38 | parser.add_argument( 39 | '--db-uri', type=str, help='URI of the TileDB database.', required=True 40 | ) 41 | parser.add_argument( 42 | '--from-disk', action='store_true', default=False, help='Load from disk instead of memory.' 
43 | ) 44 | parser.add_argument( 45 | '--threads', type=int, default=8, help='Number of threads to use.' 46 | ) 47 | return parser.parse_args() 48 | 49 | def append_to_database_from_mem(db_uri: str, adata: sc.AnnData) -> None: 50 | """ 51 | Append an AnnData object to the TileDB database. 52 | Args: 53 | db_uri: URI of the TileDB database 54 | adata: AnnData object to append 55 | """ 56 | logging.info(" Appending data...") 57 | 58 | # Register AnnData objects 59 | rd = tiledbsoma.io.register_anndatas( 60 | db_uri, 61 | [adata], 62 | measurement_name="RNA", 63 | obs_field_name="obs_id", 64 | var_field_name="var_id", 65 | ) 66 | 67 | with tiledbsoma.Experiment.open(db_uri) as exp: 68 | tiledbsoma.io.resize_experiment( 69 | exp.uri, 70 | nobs=rd.get_obs_shape(), 71 | nvars=rd.get_var_shapes() 72 | ) 73 | 74 | # Ingest new data into the db 75 | tiledbsoma.io.from_anndata( 76 | db_uri, 77 | adata, 78 | measurement_name="RNA", 79 | registration_mapping=rd, 80 | ) 81 | 82 | def create_tiledb_from_mem(db_uri: str, adata: sc.AnnData) -> None: 83 | """ 84 | Create a new tiledb database. 85 | Args: 86 | db_uri: URI of the TileDB database 87 | adata: AnnData object to append 88 | """ 89 | logging.info(" Creating new database...") 90 | tiledbsoma.io.from_anndata( 91 | db_uri, 92 | adata, 93 | measurement_name="RNA", 94 | ) 95 | 96 | def load_tiledb_from_mem(h5ad_files: List[str], db_uri: str, threads: int=8) -> None: 97 | """ 98 | Loads `batch_size` files in parallel, then appends them all at once to the database. 99 | Args: 100 | matrix_files: List of tuples (matrix_path, srx_id) 101 | db_uri: URI of the TileDB database 102 | threads: Number of threads to use 103 | """ 104 | # load anndata objects in parallel 105 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 106 | ann_list = executor.map(sc.read_h5ad, h5ad_files) 107 | 108 | # append to database 109 | for i,adata in enumerate(ann_list, 1): 110 | logging.info(f"Processing matrix file {i} of {len(h5ad_files)}") 111 | if not os.path.exists(db_uri): 112 | create_tiledb_from_mem(db_uri, adata) 113 | else: 114 | append_to_database_from_mem(db_uri, adata) 115 | 116 | # status 117 | logging.info("All matrix files processed!") 118 | 119 | def append_to_database_from_disk(db_uri: str, h5ad_files: List[str], threads: int) -> None: 120 | """ 121 | Append a anndata object from h5ad files to the TileDB database. 
122 | Args: 123 | db_uri: URI of the TileDB database 124 | h5ad_files: List of h5ad files to append 125 | threads: Number of threads to use 126 | """ 127 | logging.info(" Appending data...") 128 | 129 | # Register h5ad objects 130 | rd = tiledbsoma.io.register_h5ads( 131 | db_uri, 132 | h5ad_files, 133 | measurement_name="RNA", 134 | obs_field_name="obs_id", 135 | var_field_name="var_id", 136 | ) 137 | 138 | # Resize the experiment 139 | with tiledbsoma.Experiment.open(db_uri) as exp: 140 | tiledbsoma.io.resize_experiment( 141 | exp.uri, 142 | nobs=rd.get_obs_shape(), 143 | nvars=rd.get_var_shapes() 144 | ) 145 | 146 | # Ingest new data into the db 147 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 148 | futures = [ 149 | executor.submit( 150 | tiledbsoma.io.from_h5ad, 151 | db_uri, 152 | h5ad_file, 153 | measurement_name="RNA", 154 | registration_mapping=rd 155 | ) 156 | for h5ad_file in h5ad_files 157 | ] 158 | # Wait for all futures to complete 159 | concurrent.futures.wait(futures) 160 | # Raise any exceptions that occurred 161 | for future in futures: 162 | future.result() 163 | 164 | def create_tiledb_from_disk(db_uri: str, h5ad_file: str) -> None: 165 | """ 166 | Create a new tiledb database. 167 | Args: 168 | db_uri: URI of the TileDB database 169 | h5ad_file: Path to the h5ad file to load 170 | """ 171 | logging.info(" Creating new database...") 172 | tiledbsoma.io.from_h5ad( 173 | db_uri, h5ad_file, measurement_name="RNA", 174 | ) 175 | 176 | def load_tiledb_from_disk(h5ad_files: List[str], db_uri: str, threads: int) -> None: 177 | """ 178 | Load h5ad files from disk and append them to the TileDB database. 179 | The database is created if it does not exist. 180 | Args: 181 | h5ad_files: List of h5ad files to load 182 | db_uri: URI of the TileDB database 183 | threads: Number of threads to use 184 | """ 185 | logging.info("Loading data from disk...") 186 | 187 | # append/create database 188 | if not os.path.exists(db_uri): 189 | create_tiledb_from_disk(db_uri, h5ad_files[0]) 190 | h5ad_files = h5ad_files[1:] 191 | append_to_database_from_disk(db_uri, h5ad_files, threads) 192 | 193 | # status 194 | logging.info("All matrix files processed!") 195 | 196 | 197 | def main(): 198 | """Main function to run the TileDB loader workflow.""" 199 | args = parse_arguments() 200 | 201 | # Load data into memory and append to TileDB 202 | if args.from_disk: 203 | load_tiledb_from_disk(args.h5ad_files, args.db_uri, args.threads) 204 | else: 205 | load_tiledb_from_mem(args.h5ad_files, args.db_uri, args.threads) 206 | 207 | 208 | if __name__ == "__main__": 209 | from dotenv import load_dotenv 210 | load_dotenv(override=True) 211 | main() 212 | 213 | -------------------------------------------------------------------------------- /scripts/gcp2chimera.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from shutil import which, rmtree 6 | from typing import Tuple, List, Dict 7 | from datetime import datetime, timedelta 8 | from google.cloud import storage 9 | from subprocess import run 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 13 | pass 14 | 15 | def parse_args() -> argparse.Namespace: 16 | """ 17 | Parse command-line arguments. 18 | Returns: 19 | argparse.Namespace containing arguments. 
20 | """ 21 | # default min/max datetime 22 | fmt = "%Y-%m-%d_%H-%M-%S" 23 | min_dt = (datetime.now() - timedelta(days=3)).strftime(fmt) 24 | max_dt = (datetime.now() - timedelta(days=2)).strftime(fmt) 25 | 26 | desc = 'Transfer scRecounter output files from GCP to Chimera.' 27 | epi = """DESCRIPTION: 28 | Transfer scRecounter output files from GCP to Chimera. 29 | Example: 30 | ./scripts/gcp2chimera.py \ 31 | --min-date-time 2025-02-18_00-00-00 \ 32 | --max-date-time 2025-02-19_00-00-00 \ 33 | --dest-dir /processed_datasets/scRecount/scRecounter/prod3 \ 34 | --dry-run \ 35 | gs://arc-ctc-screcounter/prod3/ 36 | """ 37 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 38 | parser.add_argument('gcs_dir', type=str, 39 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/prod3/)') 40 | parser.add_argument('--dest-dir', type=str, default="/processed_datasets/scRecount/scRecounter/prod3", 41 | help='Destination location on Chimera') 42 | parser.add_argument('--min-date-time', type=str, default=min_dt, 43 | help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 44 | parser.add_argument('--max-date-time', type=str, default=max_dt, 45 | help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 46 | parser.add_argument('--dry-run', action='store_true', 47 | help='Print commands without executing') 48 | parser.add_argument('--force', action='store_true', 49 | help='Force overwrite of existing directories in the dest-dir') 50 | return parser.parse_args() 51 | 52 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 53 | """ 54 | Parse a GCP bucket path. 55 | Args: 56 | gs_path: GCP bucket path starting with gs:// 57 | Returns: 58 | A tuple of (bucket_name, prefix). 59 | """ 60 | if not gs_path.startswith("gs://"): 61 | raise ValueError("Path must start with 'gs://'") 62 | parts = gs_path[5:].split("/", 1) 63 | bucket_name = parts[0] 64 | prefix = parts[1] if len(parts) > 1 else "" 65 | return bucket_name, prefix.rstrip("/") + "/" 66 | 67 | def list_screcounter_directories( 68 | bucket: storage.bucket.Bucket, 69 | prefix: str, 70 | min_dt: datetime, 71 | max_dt: datetime 72 | ) -> List[str]: 73 | """ 74 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket 75 | under the given prefix, filtered by date/time range. 76 | Args: 77 | bucket: The GCS bucket object. 78 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 79 | min_dt: The minimum datetime (inclusive). 80 | max_dt: The maximum datetime (inclusive). 81 | 82 | Returns: 83 | A list of directory prefixes that fall within the specified date/time range. 84 | """ 85 | print(f"Listing directories under {prefix}...") 86 | num_searched = 0 87 | dir_list = [] 88 | # Delimiter forces listing top-level folders under prefix 89 | iterator = bucket.list_blobs(prefix=prefix, delimiter='/') 90 | for page in iterator.pages: 91 | for folder in page.prefixes: 92 | folder_name = folder.rstrip('/').split('/')[-1] 93 | # Expecting folder_name like SCRECOUNTER_YYYY-MM-DD_hh-mm-ss 94 | if folder_name.startswith("SCRECOUNTER_"): 95 | num_searched += 1 96 | try: 97 | date_str = folder_name.replace("SCRECOUNTER_", "") 98 | dt = datetime.strptime(date_str, "%Y-%m-%d_%H-%M-%S") 99 | if min_dt <= dt <= max_dt: 100 | dir_list.append(folder) 101 | except ValueError: 102 | pass 103 | print(f" Num. 
dirs searched: {num_searched}") 104 | print(f" Num target dirs: {len(dir_list)}") 105 | return dir_list 106 | 107 | 108 | def gsutil_copy( 109 | screcounter_dirs: List[str], dest_dir: str, bucket_name: str, 110 | dry_run: bool=False, force: bool=False 111 | ) -> None: 112 | """ 113 | Use gsutil to copy files from GCP to Chimera. 114 | Args: 115 | screcounter_dirs: A list of GCP bucket directory prefixes. 116 | dest_dir: Destination directory on Chimera. 117 | """ 118 | os.makedirs(dest_dir, exist_ok=True) 119 | 120 | print(f"Copying files to {dest_dir}...", file=sys.stderr) 121 | for src_dir in screcounter_dirs: 122 | src_dir = "gs://" + os.path.join(bucket_name, src_dir) 123 | dest_dir_full = os.path.join(dest_dir, os.path.basename(os.path.dirname(src_dir))) 124 | print(f" Copying {src_dir} to {dest_dir_full}...", file=sys.stderr) 125 | if os.path.exists(dest_dir_full): 126 | msg = f" Destination directory already exists." 127 | if force: 128 | print(f"{msg} Deleting...", file=sys.stderr) 129 | if not dry_run: 130 | rmtree(dest_dir_full) 131 | else: 132 | print(f"{msg} Skipping.", file=sys.stderr) 133 | continue 134 | if not dry_run: 135 | cmd = f"gsutil -m cp -r {src_dir} {dest_dir}" 136 | print(f" CMD: {cmd}", file=sys.stderr) 137 | run(cmd, shell=True, check=True) 138 | 139 | def main(args: argparse.Namespace) -> None: 140 | """ 141 | Main function that: 142 | 1) Parses GCP bucket path. 143 | 2) Lists all SCRECOUNTER directories in the bucket (non-recursive). 144 | 3) Filters directories by date range. 145 | 4) For each target directory, use gsutil to copy files from bucket to Chimera. 146 | Args: 147 | args: An argparse.Namespace holding command-line arguments. 148 | """ 149 | # check if gsutil is installed 150 | if which("gsutil") is None: 151 | print("gsutil is not installed. Please install it first.") 152 | sys.exit(1) 153 | 154 | # Format arg date/time strings 155 | min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 156 | max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 157 | 158 | # Parse GCP bucket path 159 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 160 | 161 | # Initialize GCP client and bucket 162 | client = storage.Client() 163 | bucket = client.bucket(bucket_name) 164 | 165 | # list all SCRECOUNTER directories in the bucket, filtered by date/time range 166 | screcounter_dirs = list_screcounter_directories(bucket, path_prefix, min_dt, max_dt) 167 | 168 | # for each directory, copy files to Chimera 169 | gsutil_copy(screcounter_dirs, args.dest_dir, bucket_name, args.dry_run, args.force) 170 | 171 | 172 | if __name__ == "__main__": 173 | from dotenv import load_dotenv 174 | load_dotenv() 175 | args = parse_args() 176 | main(args) -------------------------------------------------------------------------------- /scripts/search-cloud-run-job-logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import json 4 | import argparse 5 | import subprocess 6 | from typing import Optional 7 | from datetime import datetime, timedelta, timezone 8 | import pytz 9 | import pandas as pd 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | desc = 'Search for logs in Cloud Run Jobs that contain a specific keyword.' 16 | epi = """DESCRIPTION: 17 | Search for logs in Cloud Run Jobs. 
18 | Examples: 19 | $ search-cloud-run-job-logs.py --keyword "ALREADY_EXISTS" 20 | $ search-cloud-run-job-logs.py --content 21 | """ 22 | # default datetime of N day ago 23 | default_datetime = (datetime.now(timezone.utc) - timedelta(days=3)).strftime("%Y-%m-%dT%H:%M:%SZ") 24 | 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 26 | parser.add_argument( 27 | "-k", "--keyword", default=None, help="The keyword to search for in Cloud Run Job logs." 28 | ) 29 | parser.add_argument( 30 | "-p", "--project-id", default="c-tc-429521", help="The Google Cloud project ID." 31 | ) 32 | parser.add_argument( 33 | "-n", "--job-name", default="sc-recounter-run", help="The name of the Cloud Run Job." 34 | ) 35 | parser.add_argument( 36 | "--start-datetime", type=str, default=default_datetime, 37 | help="Start datetime for logs in ISO 8601 format." 38 | ) 39 | parser.add_argument( 40 | "--severity", type=str, default="ERROR", 41 | help="The minimum severity level of the logs to retrieve." 42 | ) 43 | parser.add_argument( 44 | "--content", action="store_true", default=False, 45 | help="Print the content of the logs." 46 | ) 47 | parser.add_argument( 48 | "--limit", type=int, default=None, 49 | help="The total maximum number of logs to retrieve. Use None for unlimited." 50 | ) 51 | return parser.parse_args() 52 | 53 | def convert_time(timestamp: str) -> str: 54 | if timestamp == "Unknown": 55 | return timestamp 56 | try: 57 | gmt_time = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) 58 | pct_timezone = pytz.timezone("America/Los_Angeles") 59 | timestamp = gmt_time.astimezone(pct_timezone) 60 | timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S %Z") 61 | except Exception as e: 62 | timestamp_str = f"Error converting time: {e}" 63 | return timestamp_str 64 | 65 | def find_logs( 66 | project_id: str, 67 | start_datetime: str, 68 | job_name: str="sc-recounter-run", 69 | region: str="us-east1", 70 | keyword: Optional[str]=None, 71 | severity: Optional[str]="ERROR", 72 | limit: int=None, 73 | ) -> None: 74 | """ 75 | Find logs in Cloud Run Jobs 76 | """ 77 | next_page_token = None 78 | logs_retrieved = 0 79 | 80 | # add 8 hours to start_datetime to account for the difference between GMT and PCT 81 | start_datetime = (datetime.fromisoformat(start_datetime) + timedelta(hours=8)).strftime("%Y-%m-%dT%H:%M:%SZ") 82 | 83 | 84 | job_info = [] 85 | while True: 86 | # Construct the gcloud command 87 | query = [ 88 | 'resource.type="cloud_run_job"', 89 | f'resource.labels.job_name="{job_name}"', 90 | f'resource.labels.location="{region}"', 91 | f'timestamp>="{start_datetime}"', 92 | ] 93 | if severity: 94 | query.append(f'severity>={severity}') 95 | if keyword: 96 | query.append(f'textPayload:{keyword}') 97 | query = " AND ".join(query) 98 | cmd = f"gcloud logging read '{query}' --project={project_id} --format=json --limit=1000" 99 | if next_page_token: 100 | cmd += f" --page-token={next_page_token}" 101 | 102 | # Execute the gcloud command 103 | print(f"Executing command: {cmd}", file=sys.stderr) 104 | try: 105 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) 106 | logs = json.loads(result.stdout) 107 | if not logs: 108 | print("No more logs found.") 109 | break 110 | 111 | for log in logs: 112 | job_name = log.get("resource", {}).get("labels", {}).get("job_name", "Unknown") 113 | execution_id = log.get("labels", {}).get("run.googleapis.com/execution_name", "Unknown") 114 | timestamp = convert_time(log.get("timestamp", 
"Unknown")) 115 | #print(f"Job Name: {job_name}, Execution ID: {execution_id}, Timestamp: {timestamp}") 116 | job_info.append([job_name, execution_id, timestamp]) 117 | logs_retrieved += 1 118 | 119 | # Stop if we've reached the limit 120 | if limit and logs_retrieved >= limit: 121 | print(f"Reached the limit of {limit} logs.") 122 | return job_info 123 | 124 | # Check if there's a next page token 125 | next_page_token = result.stderr.split("nextPageToken: ")[-1].strip() if "nextPageToken" in result.stderr else None 126 | if not next_page_token: 127 | break 128 | except subprocess.CalledProcessError as e: 129 | print(f"Error executing gcloud command: {e.stderr.strip()}") 130 | break 131 | except json.JSONDecodeError: 132 | print("Failed to parse the JSON response from gcloud.") 133 | break 134 | return job_info 135 | 136 | def get_content( 137 | log_info: pd.DataFrame, 138 | project_id: str, 139 | ) -> None: 140 | for index, row in log_info.iterrows(): 141 | job_name = row["Job Name"] 142 | execution_id = row["Execution ID"] 143 | query = f'labels."run.googleapis.com/execution_name"="{execution_id}"' 144 | cmd = f"gcloud logging read '{query}' --project={project_id} --format=json" 145 | print(f"Executing command: {cmd}", file=sys.stderr) 146 | try: 147 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) 148 | logs = json.loads(result.stdout) 149 | print(f"#-- Execution ID: {execution_id} --#") 150 | for log in logs: 151 | print(log.get("textPayload", "")) 152 | except subprocess.CalledProcessError as e: 153 | print(f"Error executing gcloud command: {e.stderr.strip()}") 154 | except json.JSONDecodeError: 155 | print("Failed to parse the JSON response from gcloud.") 156 | 157 | if __name__ == "__main__": 158 | args = parse_args() 159 | 160 | # find the logs 161 | log_info = find_logs( 162 | keyword=args.keyword, 163 | project_id=args.project_id, 164 | start_datetime=args.start_datetime, 165 | job_name=args.job_name, 166 | severity=args.severity, 167 | limit=args.limit, 168 | ) 169 | # convert to a pandas dataframe 170 | log_info = pd.DataFrame(log_info, columns=["Job Name", "Execution ID", "Timestamp"]) 171 | 172 | # get the content of the logs or save general log info to a CSV file 173 | if args.content: 174 | # get the content of the logs 175 | get_content(log_info, args.project_id) 176 | else: 177 | # save to a CSV file 178 | log_info.to_csv(sys.stdout, index=False) 179 | 180 | -------------------------------------------------------------------------------- /scripts/purge-srx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | import pandas as pd 7 | from google.cloud import storage 8 | from psycopg2.extensions import connection 9 | from db_utils import db_connect, db_update 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | """ 16 | Parse command-line arguments. 17 | Returns: 18 | argparse.Namespace containing arguments. 19 | """ 20 | desc = 'Purge SRX accessions from the scRecounter system.' 21 | epi = """DESCRIPTION: 22 | Purging: 23 | - Removes SRX records from scRecounter SQL database. 24 | - Removes the SRX directories from the GCP output folder of the scRecounter pipeline. 25 | 26 | Note: only scRecounter is purged, not SRAgent. 
27 | 28 | Examples: 29 | purge-srx.py ERX10024831 ERX10086874 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument('srx_accession', type=str, nargs='+', 33 | help='>=1 SRX accession to purge from the scRecounter system.') 34 | parser.add_argument('--dry-run', action='store_true', default=False, 35 | help='Print actions without executing.') 36 | parser.add_argument('--gcs-dir', type=str, default='gs://arc-ctc-screcounter/prod3/', 37 | help='Base directory in GCP bucket where SCRECOUNTER directories are stored.') 38 | return parser.parse_args() 39 | 40 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 41 | """ 42 | Parse a GCP bucket path. 43 | Args: 44 | gs_path: GCP bucket path starting with gs:// 45 | Returns: 46 | A tuple of (bucket_name, prefix). 47 | """ 48 | if not gs_path.startswith("gs://"): 49 | raise ValueError("Path must start with 'gs://'") 50 | parts = gs_path[5:].split("/", 1) 51 | bucket_name = parts[0] 52 | prefix = parts[1] if len(parts) > 1 else "" 53 | return bucket_name, prefix.rstrip("/") + "/" 54 | 55 | def list_screcounter_directories( 56 | bucket: storage.bucket.Bucket, 57 | prefix: str, 58 | srx_accesions: List[str], 59 | ) -> Dict[str,str]: 60 | """ 61 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket under the given prefix. 62 | Args: 63 | bucket: The GCS bucket object. 64 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 65 | Returns: 66 | A dictionary of {srx_accession: directory_path} for the target SRX accessions. 67 | """ 68 | print(f"Searching for SRX directories...", file=sys.stderr) 69 | srx_dirs = {} 70 | for blob in bucket.list_blobs(prefix=prefix): 71 | blob_dir = os.path.dirname(blob.name) 72 | blob_dir_base = os.path.basename(blob_dir) 73 | blob_dir_parent = os.path.basename(os.path.dirname(blob_dir)) 74 | if blob_dir_parent == "STAR" and blob_dir_base in srx_accesions: 75 | srx_dirs[blob_dir_base] = blob_dir 76 | print(f" Found {len(srx_dirs)} SRX directories", file=sys.stderr) 77 | return srx_dirs 78 | 79 | def purge_accession_tables( 80 | srx_dirs: Dict[str,str], bucket: storage.bucket.Bucket, dry_run: bool=False 81 | ) -> None: 82 | """ 83 | Purge SRX accessions from the accession tables in the GCP bucket. 84 | Args: 85 | srx_dirs: Dictionary of {srx_accession: directory_path} for the target SRX accessions. 86 | bucket: The GCS bucket object. 87 | dry_run: If True, only print actions without executing. 
88 | """ 89 | if len(srx_dirs) == 0: 90 | return None 91 | print(f"Purging accession tables...", file=sys.stderr) 92 | target_parent_dirs = set() 93 | for srx, srx_dir in srx_dirs.items(): 94 | target_parent_dirs.add(os.path.dirname(os.path.dirname(srx_dir))) 95 | 96 | for parent_dir in target_parent_dirs: 97 | for blob in bucket.list_blobs(prefix=parent_dir): 98 | if os.path.basename(blob.name) == "accessions.csv": 99 | # read in accessions file 100 | if not dry_run: 101 | df = pd.read_csv(pd.io.common.StringIO(blob.download_as_text())) 102 | # filter out the SRX accessions 103 | df = df[~df["sample"].isin(srx_dirs.keys())] 104 | # write back to GCP 105 | blob.upload_from_string(df.to_csv(index=False)) 106 | print(f" Purged {blob.name}", file=sys.stderr) 107 | 108 | def delete_srx(srx_accessions: List[str], conn: connection, dry_run: bool=False): 109 | """ 110 | Delete SRX accessions from scRecounter tables 111 | Args: 112 | srx_accessions: list of SRX accessions to delete 113 | conn: database connection 114 | dry_run: if True, only print actions without executing 115 | """ 116 | if len(srx_accessions) == 0: 117 | return None 118 | print("Purging SRX accessions from scRecounter DB tables...", file=sys.stderr) 119 | target_tables = ["screcounter_log", "screcounter_star_params", "screcounter_star_results"] 120 | with db_connect() as conn: 121 | for srx in srx_accessions: 122 | if not dry_run: 123 | for tbl_name in target_tables: 124 | with conn.cursor() as cur: 125 | cur.execute(f"DELETE FROM {tbl_name} WHERE sample = '{srx}'") 126 | conn.commit() 127 | print(f" Deleted: {srx}", file=sys.stderr) 128 | 129 | def delete_srx_star_dirs(srx_dirs: Dict[str,str], bucket: storage.bucket.Bucket, dry_run: bool=False): 130 | """ 131 | Delete SRX directories from the GCP bucket 132 | Args: 133 | srx_dirs: Dictionary of {srx_accession: directory_path} for the target SRX accessions. 134 | bucket: The GCS bucket object. 135 | dry_run: If True, only print actions without executing. 
136 | """ 137 | if len(srx_dirs) == 0: 138 | return None 139 | print(f"Deleting SRX STAR directories...", file=sys.stderr) 140 | for srx_dir in srx_dirs.values(): 141 | print(f" Deleting: {srx_dir}", file=sys.stderr) 142 | if not dry_run: 143 | for blob in bucket.list_blobs(prefix=srx_dir): 144 | blob.delete() 145 | 146 | def main(args: argparse.Namespace) -> None: 147 | """ 148 | - Input: 149 | - >=1 NCBI SRX accession 150 | - Path to GCP directory 151 | - Method 152 | - Recursively search in the GCP directory for folders named the same as the SRX accession 153 | - Delete each target folder 154 | - For each target folder, find the "accessions.csv" file 2 levels up from the target folder and remove the purged SRX rows from it 155 | - Also delete the SRX from all scRecounter tables in the SQL database 156 | """ 157 | print(f"GCP_SQL_DB_NAME: {os.getenv('GCP_SQL_DB_NAME')}", file=sys.stderr) 158 | 159 | # Parse the GCP bucket path 160 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 161 | 162 | # Initialize GCP client and bucket 163 | client = storage.Client() 164 | bucket = client.bucket(bucket_name) 165 | 166 | # Find target SRX directories in the GCP bucket 167 | srx_dirs = list_screcounter_directories(bucket, path_prefix, args.srx_accession) 168 | 169 | # Purge SRX accessions from the accession tables in the GCP bucket 170 | purge_accession_tables(srx_dirs, bucket, dry_run=args.dry_run) 171 | 172 | # Delete SRX STAR directories from the GCP bucket 173 | delete_srx_star_dirs(srx_dirs, bucket, dry_run=args.dry_run) 174 | 175 | # Delete SRX accessions from the scRecounter DB tables 176 | with db_connect() as conn: 177 | delete_srx(args.srx_accession, conn, dry_run=args.dry_run) 178 | 179 | 180 | if __name__ == "__main__": 181 | from dotenv import load_dotenv 182 | load_dotenv() 183 | args = parse_args() 184 | main(args) -------------------------------------------------------------------------------- /scripts/extract-from-result-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | from datetime import datetime 7 | import pandas as pd 8 | from google.cloud import storage 9 | from db_utils import db_connect, db_update 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 13 | pass 14 | 15 | def parse_args() -> argparse.Namespace: 16 | """ 17 | Parse command-line arguments. 18 | Returns: 19 | argparse.Namespace containing arguments. 20 | """ 21 | desc = 'Extract data from STAR results in scRecounter output directory' 22 | epi = """DESCRIPTION: 23 | 24 | """ 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 26 | parser.add_argument('gcs_dir', type=str, 27 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/prod3/)') 28 | parser.add_argument('--min-date-time', type=str, default='2025-01-13_00-00-00', 29 | help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 30 | parser.add_argument('--max-date-time', type=str, default='2025-01-15_00-00-00', 31 | help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 32 | return parser.parse_args() 33 | 34 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 35 | """ 36 | Parse a GCP bucket path. 37 | Args: 38 | gs_path: GCP bucket path starting with gs:// 39 | Returns: 40 | A tuple of (bucket_name, prefix). 
41 | """ 42 | if not gs_path.startswith("gs://"): 43 | raise ValueError("Path must start with 'gs://'") 44 | parts = gs_path[5:].split("/", 1) 45 | bucket_name = parts[0] 46 | prefix = parts[1] if len(parts) > 1 else "" 47 | return bucket_name, prefix.rstrip("/") + "/" 48 | 49 | def list_screcounter_directories( 50 | bucket: storage.bucket.Bucket, 51 | prefix: str, 52 | min_dt: datetime, 53 | max_dt: datetime 54 | ) -> List[str]: 55 | """ 56 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket 57 | under the given prefix, filtered by date/time range. 58 | Args: 59 | bucket: The GCS bucket object. 60 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 61 | min_dt: The minimum datetime (inclusive). 62 | max_dt: The maximum datetime (inclusive). 63 | 64 | Returns: 65 | A list of directory prefixes that fall within the specified date/time range. 66 | """ 67 | dir_list = [] 68 | # Delimiter forces listing top-level folders under prefix 69 | iterator = bucket.list_blobs(prefix=prefix, delimiter='/') 70 | for page in iterator.pages: 71 | for folder in page.prefixes: 72 | folder_name = folder.rstrip('/').split('/')[-1] 73 | # Expecting folder_name like SCRECOUNTER_YYYY-MM-DD_hh-mm-ss 74 | if folder_name.startswith("SCRECOUNTER_"): 75 | try: 76 | date_str = folder_name.replace("SCRECOUNTER_", "") 77 | dt = datetime.strptime(date_str, "%Y-%m-%d_%H-%M-%S") 78 | if min_dt <= dt <= max_dt: 79 | dir_list.append(folder) 80 | except ValueError: 81 | pass 82 | return dir_list 83 | 84 | def find_summary_files( 85 | bucket: storage.bucket.Bucket, 86 | directory_prefix: str 87 | ) -> List[str]: 88 | """ 89 | Recursively find all Summary.csv files within a given SCRECOUNTER directory. 90 | 91 | Args: 92 | bucket: The GCS bucket object. 93 | directory_prefix: The prefix for the specific SCRECOUNTER directory. 94 | 95 | Returns: 96 | A list of blob paths (strings) for Summary.csv files meeting criteria. 97 | """ 98 | valid_parents = {"Velocyto", "GeneFull_ExonOverIntron", "GeneFull_Ex50pAS", "GeneFull", "Gene"} 99 | summary_blobs = [] 100 | for blob in bucket.list_blobs(prefix=directory_prefix): 101 | if blob.name.endswith("Summary.csv"): 102 | # The parent directory is right before the filename in the path 103 | path_parts = blob.name.split('/') 104 | if len(path_parts) > 1: 105 | parent_dir = path_parts[-2] 106 | if parent_dir in valid_parents: 107 | summary_blobs.append(blob.name) 108 | return summary_blobs 109 | 110 | def read_and_merge_summary_files( 111 | bucket: storage.bucket.Bucket, 112 | file_paths: List[str] 113 | ) -> List[pd.DataFrame]: 114 | """ 115 | Read multiple Summary.csv files into dataframes and merge them. 116 | 117 | Args: 118 | bucket: The GCS bucket object. 119 | file_paths: A list of blob paths for Summary.csv files. 120 | 121 | Returns: 122 | A merged pandas DataFrame of all summary data. 
123 | """ 124 | rename_idx = { 125 | "Gene": "gene", 126 | "GeneFull": "gene_full", 127 | "GeneFull_ExonOverIntron": "gene_ex_int", 128 | "GeneFull_Ex50pAS": "gene_ex50", 129 | "Velocyto": "velocyto" 130 | } 131 | 132 | dfs = [] 133 | for path in file_paths: 134 | # read CSV file from GCS 135 | blob = bucket.blob(path) 136 | data_str = blob.download_as_text() 137 | df = pd.read_csv(pd.io.common.StringIO(data_str)) 138 | # format 139 | df.columns = ["Category", "Value"] 140 | df = df[df["Category"] == "Reads With Valid Barcodes"] 141 | df = df.set_index("Category").transpose() 142 | ## add file path info 143 | p = os.path.dirname(path) 144 | df["feature"] = rename_idx[os.path.basename(p)] 145 | df["sample"] = os.path.basename(os.path.dirname(p)) 146 | # add to list 147 | dfs.append(df) 148 | 149 | print("No. of tables: ", len(dfs), file=sys.stderr) 150 | return dfs 151 | 152 | def main(args: argparse.Namespace) -> None: 153 | """ 154 | Main function that: 155 | 1) Parses GCP bucket path. 156 | 2) Lists all SCRECOUNTER directories in the bucket (non-recursive). 157 | 3) Filters directories by date range. 158 | 4) For each directory, recursively searches for 'Summary.csv' files 159 | in allowed parent subdirectories. 160 | 5) Merges summary data and upserts into a database. 161 | 162 | Args: 163 | args: An argparse.Namespace holding command-line arguments. 164 | """ 165 | # Format arg date/time strings 166 | min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 167 | max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 168 | 169 | # Parse GCP bucket path 170 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 171 | 172 | # Initialize GCP client and bucket 173 | client = storage.Client() 174 | bucket = client.bucket(bucket_name) 175 | 176 | # list all SCRECOUNTER directories in the bucket, filtered by date/time range 177 | screcounter_dirs = list_screcounter_directories(bucket, path_prefix, min_dt, max_dt) 178 | 179 | # for each directory, find and merge Summary.csv files 180 | merged_df = [] 181 | for directory in screcounter_dirs: 182 | print(f"Processing directory: {directory}", file=sys.stderr) 183 | summary_paths = find_summary_files(bucket, directory) 184 | if summary_paths: 185 | merged_df += read_and_merge_summary_files(bucket, summary_paths) 186 | # check if any valid data was found before concatenating 187 | if len(merged_df) == 0: 188 | print("No valid data found.", file=sys.stderr) 189 | return None 190 | 191 | # concat all dataframes 192 | merged_df = pd.concat(merged_df, ignore_index=True).rename( 193 | columns={"Reads With Valid Barcodes": "reads_with_valid_barcodes"} 194 | ) 195 | 196 | print(f"No. 
of records found: {merged_df.shape[0]}", file=sys.stderr) 197 | 198 | # Upsert data into database 199 | print("Updating data...", file=sys.stderr) 200 | with db_connect() as conn: 201 | db_update(merged_df, "screcounter_star_results", conn) 202 | 203 | 204 | if __name__ == "__main__": 205 | from dotenv import load_dotenv 206 | load_dotenv() 207 | args = parse_args() 208 | main(args) -------------------------------------------------------------------------------- /docker/sc-recounter-run/cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import argparse 4 | from typing import Tuple, List, Dict 5 | import pandas as pd 6 | from google.cloud import storage 7 | from db_utils import db_connect, db_upsert 8 | 9 | # argparse 10 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 11 | argparse.RawDescriptionHelpFormatter): 12 | pass 13 | 14 | desc = 'Clean up after a scRecounter production run' 15 | epi = """DESCRIPTION: 16 | Examples: 17 | python cleanup.py gs://arc-ctc-nextflow/scRecounter/prod/work/SCRECOUNTER_2025-01-06_15-46-04/ gs://arc-ctc-screcounter/prod/SCRECOUNTER_2025-01-06_15-46-04/ 18 | """ 19 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 20 | formatter_class=CustomFormatter) 21 | parser.add_argument( 22 | 'work_dir', type=str, 23 | help='GCP bucket path to work directory (e.g., gs://bucket-name/path/to/folder)' 24 | ) 25 | parser.add_argument( 26 | 'output_dir', type=str, 27 | help='GCP bucket path to output directory (e.g., gs://bucket-name/path/to/folder)' 28 | ) 29 | 30 | 31 | # functions 32 | def list_bucket_contents(bucket_name: str, prefix: str) -> Tuple[List[str], Dict[str, int]]: 33 | """ 34 | List directories and files in a GCP bucket path 35 | Args: 36 | bucket_name: GCP bucket name 37 | prefix: GCP bucket prefix 38 | Returns: 39 | Tuple of (directories list, files list) in the bucket path 40 | """ 41 | client = storage.Client() 42 | bucket = client.bucket(bucket_name) 43 | blobs = bucket.list_blobs(prefix=prefix, delimiter='/') 44 | 45 | directories = [] 46 | files = {} 47 | 48 | # Get directories (prefixes) 49 | for page in blobs.pages: 50 | directories.extend(page.prefixes) 51 | directories = [d.rstrip('/') for d in directories] 52 | 53 | # Create a new iterator for blobs 54 | blobs = bucket.list_blobs(prefix=prefix, delimiter='/') 55 | 56 | # Get files 57 | for blob in blobs: 58 | if not blob.name.endswith('/'): # Skip directory markers 59 | num_rows = 0 60 | if blob.name.split('/')[-1] == "accessions.csv": 61 | # get the number of rows 62 | blob.download_to_filename('/tmp/accessions.csv') 63 | with open('/tmp/accessions.csv', 'r') as f: 64 | num_rows = pd.read_csv(f).shape[0] 65 | os.remove('/tmp/accessions.csv') 66 | files[os.path.basename(blob.name)] = num_rows 67 | return directories, files 68 | 69 | def delete_bucket_path(bucket_name: str, path: str) -> None: 70 | """ 71 | Delete all objects in a GCP bucket path 72 | Args: 73 | bucket_name: GCP bucket name 74 | """ 75 | client = storage.Client() 76 | bucket = client.bucket(bucket_name) 77 | blobs = bucket.list_blobs(prefix=path) 78 | 79 | for blob in blobs: 80 | blob.delete() 81 | 82 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 83 | """ 84 | Parse a GCP bucket path 85 | Args: 86 | gs_path: GCP bucket path 87 | Returns: 88 | Tuple of bucket name and prefix 89 | """ 90 | if not gs_path.startswith("gs://"): 91 | raise ValueError("Path must start with 'gs://'") 92 | parts = 
gs_path[5:].split("/", 1) 93 | bucket_name = parts[0] 94 | prefix = parts[1] if len(parts) > 1 else "" 95 | return bucket_name, prefix.rstrip("/") + "/" 96 | 97 | def clean_output_dir(output_dir: str) -> None: 98 | """ 99 | Delete the contents of the output directory, 100 | if it only contains 'nf-report', 'nf-trace'. 101 | Args: 102 | output_dir: GCP bucket path to output directory 103 | """ 104 | # parse the bucket path 105 | bucket_name, path_prefix = parse_gs_path(output_dir) 106 | 107 | # list directories in the bucket path 108 | directories,files = list_bucket_contents(bucket_name, path_prefix) 109 | directories = [os.path.basename(d) for d in directories] 110 | print(f"Directories found: {', '.join(directories)}") 111 | files_basename = [os.path.basename(f) for f in files] 112 | print(f"Files found: {', '.join(files_basename)}") 113 | 114 | # if accessions.csv in the directory, get the number of lines 115 | if files.get("accessions.csv") == 0: 116 | print("No accessions found. Deleting the bucket path...") 117 | delete_bucket_path(bucket_name, path_prefix) 118 | print(f"Deleted path: {output_dir}") 119 | elif set(directories).issubset({"nf-report", "nf-trace"}): 120 | print("Just Nextflow report and/or trace found. Deleting the bucket path...") 121 | delete_bucket_path(bucket_name, path_prefix) 122 | print(f"Deleted path: {output_dir}") 123 | else: 124 | print("Bucket path contains pipeline results. No deletion performed.") 125 | 126 | def clean_work_dir(work_dir: str) -> None: 127 | """ 128 | Delete the contents of the work directory 129 | Args: 130 | work_dir: GCP bucket path to work directory 131 | """ 132 | # parse the bucket path 133 | bucket_name, path_prefix = parse_gs_path(work_dir) 134 | 135 | print("Deleting the contents of the working directory...") 136 | delete_bucket_path(bucket_name, path_prefix) 137 | print(f"Deleted path: {work_dir}") 138 | 139 | def download_gcs_file( 140 | bucket_name: str, gcs_file_path: str, local_file_path: str="/tmp/temp_file.tsv" 141 | ) -> str: 142 | """ 143 | Download a file from a GCP bucket to a local file 144 | Args: 145 | bucket_name: GCP bucket name 146 | gcs_file_path: GCP bucket path to the file 147 | local_file_path: Local file path 148 | Returns: 149 | Local file path 150 | """ 151 | client = storage.Client() 152 | bucket = client.bucket(bucket_name) 153 | blob = bucket.blob(gcs_file_path) 154 | blob.download_to_filename(local_file_path) 155 | return local_file_path 156 | 157 | def upload_trace(output_dir: str) -> None: 158 | """ 159 | Upload the trace file to the screcounter db 160 | Args: 161 | output_dir: GCP bucket path to output directory 162 | """ 163 | # does nf-trace directory exists in gcp bucket location? 
164 | bucket_name, path_prefix = parse_gs_path(output_dir) 165 | directories,files = list_bucket_contents(bucket_name, path_prefix) 166 | directories = [os.path.basename(d) for d in directories] 167 | 168 | if "nf-trace" in directories: 169 | # list the files in nf-trace directory 170 | trace_dir = path_prefix + "nf-trace/" 171 | trace_files = list_bucket_contents(bucket_name, trace_dir)[1] 172 | # get the most recent based on the name 173 | trace_file = sorted(list(trace_files.keys()))[-1] 174 | # read the trace file as a pandas dataframe 175 | trace_file_path = os.path.join(trace_dir, trace_file) 176 | # read from gcp 177 | local_file_path = download_gcs_file(bucket_name, trace_file_path) 178 | # read the file 179 | if not os.path.exists(local_file_path): 180 | print(f"File not found: {local_file_path}") 181 | return None 182 | trace_df = pd.read_csv(local_file_path, sep="\t") 183 | # remove the local file 184 | os.remove(local_file_path) 185 | # format 186 | ## convert exit column to character 187 | if "exit" in trace_df.columns: 188 | trace_df["exit"] = trace_df["exit"].astype(str) 189 | ## remove second "submit" column 190 | if "submit.1" in trace_df.columns: 191 | trace_df.drop(columns=["submit.1"], inplace=True) 192 | ## rename "%cpu" to cpu_percent 193 | if r"%cpu" in trace_df.columns: 194 | trace_df.rename(columns={r"%cpu": "cpu_percent"}, inplace=True) 195 | # upsert 196 | with db_connect() as conn: 197 | db_upsert(trace_df, "screcounter_trace", conn) 198 | # status update 199 | print(f"Uploaded trace file to screcounter db: {trace_file}") 200 | else: 201 | print("No nf-trace directory found. Skipping trace file db upload.") 202 | 203 | def main(args): 204 | # clean up the work and output directories 205 | clean_work_dir(args.work_dir) 206 | clean_output_dir(args.output_dir) 207 | # upload the trace file to the screcounter db 208 | upload_trace(args.output_dir) 209 | 210 | 211 | if __name__ == "__main__": 212 | args = parser.parse_args() 213 | main(args) -------------------------------------------------------------------------------- /bin/prefetch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from typing import Tuple, Optional 10 | from time import sleep 11 | from shutil import which 12 | from subprocess import Popen, PIPE 13 | import pandas as pd 14 | from db_utils import db_connect, db_upsert, add_to_log 15 | 16 | # logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # argparse 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 21 | argparse.RawDescriptionHelpFormatter): 22 | pass 23 | 24 | desc = 'Run sra-tools prefetch' 25 | epi = """DESCRIPTION: 26 | Run sra-tools prefetch with handling of errors 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('accession', type=str, help='SRA accession') 31 | parser.add_argument('--outdir', type=str, default='prefetch_out', 32 | help='Output directory') 33 | parser.add_argument('--max-size-gb', type=int, default=1000, 34 | help='Max file size in Gb') 35 | parser.add_argument('--tries', type=int, default=3, 36 | help='Number of tries to download') 37 | parser.add_argument('--sample', type=str, default="", 38 | help='Sample name') 39 | parser.add_argument('--gcp-download', 
action='store_true', default=False, 40 | help='Obtain sequence data from SRA GCP mirror') 41 | 42 | # functions 43 | def run_cmd(cmd: str) -> Tuple[int,bytes,bytes]: 44 | """ 45 | Run sub-command and return returncode, output, and error. 46 | Args: 47 | cmd: Command to run 48 | Returns: 49 | (returncode, output, error) 50 | """ 51 | logging.info(f'Running: {cmd}') 52 | p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) 53 | output, err = p.communicate() 54 | return p.returncode, output, err 55 | 56 | def run_vdb_config() -> Tuple[str,str]: 57 | """ 58 | Run vdb-config with error handling. 59 | Returns: 60 | Status and message 61 | """ 62 | cmd = f"vdb-config --report-cloud-identity yes" 63 | rc,output,err = run_cmd(cmd) 64 | if rc != 0: 65 | logging.warning('vdb-config failed') 66 | logging.warning(err) 67 | return "Failure",f'vdb-config failed: {err}' 68 | return "Success","vdb-config successful" 69 | 70 | def prefetch(accession: str, tries: int, max_size_gb: int, outdir: str) -> Tuple[str,str]: 71 | """ 72 | Run prefetch with error handling. 73 | Args: 74 | accession: SRA accession 75 | tries: Number of tries 76 | max_size_gb: Max file size in Gb 77 | outdir: Output directory 78 | Returns: 79 | Status and message 80 | """ 81 | logging.info(f"Downloading {accession}") 82 | cmd = f"prefetch --max-size {max_size_gb}G --output-directory {outdir} {accession}" 83 | err = "" 84 | for i in range(tries): 85 | logging.info(f"Attempt: {i+1}/{tries}") 86 | rc,output,err = run_cmd(cmd) 87 | if rc == 0: 88 | logging.info("Download successful") 89 | # run vdb-validate 90 | sra_dir = os.path.join(outdir, accession) 91 | rc,output,err = run_cmd(f"vdb-validate {sra_dir}") 92 | if rc == 0: 93 | logging.info("Validation successful") 94 | return "Success","Download and validation successful" 95 | else: 96 | logging.warning("Validation failed") 97 | logging.warning(err) 98 | else: 99 | logging.warning("Download failed") 100 | logging.warning(err) 101 | # sleep prior to next attempt 102 | sleep_time = 20 * (i + 1) 103 | logging.info(f"Sleeping for {sleep_time} seconds...") 104 | sleep(sleep_time) 105 | # assume failure 106 | err = err.decode().replace('\n', ' ') 107 | return "Failure",f"Failed to download and validate: {err}" 108 | 109 | def run_vdb_dump(accession: str, min_size: int=1e6) -> Tuple[str,str]: 110 | """ 111 | Run vdb-dump with error handling. 
112 | Args: 113 | accession: SRA accession 114 | min_size: Minimum acceptable file size in bytes 115 | Returns: 116 | Status and message 117 | """ 118 | cmd = f"vdb-dump --info {accession}" 119 | rc,output,err = run_cmd(cmd) 120 | if rc != 0: 121 | logging.warning("Dump failed") 122 | logging.warning(err) 123 | return "Failure",f'vdb-dump failed: {err}' 124 | 125 | # parse the output 126 | regex = re.compile(r' *: ') 127 | data = {} 128 | for line in output.decode().split('\n'): 129 | line = regex.split(line.rstrip(), 1) 130 | if len(line) < 2: 131 | continue 132 | data[line[0]] = line[1] 133 | 134 | # checks 135 | ## keys 136 | for x in ['acc', 'size', 'FMT', 'platf']: 137 | if x not in data: 138 | return "Failure",f'Missing key in vdb-dump output: {x}' 139 | ## accession 140 | if data['acc'] != accession: 141 | return "Failure",f'Accession mismatch: {data["acc"]} != {accession}' 142 | ## size 143 | size = int(data['size'].replace(',', '')) 144 | if size < min_size: 145 | return "Failure",f'File size too small: {size} < {min_size}' 146 | ## format 147 | #fmt = data['FMT'].lower() 148 | #if 'fastq' not in fmt and fmt not in ['sharq', 'sralite', 'sra']: 149 | # return "Failure",f'Invalid format: {data["FMT"]}' 150 | ## platform 151 | if 'illumina' not in data['platf'].lower(): 152 | return "Failure",f'Invalid platform: {data["platf"]}' 153 | # all checks passed 154 | return "Success","Validation successful" 155 | 156 | def write_log(logF, sample: str, accession: str, step: str, msg: str) -> None: 157 | """ 158 | Write log to file. 159 | Args: 160 | logF: Log file handle 161 | sample: Sample name 162 | accession: SRA accession 163 | step: Step name 164 | msg: Message 165 | """ 166 | if len(msg) > 100: 167 | msg = msg[:100] + '...' 168 | logF.write(','.join([sample, accession, step, msg]) + '\n') 169 | 170 | def prefetch_workflow(sample: str, accession: str, log_df: pd.DataFrame, outdir:str, 171 | gcp_download: bool=False, tries: int=3, max_size_gb: float=1000) -> Optional[str]: 172 | """ 173 | Run prefetch workflow. 
174 | Args: 175 | sample: Sample name 176 | accession: SRA accession 177 | log_df: Log dataframe 178 | outdir: Output directory 179 | gcp_download: Use GCP mirror 180 | tries: Number of tries 181 | max_size_gb: Max file size in Gb 182 | """ 183 | # check for prefetch in path 184 | for exe in ['prefetch', 'vdb-dump']: 185 | if not which(exe): 186 | logging.error(f'{exe} not found in PATH') 187 | sys.exit(1) 188 | 189 | # run vdb-config 190 | if gcp_download: 191 | status,msg = run_vdb_config() 192 | add_to_log(log_df, sample, accession, "prefetch", "vdb-config", status, msg) 193 | 194 | # run vdb-dump 195 | status,msg = run_vdb_dump(accession) 196 | add_to_log(log_df, sample, accession, "prefetch", "vdb-dump", status, msg) 197 | if status != "Success": 198 | logging.warning(f'vdb-dump validation failed: {msg}') 199 | return None 200 | 201 | # run prefetch 202 | status,msg = prefetch(accession, tries, max_size_gb, outdir) 203 | add_to_log(log_df, sample, accession, "prefetch", "prefetch", status, msg) 204 | if status != "Success": 205 | logging.warning(f'Failed to download: {msg}') 206 | return None 207 | 208 | # print output file size 209 | sra_file = os.path.join(outdir, accession) 210 | if not os.path.exists(sra_file): 211 | logging.warning(f'File not found: {sra_file}') 212 | return None 213 | file_size = os.path.getsize(sra_file) 214 | logging.info(f"SRA file size: {file_size / 1e9:.3f} GB") 215 | 216 | # return output file 217 | return sra_file 218 | 219 | ## script main 220 | if __name__ == '__main__': 221 | # arg parse 222 | args = parser.parse_args() 223 | 224 | # setup 225 | os.makedirs(args.outdir, exist_ok=True) 226 | log_df = pd.DataFrame( 227 | columns=["sample", "accession", "process", "step", "status", "message"] 228 | ) 229 | 230 | # run workflow 231 | prefetch_workflow( 232 | args.sample, args.accession, log_df, 233 | outdir=args.outdir, 234 | gcp_download=args.gcp_download, 235 | tries=args.tries, 236 | max_size_gb=args.max_size_gb 237 | ) 238 | 239 | # write log 240 | log_file = os.path.join(args.outdir, "prefetch_log.csv") 241 | log_df.to_csv(log_file, index=False) 242 | logging.info(f'Log written to: {log_file}') 243 | 244 | # upsert log to database 245 | with db_connect() as conn: 246 | db_upsert(log_df, "screcounter_log", conn) 247 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/find-mtx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | from pathlib import Path 8 | from itertools import chain, repeat 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | import tiledbsoma 13 | import tiledbsoma.io 14 | import scanpy as sc 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 19 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 20 | logging.getLogger("tiledb").setLevel(logging.WARNING) 21 | 22 | # classes 23 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 24 | pass 25 | 26 | # functions 27 | def parse_arguments() -> argparse.Namespace: 28 | """ 29 | Parse command-line arguments. 30 | """ 31 | desc = 'Find scRNA-seq count matrix files for TileDB loader.' 
32 | epi = """DESCRIPTION: 33 | """ 34 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 35 | parser.add_argument( 36 | 'base_dir', type=str, help='Base directory to search for input data files' 37 | ) 38 | parser.add_argument( 39 | '--feature-type', default='GeneFull_Ex50pAS', 40 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto', None], 41 | help='Feature type to process' 42 | ) 43 | parser.add_argument( 44 | '--raw', action='store_true', default=False, 45 | help='Use raw count matrix files instead of filtered' 46 | ) 47 | parser.add_argument( 48 | '--db-uri', type=str, default="tiledb_exp", 49 | help='URI of existing TileDB database, or it will be created if it does not exist' 50 | ) 51 | parser.add_argument( 52 | '--batch-size', type=int, default=8, help='batch size for downstream processing' 53 | ) 54 | parser.add_argument( 55 | '--max-datasets', type=int, default=None, 56 | help='Maximum number of datasets to process' 57 | ) 58 | parser.add_argument( # TODO: implement => https://github.com/alexdobin/STAR/blob/master/extras/scripts/soloBasicCellFilter.awk 59 | '--multi-mapper', default='None', choices=['None', 'EM', 'uniform'], 60 | help='Multi-mapper strategy to use' 61 | ) 62 | return parser.parse_args() 63 | 64 | def get_existing_srx_ids(db_uri: str) -> Set[str]: 65 | """ 66 | Read metadata from existing database and return set of SRX IDs. 67 | Args: 68 | db_uri: URI of the TileDB database 69 | Returns: 70 | Set of SRX IDs already in the database 71 | """ 72 | logging.info(f"Checking for existing SRX accessions in {db_uri}...") 73 | 74 | srx = set() 75 | if not os.path.exists(db_uri): 76 | logging.info("Database does not exist yet. No SRX/ERX accessions to obtain.") 77 | else: 78 | with tiledbsoma.open(db_uri) as exp: 79 | try: 80 | metadata = (exp.obs.read(column_names=["SRX_accession"]) 81 | .concat() 82 | .group_by(["SRX_accession"]) 83 | .aggregate([ 84 | ([], 'count_all'), 85 | ]) 86 | .to_pandas()) 87 | srx = set(metadata["SRX_accession"].unique()) 88 | except tiledbsoma._exception.DoesNotExistError: 89 | metadata = (exp.obs.read(column_names=["SRX_accession"]) 90 | .concat() 91 | .to_pandas()) 92 | srx = set(metadata["SRX_accession"].unique()) 93 | # status 94 | logging.info(f" Found {len(srx)} existing SRX/ERX accessions.") 95 | return srx 96 | 97 | def find_matrix_files( 98 | base_dir: str, 99 | feature_type: str, 100 | existing_srx: Set[str], 101 | multi_mapper: str='None', 102 | raw: bool=False, 103 | max_datasets: Optional[int]=None 104 | ) -> List[tuple]: 105 | """ 106 | Recursively find matrix.mtx.gz files and extract SRX/ERX IDs. 
107 | Args: 108 | base_dir: Base directory to search 109 | feature_type: 'Gene' or 'GeneFull' 110 | existing_srx: Set of existing SRX IDs 111 | multi_mapper: 'EM', 'uniform', or 'None' 112 | raw: Use raw count matrix files instead of filtered 113 | max_datasets: Maximum number of datasets to process 114 | Returns: 115 | List of tuples (matrix_path, srx_id) 116 | """ 117 | logging.info(f"Searching for new data files in {base_dir}...") 118 | base_path = Path(base_dir) 119 | subdir = 'raw' if raw else 'filtered' 120 | results = [] 121 | stats = {'found': 0, 'exists': 0, 'permissions': 0, 'mtx_file_missing': 0, 'novel': 0} 122 | 123 | # Determine which matrix file to look for based on multi_mapper 124 | if multi_mapper == 'None': 125 | matrix_filename = 'matrix.mtx.gz' 126 | elif multi_mapper == 'EM': 127 | matrix_filename = 'UniqueAndMult-EM.mtx.gz' 128 | elif multi_mapper == 'uniform': 129 | matrix_filename = 'UniqueAndMult-Uniform.mtx.gz' 130 | else: 131 | raise ValueError(f"Invalid multi-mapper strategy: {multi_mapper}") 132 | 133 | # Walk through directory structure 134 | num_dirs = 0 135 | for srx_dir in chain(base_path.glob('**/SRX*'), base_path.glob('**/ERX*')): 136 | # skip files 137 | if not srx_dir.is_dir(): 138 | continue 139 | else: 140 | stats['found'] += 1 141 | 142 | # status 143 | num_dirs += 1 144 | if num_dirs % 1000 == 0: 145 | logging.info(f" Searched {num_dirs} SRX directories so far...") 146 | 147 | # Check if SRX directory exists in database 148 | if srx_dir.name in existing_srx: 149 | stats['exists'] += 1 150 | continue 151 | 152 | # Find target matrix file in SRX directory 153 | for mtx_file in srx_dir.glob(f'**/{matrix_filename}'): 154 | hit = None 155 | # check for `feature_type/subdir` in file path 156 | for i,x in enumerate(mtx_file.parts): 157 | try: 158 | if feature_type in x and mtx_file.parts[i+1] == subdir: 159 | hit = True 160 | break 161 | except IndexError: 162 | continue 163 | # if target file found, check if it exists, and add to results 164 | if hit: 165 | try: 166 | if not mtx_file.exists(): 167 | stats['mtx_file_missing'] += 1 168 | else: 169 | stats['novel'] += 1 170 | results.append([mtx_file, srx_dir.name]) 171 | except PermissionError: 172 | logging.warning(f"Permission denied for {mtx_file}. Skipping.") 173 | stats['permissions'] += 1 174 | break 175 | 176 | # Check max datasets 177 | if max_datasets and len(results) >= max_datasets: 178 | logging.info(f" Found --max-datasets datasets. Stopping search.") 179 | break 180 | 181 | # Status 182 | logging.info(f" {stats['found']} total SRX directories found (total).") 183 | logging.info(f" {stats['exists']} existing SRX directories found (skipped).") 184 | logging.info(f" {stats['mtx_file_missing']} missing matrix files (skipped).") 185 | logging.info(f" {stats['permissions']} directories with permission errors (skipped).") 186 | logging.info(f" {stats['novel']} novel SRX directories found (final).") 187 | return results 188 | 189 | def make_batch(num_repeats: int, total_numbers: int) -> List[int]: 190 | """ 191 | Bin numbers into batches of num_repeats. 
192 | Args: 193 | num_repeats: Number of repeats per unique number 194 | total_numbers: Total number of unique numbers 195 | Returns: 196 | List of batch numbers 197 | """ 198 | batch_counts = [] 199 | unique_count = int(round(total_numbers / num_repeats + 0.5)) 200 | for i in range(1, unique_count + 1): 201 | batch_counts.extend(repeat(i, num_repeats)) 202 | return batch_counts[:total_numbers] 203 | 204 | def main(): 205 | """Main function to run the TileDB loader workflow.""" 206 | args = parse_arguments() 207 | 208 | # Get existing SRX IDs 209 | existing_srx = get_existing_srx_ids(args.db_uri) 210 | 211 | # Find all matrix files and their corresponding SRX IDs 212 | matrix_files = find_matrix_files( 213 | args.base_dir, args.feature_type, existing_srx, 214 | multi_mapper=args.multi_mapper, 215 | raw=args.raw, 216 | max_datasets=args.max_datasets 217 | ) 218 | 219 | # write as csv 220 | df = pd.DataFrame(matrix_files, columns=['matrix_path', 'srx']) 221 | df["batch"] = make_batch(args.batch_size, df.shape[0]) 222 | df.to_csv('mtx_files.csv', index=False) 223 | logging.info(f"File written: mtx_files.csv") 224 | 225 | if __name__ == "__main__": 226 | from dotenv import load_dotenv 227 | load_dotenv(override=True) 228 | main() -------------------------------------------------------------------------------- /scripts/acc2srr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | import os 4 | import io 5 | import csv 6 | import sys 7 | import argparse 8 | from time import sleep 9 | from typing import List, Dict 10 | from urllib.error import HTTPError 11 | from dotenv import load_dotenv 12 | import pandas as pd 13 | from Bio import Entrez 14 | from pysradb.sraweb import SRAweb 15 | 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Convert accessions to SRR accessions' 23 | epi = """DESCRIPTION: 24 | Convert SRP, GSE, or other accessions to SRR accessions. 25 | If NCBI_API_KEY is set in the environment, it will be used as the API key. 
26 | """ 27 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 28 | formatter_class=CustomFormatter) 29 | parser.add_argument('accession_file', type=str, 30 | help='Text file with accessions; 1 per line') 31 | parser.add_argument('--email', type=str, default=None, 32 | help='Email address for Entrez') 33 | parser.add_argument('--batch-size', type=int, default=50, 34 | help='Batch size for fetching') 35 | parser.add_argument('--outfile', type=str, default='srr_accessions.csv', 36 | help='Output file name') 37 | 38 | # functions 39 | def load_accessions(accession_file: str) -> List[str]: 40 | """ 41 | Load accessions from file 42 | Args: 43 | accession_file: File with accessions 44 | Returns: 45 | List of accessions 46 | """ 47 | accessions = [] 48 | with open(accession_file) as inF: 49 | for line in inF: 50 | line = line.strip().split(',')[0] 51 | if line == "" or line.startswith("#"): 52 | continue 53 | accessions.append(line) 54 | return accessions 55 | 56 | def esearch_batch(db, accession, batch_size = 50, ntries=3, sleep_time=5) -> List[str]: 57 | """ 58 | Entrez esearch in batches 59 | Args: 60 | db: Database to search 61 | accession: Accession to search 62 | batch_size: Batch size for fetching 63 | ntries: Number of tries before giving up 64 | sleep_time: Sleep time between retries 65 | Returns: 66 | List of unique IDs 67 | """ 68 | print(f"esearch of {db} for: {accession}", file=sys.stderr) 69 | results = [] 70 | # Initial search to get the total count of records 71 | handle = Entrez.esearch(db=db, term=accession, usehistory="y", retmax=1) 72 | record = Entrez.read(handle) 73 | handle.close() 74 | results += record["IdList"] if record["IdList"] else [] 75 | total_records = int(record["Count"]) 76 | print(f" Total records: {total_records}", file=sys.stderr) 77 | 78 | # Retrieve results in batches 79 | for start in range(0, total_records, batch_size): 80 | print(f" Fetching records {start+1}-{min(start+batch_size, total_records)}", file=sys.stderr) 81 | for i in range(ntries): 82 | try: 83 | handle = Entrez.esearch(db=db, term=accession, retstart=start, retmax=batch_size, usehistory="y") 84 | record = Entrez.read(handle) 85 | handle.close() 86 | if "IdList" in record: 87 | results += record["IdList"] 88 | sleep(0.5) # comply with NCBI rate limits 89 | except Exception as e: 90 | print(f" Attempt {i+1}/{ntries}: Error encountered: {e}", file=sys.stderr) 91 | sleep(sleep_time * (i+1)) 92 | 93 | # Return unique IDs 94 | return list(set(results)) 95 | 96 | def efetch_batch(db, idlist, batch_size=20, rettype="runinfo", retmode="text", ntries=3, sleep_time=5 97 | ) -> List[pd.DataFrame]: 98 | """ 99 | Entrez efetch in batches 100 | Args: 101 | db: Database to search 102 | idlist: List of IDs to fetch 103 | batch_size: Batch size for fetching 104 | rettype: Return type 105 | retmode: Return mode 106 | ntries: Number of tries before giving up 107 | sleep_time: Sleep time between retries 108 | Returns: 109 | List of dataframes 110 | """ 111 | print(f"efetch of {db} for: {len(idlist)} IDs", file=sys.stderr) 112 | results = [] 113 | for start in range(0, len(idlist), batch_size): 114 | print(f" Fetching batch {start+1}-{min(start+batch_size, len(idlist))}", file=sys.stderr) 115 | batch_ids = ",".join(idlist[start:start + batch_size]) # Get current batch of IDs 116 | batch_result = None 117 | for i in range(ntries): # Retry logic for each batch 118 | try: 119 | handle = Entrez.efetch(db=db, id=batch_ids, rettype=rettype, retmode=retmode) 120 | batch_result = handle.read() 121 | 
handle.close() 122 | # convert to dataframe 123 | df = pd.read_csv(io.StringIO(batch_result.decode('utf-8'))) 124 | results.append(df) 125 | sleep(0.5) # comply with NCBI rate limits 126 | break # Exit retry loop on success 127 | except HTTPError as e: 128 | print(f" Attempt {i+1}/{ntries}: HTTPError for batch {start}-{start+batch_size}: {e}", file=sys.stderr) 129 | sleep(sleep_time * (i + 1)) # Progressive wait time before retry 130 | continue 131 | if batch_result is None: 132 | print(f" Failed to fetch batch {start}-{start+batch_size}", file=sys.stderr) 133 | return results 134 | 135 | def fetch_srr_from_srp(accession, batch_size=50, ntries=3, sleep_time=5) -> pd.DataFrame: 136 | """ 137 | Fetch SRR accessions from SRP 138 | Args: 139 | accession: SRP accession 140 | batch_size: Batch size for fetching 141 | ntries: Number of tries before giving up 142 | sleep_time: Sleep time between retries 143 | Returns: 144 | Dataframe with SRR accessions 145 | """ 146 | # Search the SRA database for the SRP accession 147 | idlist = esearch_batch("sra", accession, batch_size=batch_size, ntries=ntries, sleep_time=sleep_time) 148 | # get IDs from record 149 | if len(idlist) == 0: 150 | print(f"No records found for accession: {accession}", file=sys.stderr) 151 | return [] 152 | # Fetch run info to get SRR accessions 153 | results = efetch_batch("sra", idlist, batch_size=batch_size, ntries=ntries, sleep_time=sleep_time) 154 | # concat dataframes 155 | df = pd.concat(results) 156 | # return specific columns 157 | to_keep = [ 158 | "Sample", "Run", "Experiment", "SRAStudy", "BioProject", 159 | "spots", "spots_with_mates", "avgLength", "size_MB" 160 | ] 161 | df = df[to_keep].rename(columns={ 162 | "Sample" : "sample", 163 | "Run" : "accession", 164 | "Experiment" : "experiment", 165 | "SRAStudy" : "sra_study", 166 | "BioProject" : "bioproject", 167 | "avgLength" : "avg_length", 168 | "size_MB" : "size_mb" 169 | }) 170 | # getting just unique for for "accession" 171 | return df.drop_duplicates(subset=["accession"]) 172 | 173 | def gse_to_srp(accession: str) -> str: 174 | """ 175 | Use pysradb to convert GSE to SRP 176 | Args: 177 | accession: GSE accession 178 | Returns: 179 | SRP accession 180 | """ 181 | sradb = SRAweb() 182 | df = sradb.gse_to_srp( 183 | [accession], 184 | detailed=False, 185 | sample_attribute=False, 186 | expand_sample_attributes=False, 187 | ) 188 | srp_accession = df["study_accession"].tolist()[0] 189 | print(f"Converted GSE to SRP: {srp_accession}", file=sys.stderr) 190 | return srp_accession 191 | 192 | def gsm_to_srp(accession: str) -> str: 193 | """ 194 | Use pysradb to convert GSM to SRP 195 | Args: 196 | accession: GSM accession 197 | Returns: 198 | SRP accession 199 | """ 200 | sradb = SRAweb() 201 | df = sradb.gsm_to_srp( 202 | [accession], 203 | detailed=False, 204 | sample_attribute=False, 205 | expand_sample_attributes=False, 206 | ) 207 | srp_accession = df["study_accession"].tolist()[0] 208 | print(f"Converted GSM to SRP: {srp_accession}", file=sys.stderr) 209 | return srp_accession 210 | 211 | def convert_to_srp(accession: str) -> str: 212 | """ 213 | Convert GSE or GSM to SRP 214 | Args: 215 | accession: GSE or GSM accession 216 | Returns: 217 | SRP accession 218 | """ 219 | if accession.startswith('GSE'): 220 | try: 221 | return gse_to_srp(accession) 222 | except Exception as e: 223 | print(f"Error converting GSE to SRP: {e}", file=sys.stderr) 224 | return None 225 | elif accession.startswith('GSM'): 226 | try: 227 | return gsm_to_srp(accession) 228 | except 
Exception as e: 229 | print(f"Error converting GSM to SRP: {e}", file=sys.stderr) 230 | return None 231 | else: 232 | print(f"Accession type not recognized: {accession}", file=sys.stderr) 233 | return None 234 | 235 | def fetch_srr_from_accession(accession: str, batch_size: int) -> List[pd.DataFrame]: 236 | """ 237 | Fetch SRR accessions from SRP or GSE 238 | Args: 239 | accession: SRP or GSE accession 240 | batch_size: Batch size for fetching 241 | Returns: 242 | List of dataframes with SRR accession info 243 | """ 244 | print(f"#-- Fetching SRR accessions for: {accession} --#", file=sys.stderr) 245 | if accession.startswith('GSE') or accession.startswith('GSM'): 246 | # convert GSE to SRP 247 | srp_accession = convert_to_srp(accession) 248 | df = fetch_srr_from_srp(srp_accession) 249 | elif accession.startswith('SRP'): 250 | # fetch SRR from SRP 251 | df = fetch_srr_from_srp(accession) 252 | else: 253 | print(f"Accession type not recognized: {accession}", file=sys.stderr) 254 | return None 255 | # add query accession 256 | df["query_accession"] = accession 257 | # move query accession to first column 258 | cols = df.columns.tolist() 259 | cols = cols[-1:] + cols[:-1] 260 | return df[cols] 261 | 262 | def main(args): 263 | # load accessions 264 | accessions = load_accessions(args.accession_file) 265 | 266 | # set email 267 | if args.email: 268 | Entrez.email = args.email 269 | # set API key 270 | if 'NCBI_API_KEY' in os.environ: 271 | Entrez.api_key = os.environ['NCBI_API_KEY'] 272 | 273 | # get SRR accessions 274 | srr_accessions = [] 275 | for accession in accessions: 276 | srr_accessions.append( 277 | fetch_srr_from_accession(accession, batch_size=args.batch_size) 278 | ) 279 | 280 | # concat list of dataframes 281 | srr_accessions = pd.concat(srr_accessions) 282 | 283 | # write table 284 | srr_accessions.to_csv(args.outfile, sep=',', index=False) 285 | print(f"Saved SRR accessions to: {args.outfile}", file=sys.stderr) 286 | 287 | 288 | ## script main 289 | if __name__ == '__main__': 290 | args = parser.parse_args() 291 | load_dotenv() 292 | main(args) --------------------------------------------------------------------------------