├── data ├── accessions_no_params.csv ├── accessions_large_n2.csv ├── gcp │ ├── star_indices_hg.csv │ ├── barcodes_n2.csv │ ├── barcodes.csv │ └── star_indices.csv ├── accessions_no-read2.csv ├── accessions_small_n2.csv ├── star_indices_hg.csv ├── accessions_small_n3.csv ├── accessions_org_n2.csv ├── barcodes_n2.csv ├── accessions_problems.csv ├── accessions_small_n10.csv ├── barcodes.csv ├── accessions_small_n2_params.csv ├── accessions_var.csv ├── star_indices.csv └── accessions_all-org.csv ├── envs ├── read_qc.yml ├── star.yml ├── download.yml └── tiledb.yml ├── .gitignore ├── scripts ├── gcp-upload │ ├── conda-env.yml │ ├── config │ │ ├── utils.config │ │ ├── process.config │ │ └── profiles.config │ ├── nextflow.config │ ├── bin │ │ ├── agg-obs-metadata.py │ │ └── db-to-parquet.py │ ├── README.md │ └── main.nf ├── tiledb-loader │ ├── conda-env.yml │ ├── config │ │ ├── utils.config │ │ ├── process.config │ │ └── profiles.config │ ├── nextflow.config │ ├── main.nf │ ├── README.md │ └── bin │ │ ├── mtx-to-h5ad.py │ │ ├── h5ad-to-db.py │ │ └── find-mtx.py ├── gcp-loader-tahoe100.py ├── gcp-find-soft-delete.py ├── tiledb-loader-tahoe100.py ├── gcp2chimera.py ├── search-cloud-run-job-logs.py ├── purge-srx.py ├── extract-from-result-files.py └── acc2srr.py ├── docker ├── sc-recounter-run │ ├── environment.yml │ ├── entrypoint.sh │ ├── Dockerfile │ ├── README.md │ └── cleanup.py ├── README.md ├── sc-recounter-star │ ├── README.md │ └── Dockerfile └── sc-recounter-download │ ├── README.md │ └── Dockerfile ├── config ├── utils.config ├── process.config └── profiles.config ├── lib ├── utils.nf ├── download.groovy ├── star_params.groovy └── utils.groovy ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── LICENSE ├── workflows ├── read_qc.nf ├── db_acc.nf ├── reads.nf └── download.nf ├── bin ├── csv-merge.py ├── subsample.py ├── upload-final-star-params.py ├── format-star-params.py ├── star-summary.py ├── sra-stat.py ├── parallel-fastq-dump.py ├── get-db-accessions.py └── prefetch.py ├── main.nf ├── nextflow.config └── README.md /data/accessions_no_params.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX19162973,SRR23215162,human -------------------------------------------------------------------------------- /data/accessions_large_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | sample1,SRR13711613,human 3 | sample2,SRR13960234,human -------------------------------------------------------------------------------- /data/gcp/star_indices_hg.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,gs://arc-ctc-references/STAR/star_refData_2020_hg38/ -------------------------------------------------------------------------------- /data/accessions_no-read2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | no_read2_s1,SRR25778815,human 3 | no_read2_s2,ERR11746860,human -------------------------------------------------------------------------------- /data/accessions_small_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX24914804,SRR13112659,human 3 | SRX24914805,SRR13112660,human -------------------------------------------------------------------------------- /data/star_indices_hg.csv: 
-------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 3 | -------------------------------------------------------------------------------- /data/accessions_small_n3.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX24914804,SRR13112659,human 3 | SRX24914805,SRR13112660,human 4 | SRX20274301,SRR24488917,mouse -------------------------------------------------------------------------------- /envs/read_qc.yml: -------------------------------------------------------------------------------- 1 | name: read_qc 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - seqkit=2.8 -------------------------------------------------------------------------------- /data/accessions_org_n2.csv: -------------------------------------------------------------------------------- 1 | sample,accession,entrez_id,tech_10x,organism 2 | ERX10987225,ERR11583756,34046074,3_prime_gex,human 3 | ERX10987225,ERR11583807,34046074,3_prime_gex,human 4 | -------------------------------------------------------------------------------- /data/gcp/barcodes_n2.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,gs://arc-ctc-references/cellranger/barcodes/737K-august-2016.txt 3 | 737K-arc-v1,16,12,gs://arc-ctc-references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /data/barcodes_n2.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,/common_datasets/external/references/cellranger/barcodes/3M-february-2018.txt -------------------------------------------------------------------------------- /envs/star.yml: -------------------------------------------------------------------------------- 1 | name: star 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - star=2.7 9 | - psycopg2-binary=2.9 10 | - pypika=0.48 11 | - python-dotenv=1.0 12 | - google-cloud-secret-manager=2.22 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ._* 2 | *~ 3 | *.log 4 | .env 5 | .gcp/ 6 | tmp/ 7 | TMP/ 8 | logs/ 9 | work/ 10 | results/ 11 | archive/ 12 | star_ref/ 13 | ignore/ 14 | .nextflow/ 15 | .nextflow* 16 | screenlog.* 17 | *.pyc 18 | SRAgent/ 19 | scripts/db_utils.py 20 | notebooks/tiledb/ 21 | docker/sc-recounter-run/db_utils.py -------------------------------------------------------------------------------- /envs/download.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.11 7 | - pandas=2.2 8 | - seqkit=2.8 9 | - sra-tools=3.1 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /envs/tiledb.yml: 
-------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - libtiledbsoma=1.15 9 | - scanpy=1.10 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /data/accessions_problems.csv: -------------------------------------------------------------------------------- 1 | sample,accession,organism 2 | SRX23875714,SRR28265595,human 3 | SRX23875714,SRR28265593,human 4 | SRX23875714,SRR28265594,human 5 | SRX24608680,SRR29084389,mouse 6 | SRX23103788,SRR27431357,human 7 | SRX21883929,SRR26171850,mouse 8 | ERX11662357,ERR12251650,human 9 | ERX11662357,ERR12252000,human -------------------------------------------------------------------------------- /data/accessions_small_n10.csv: -------------------------------------------------------------------------------- 1 | sample,accession 2 | SRX9556570,SRR13112659 3 | SRX9556570,SRR13112660 4 | SRX9556569,SRR13112650 5 | SRX9556569,SRR13112649 6 | SRX9556572,SRR13112685 7 | SRX9556572,SRR13112687 8 | SRX9556544,SRR13112280 9 | SRX9556544,SRR13112285 10 | SRX9556547,SRR13112329 11 | SRX9556558,SRR13112490 -------------------------------------------------------------------------------- /scripts/gcp-upload/conda-env.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - pyarrow=19.0.1 9 | - scanpy=1.10 10 | - psycopg2-binary=2.9 11 | - pypika=0.48 12 | - python-dotenv=1.0 13 | - google-cloud-secret-manager=2.22 14 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - python=3.11 6 | - pip 7 | - pandas=2.2 8 | - psycopg2-binary=2.9 9 | - pypika=0.48 10 | - python-dotenv=1.0 11 | - google-cloud-secret-manager=2.22 12 | - google-cloud-storage=2.19.0 13 | - nextflow=24.10 14 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/conda-env.yml: -------------------------------------------------------------------------------- 1 | name: download 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=3.12 7 | - pandas=2.2 8 | - scanpy=1.10 9 | - psycopg2-binary=2.9 10 | - pypika=0.48 11 | - python-dotenv=1.0 12 | - google-cloud-secret-manager=2.22 13 | - tiledb==2.27.0 14 | - tiledbsoma-py==1.15.4 15 | -------------------------------------------------------------------------------- /config/utils.config: -------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "scRecounter" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Nextflow pipeline for re-processing public single-cell data" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | -------------------------------------------------------------------------------- /scripts/gcp-upload/config/utils.config: 
-------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "gcp-loader" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Load data onto GCP" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | mail { 17 | smtp.host = "chimera-admin" 18 | smtp.port = 25 19 | } 20 | -------------------------------------------------------------------------------- /lib/utils.nf: -------------------------------------------------------------------------------- 1 | process SRA_STAT { 2 | label "download_env" 3 | errorStrategy { task.attempt <= maxRetries ? 'retry' : 'ignore' } 4 | disk 10.GB 5 | 6 | input: 7 | tuple val(sample), val(accession), val(metadata) 8 | 9 | output: 10 | tuple val(sample), val(accession), path("sra-stat.csv") 11 | 12 | script: 13 | """ 14 | sra-stat.py ${accession} 15 | """ 16 | 17 | stub: 18 | """ 19 | touch sra-stat.csv 20 | """ 21 | } -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/utils.config: -------------------------------------------------------------------------------- 1 | import java.time.* 2 | Date now = new Date() 3 | 4 | manifest { 5 | name = "tiledb-loader" 6 | author = "Nick Youngblut" 7 | homePage = "https://github.com/arcinstitute/scRecounter" 8 | description = "Load data into tiledb-soma database" 9 | version = "0.1.0" 10 | } 11 | 12 | params { 13 | timestamp = now.format("yyyy-MM-dd_HH-mm-ss") 14 | } 15 | 16 | mail { 17 | smtp.host = "chimera-admin" 18 | smtp.port = 25 19 | } 20 | -------------------------------------------------------------------------------- /data/gcp/barcodes.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,gs://arc-ctc-references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-february-2018.txt 4 | 3M-5pgex-jan-2023,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-5pgex-jan-2023.txt 5 | 3M-3pgex-may-2023,16,12,gs://arc-ctc-references/cellranger/barcodes/3M-3pgex-may-2023.txt 6 | 737K-arc-v1,16,12,gs://arc-ctc-references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Custom docker containers for the scRecounter pipeline 2 | ===================================================== 3 | 4 | This directory contains the Dockerfiles for the custom docker containers used in the scRecounter pipeline. 5 | The containers are built on top of the official [micromamba docker images](https://hub.docker.com/r/mambaorg/micromamba) 6 | and contain the necessary dependencies for the pipeline to run. 7 | 8 | See the README files in each subdirectory for more information on the individual containers.
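For reference, the main pipeline selects these images per process via labels; the snippet below is an abridged excerpt of `config/process.config` from this repository, showing how each label maps to its container image and conda env file:

```groovy
process {
    // processes labeled "download_env" run in the sc-recounter-download image
    withLabel:download_env {
        container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0"
        conda     = "envs/download.yml"
    }
    // processes labeled "star_env" run in the sc-recounter-star image
    withLabel:star_env {
        container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-star/sc-recounter-star:0.1.0"
        conda     = "envs/star.yml"
    }
}
```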
-------------------------------------------------------------------------------- /data/barcodes.csv: -------------------------------------------------------------------------------- 1 | name,cell_barcode_length,umi_length,file_path 2 | 737K-august-2016,16,10,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt 3 | 3M-february-2018,16,12,/common_datasets/external/references/cellranger/barcodes/3M-february-2018.txt 4 | 3M-5pgex-jan-2023,16,12,/common_datasets/external/references/cellranger/barcodes/3M-5pgex-jan-2023.txt 5 | 3M-3pgex-may-2023,16,12,/common_datasets/external/references/cellranger/barcodes/3M-3pgex-may-2023.txt 6 | 737K-arc-v1,16,12,/common_datasets/external/references/cellranger/barcodes/737K-arc-v1.txt -------------------------------------------------------------------------------- /scripts/gcp-upload/config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | conda = "conda-env.yml" 7 | 8 | cpus = 1 9 | memory = 2.GB 10 | time = 1.h 11 | 12 | withLabel:process_low { 13 | cpus = 2 14 | memory = { 8.GB * task.attempt } 15 | time = { 2.h * task.attempt } 16 | } 17 | withLabel:process_high { 18 | cpus = { 4 * task.attempt } 19 | memory = { 16.GB * task.attempt } 20 | time = { 2.h * task.attempt } 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature request]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[Bug]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Provide a code example and any sample input data (e.g. an H5AD) as an attachment to reproduce this behavior. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment** 23 | Paste in the output of `pip list` or `conda list` 24 | 25 | **Additional context** 26 | Add any other context about the problem here. 
27 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | conda = "conda-env.yml" 7 | 8 | cpus = 1 9 | memory = 2.GB 10 | time = 1.h 11 | 12 | withLabel:process_low { 13 | cpus = 2 14 | memory = { 8.GB * task.attempt } 15 | time = { 2.h * task.attempt } 16 | } 17 | withLabel:process_medium { 18 | cpus = { 8 * task.attempt } 19 | memory = { 160.GB + 96.GB * task.attempt } 20 | } 21 | withLabel:process_high { 22 | cpus = { 8 * task.attempt } 23 | memory = { 256.GB + 96.GB * task.attempt } 24 | } 25 | } 26 | 27 | -------------------------------------------------------------------------------- /docker/sc-recounter-star/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-star container 2 | =========================== 3 | 4 | # Build and push to GCP Container Registry 5 | 6 | Env vars 7 | 8 | ```bash 9 | IMG_NAME=sc-recounter-star 10 | IMG_VERSION=0.1.0 11 | REGION="us-east1" 12 | PROJECT="c-tc-429521" 13 | ``` 14 | 15 | Build 16 | 17 | > from the base directory of the repository 18 | 19 | ```bash 20 | docker build \ 21 | --file docker/${IMG_NAME}/Dockerfile \ 22 | --build-arg CONDA_ENV_YAML=envs/star.yml \ 23 | --platform linux/amd64 \ 24 | --tag ${IMG_NAME}:${IMG_VERSION} \ 25 | . 26 | ``` 27 | 28 | Push 29 | 30 | ```bash 31 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 32 | ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 33 | && docker push ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/sc-recounter-download/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-download container 2 | =============================== 3 | 4 | # Build and push to GCP Container Registry 5 | 6 | Env vars 7 | 8 | ```bash 9 | IMG_NAME=sc-recounter-download 10 | IMG_VERSION=0.1.0 11 | REGION="us-east1" 12 | PROJECT="c-tc-429521" 13 | ``` 14 | 15 | Build 16 | 17 | > from the base directory of the repository 18 | 19 | ```bash 20 | docker build \ 21 | --file docker/${IMG_NAME}/Dockerfile \ 22 | --build-arg CONDA_ENV_YAML=envs/download.yml \ 23 | --platform linux/amd64 \ 24 | --tag ${IMG_NAME}:${IMG_VERSION} \ 25 | . 
26 | ``` 27 | 28 | Push 29 | 30 | ```bash 31 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 32 | ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 33 | && docker push ${REGION}-docker.pkg.dev/${PROJECT}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 34 | ``` 35 | -------------------------------------------------------------------------------- /docker/sc-recounter-star/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set user to root for installation (already root by default, but kept for clarity) 8 | USER root 9 | 10 | # Install OS-level packages 11 | RUN apt-get update -y \ 12 | && apt-get install -y build-essential procps curl \ 13 | && apt-get clean \ 14 | && apt-get purge \ 15 | && rm -rf /var/lib/apt/lists/* /tmp/* 16 | 17 | # Copy environment file 18 | ARG CONDA_ENV_YAML 19 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${CONDA_ENV_YAML} /tmp/environment.yml 20 | 21 | # Install the environment using micromamba 22 | RUN micromamba create -f /tmp/environment.yml \ 23 | && micromamba clean --all --yes \ 24 | && rm -rf /opt/conda/pkgs/* 25 | 26 | # Activate the environment by default 27 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 28 | ENV ENV_NAME=star -------------------------------------------------------------------------------- /docker/sc-recounter-download/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set user to root for installation (already root by default, but kept for clarity) 8 | USER root 9 | 10 | # Install OS-level packages 11 | RUN apt-get update -y \ 12 | && apt-get install -y build-essential procps curl \ 13 | && apt-get clean \ 14 | && apt-get purge \ 15 | && rm -rf /var/lib/apt/lists/* /tmp/* 16 | 17 | # Copy environment file 18 | ARG CONDA_ENV_YAML 19 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${CONDA_ENV_YAML} /tmp/environment.yml 20 | 21 | # Install the environment using micromamba 22 | RUN micromamba create -f /tmp/environment.yml \ 23 | && micromamba clean --all --yes \ 24 | && rm -rf /opt/conda/pkgs/* 25 | 26 | # Activate the environment by default 27 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 28 | ENV ENV_NAME=download -------------------------------------------------------------------------------- /data/accessions_small_n2_params.csv: -------------------------------------------------------------------------------- 1 | sample,fastq_1,fastq_2,barcodes_file,star_index,cell_barcode_length,umi_length,strand 2 | sample2,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/3f/fb5cc4ef344c6f077a941d7712250a/reads/SRR13112660_1.fastq,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/3f/fb5cc4ef344c6f077a941d7712250a/reads/SRR13112660_2.fastq,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38,16,10,Forward 3 | sample1,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/c9/c22d90b85c440f595fec26fa52ac75/reads/SRR13112659_1.fastq,/scratch/multiomics/nickyoungblut/nextflow-work/scRecounter/c9/c22d90b85c440f595fec26fa52ac75/reads/SRR13112659_2.fastq,/common_datasets/external/references/cellranger/barcodes/737K-august-2016.txt,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38,16,10,Forward 
-------------------------------------------------------------------------------- /docker/sc-recounter-run/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create run name 4 | RUN_NAME="SCRECOUNTER_$(date +"%Y-%m-%d_%H-%M-%S")" 5 | 6 | # Set the profile list from the command line arguments 7 | PROFILE_LIST=$(IFS=,; echo "$*") 8 | 9 | # Activate the micromamba environment and run the pipeline 10 | micromamba run -n sc-recounter-run \ 11 | nextflow run main.nf \ 12 | -profile $PROFILE_LIST \ 13 | -name $RUN_NAME \ 14 | -work-dir "gs://arc-ctc-nextflow/scRecounter/prod/work/${RUN_NAME}" \ 15 | --output_dir "gs://arc-ctc-screcounter/prod3/${RUN_NAME}" \ 16 | -ansi-log false "$@" 17 | 18 | # Delete output directory if only nf-report and nf-trace 19 | export GCP_SQL_DB_HOST="35.243.133.29" 20 | export GCP_SQL_DB_NAME="sragent-prod" 21 | export GCP_SQL_DB_USERNAME="postgres" 22 | micromamba run -n sc-recounter-run \ 23 | python cleanup.py \ 24 | "gs://arc-ctc-nextflow/scRecounter/prod/work/${RUN_NAME}" \ 25 | "gs://arc-ctc-screcounter/prod3/${RUN_NAME}" 26 | -------------------------------------------------------------------------------- /data/accessions_var.csv: -------------------------------------------------------------------------------- 1 | query_accession,sample,accession,experiment,sra_study,bioproject,spots,spots_with_mates,avgLength,size_MB 2 | SRP256479,SRS6484446,SRR11549939,SRX8119841,SRP256479,PRJNA625518,17221838,0,76,445 3 | SRP256479,SRS6484447,SRR11549940,SRX8119842,SRP256479,PRJNA625518,18368896,0,76,475 4 | SRP256479,SRS6484396,SRR11550035,SRX8119791,SRP256479,PRJNA625518,13056540,13056540,152,681 5 | SRP256479,SRS6484399,SRR11550038,SRX8119794,SRP256479,PRJNA625518,15239694,15239694,152,910 6 | SRP256479,SRS6484441,SRR11550087,SRX8119836,SRP256479,PRJNA625518,101386661,101386661,92,2891 7 | SRP256479,SRS6484442,SRR11550088,SRX8119837,SRP256479,PRJNA625518,108258059,108258059,92,3068 8 | SRP256479,SRS7053374,SRR12280849,SRX8784719,SRP256479,PRJNA625518,15053071,0,76,418 9 | SRP256479,SRS7053381,SRR12280856,SRX8784726,SRP256479,PRJNA625518,20917672,0,76,578 10 | SRP256479,SRS6484394,SRR11550033,SRX8119789,SRP256479,PRJNA625518,15656259,15656259,152,821 11 | SRP256479,SRS6484397,SRR11550036,SRX8119792,SRP256479,PRJNA625518,15255046,15255046,152,786 -------------------------------------------------------------------------------- /scripts/tiledb-loader/nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | input_dir = "" 5 | db_uri = "" 6 | log_dir = "logs" 7 | feature_type = "GeneFull_Ex50pAS" 8 | mtx_batch_size = 200 9 | h5ad_batch_size = 2 10 | missing_metadata = "skip" 11 | max_datasets = 10000 12 | } 13 | 14 | 15 | //-- Extra configs --// 16 | includeConfig "config/process.config" 17 | includeConfig "config/profiles.config" 18 | 19 | 20 | //-- Functions --// 21 | // Remove trailing forward slashes in a string 22 | def fmtPath(path_str) { 23 | return path_str.replaceAll(/\/+$/, '') 24 | } 25 | 26 | // Create the work directory 27 | def getWorkDir() { 28 | def userGroup = "id -gn".execute().text.trim() 29 | def userName = "whoami".execute().text.trim() 30 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/tiledb-loader" 31 | return workDir 32 | } 33 | 34 | def getCondaCacheDir() { 35 | def userName = "whoami".execute().text.trim() 36 | cacheDir = 
"/home/$userName/nextflow/conda-cache/tiledb-loader" 37 | return cacheDir 38 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Arc Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /workflows/read_qc.nf: -------------------------------------------------------------------------------- 1 | // Read QC workflow for fastq files 2 | workflow READ_QC_WF{ 3 | take: 4 | ch_fastq 5 | 6 | main: 7 | // Flatten the channel to process each read file separately 8 | ch_fastq_flat = ch_fastq.flatMap { sample, fastq_1, fastq_2 -> 9 | [ [sample, "R1", fastq_1], [sample, "R2", fastq_2] ] 10 | } 11 | 12 | // Run seqkit stats 13 | SEQKIT_STATS(ch_fastq_flat) 14 | .collectFile( 15 | name: "seqkit-stats.tsv", 16 | storeDir: file(params.output_dir) / "read_qc", 17 | newLine: false, keepHeader: true 18 | ) 19 | } 20 | 21 | // Run `seqkit stats` on fastq files 22 | process SEQKIT_STATS { 23 | container "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 24 | conda "envs/read_qc.yml" 25 | label "process_low" 26 | 27 | input: 28 | tuple val(sample), val(read), path("${sample}_${read}.fastq") 29 | 30 | output: 31 | path "${sample}_${read}.tsv" 32 | 33 | script: 34 | """ 35 | seqkit -j $task.cpus stats -a -T ${sample}_${read}.fastq > ${sample}_${read}.tsv 36 | """ 37 | 38 | stub: 39 | """ 40 | touch ${sample}_${read}.tsv 41 | """ 42 | } 43 | -------------------------------------------------------------------------------- /lib/download.groovy: -------------------------------------------------------------------------------- 1 | def readAccessions(accessions_input){ 2 | // Read the input CSV file with the sample names and SRA accessions 3 | ch_acc = accessions_input 4 | .splitCsv(header: true, sep: ",") 5 | .map { row -> 6 | def req_columns = ["sample", "accession"] 7 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 8 | if (miss_columns) { 9 | error "Missing columns in the input CSV file: ${miss_columns}" 10 | } 11 | // remove special characters from the sample name 12 | row.sample = row.sample.replaceAll("\\s", "_") 13 | def result = [row.sample, row.accession] 14 | // add optional, metadata columns 15 | def metadata = [:] 16 | ["organism", "tech_10x"].each { col -> 17 | 
metadata[col] = row.containsKey(col) ? row[col] : "" 18 | } 19 | result << metadata 20 | return result 21 | } 22 | 23 | // print srx values 24 | ch_acc 25 | .map{ sample, accession, metadata -> sample } 26 | .distinct() 27 | .collect() 28 | .map{ it.join(',') } 29 | .view{ "SRX accessions: ${it}" } 30 | 31 | return ch_acc 32 | } 33 | 34 | -------------------------------------------------------------------------------- /config/process.config: -------------------------------------------------------------------------------- 1 | process { 2 | errorStrategy = { task.exitStatus in ((130..145) + 104 + 125) ? "retry" : "finish" } 3 | maxRetries = 0 4 | maxErrors = "-1" 5 | 6 | cpus = 1 7 | memory = 2.GB 8 | time = 1.h 9 | 10 | withLabel:download_env { 11 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 12 | conda = "envs/download.yml" 13 | } 14 | withLabel:read_env { 15 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-download/sc-recounter-download:0.1.0" 16 | conda = "envs/read_qc.yml" 17 | } 18 | withLabel:star_env { 19 | container = "us-east1-docker.pkg.dev/c-tc-429521/sc-recounter-star/sc-recounter-star:0.1.0" 20 | conda = "envs/star.yml" 21 | } 22 | 23 | withLabel:process_low { 24 | cpus = 4 25 | memory = { 4.GB * task.attempt } 26 | time = { 4.h * task.attempt } 27 | } 28 | withLabel:process_medium { 29 | cpus = 8 30 | memory = { 36.GB * task.attempt } 31 | time = { 6.h * task.attempt } 32 | } 33 | withLabel:process_high { 34 | cpus = 8 35 | memory = { 72.GB * task.attempt } 36 | time = { 10.h * task.attempt } 37 | maxRetries = 3 38 | } 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scripts/gcp-upload/nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | input_dir = "/processed_datasets/scRecount/scRecounter/prod3" 5 | output_dir = "gs://arc-ctc-scbasecamp/2025-02-25/" 6 | log_dir = "tmp/logs" 7 | feature_type = "GeneFull_Ex50pAS" 8 | missing_metadata = "error" 9 | tissue_categories = "data/2025-02-20_tissue_categories.csv" 10 | max_datasets = 0 11 | organisms = "" 12 | redo_processed = false 13 | update_db = true 14 | db_host = "35.243.133.29" 15 | db_name = "sragent-prod" 16 | db_username = "postgres" 17 | } 18 | 19 | 20 | //-- Extra configs --// 21 | includeConfig "config/process.config" 22 | includeConfig "config/profiles.config" 23 | 24 | //-- Functions --// 25 | // Remove trailing forward slashes in a string 26 | def fmtPath(path_str) { 27 | return path_str.replaceAll(/\/+$/, '') 28 | } 29 | 30 | // Create the work directory 31 | def getWorkDir() { 32 | def userGroup = "id -gn".execute().text.trim() 33 | def userName = "whoami".execute().text.trim() 34 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/gcp-loader" 35 | return workDir 36 | } 37 | 38 | def getCondaCacheDir() { 39 | def userName = "whoami".execute().text.trim() 40 | cacheDir = "/home/$userName/nextflow/conda-cache/gcp-loader" 41 | return cacheDir 42 | } -------------------------------------------------------------------------------- /workflows/db_acc.nf: -------------------------------------------------------------------------------- 1 | include { saveAsLog } from '../lib/utils.groovy' 2 | include { readAccessions } from '../lib/download.groovy' 3 | 4 | workflow DB_ACC_WF { 5 | main: 6 | // obtain accessions from the database 7 | ch_accessions = GET_DB_ACCESSIONS() 8 | 
ch_accessions.csv.ifEmpty { println 'No accessions found in the scRecounter database' } 9 | 10 | emit: 11 | ch_accessions.csv 12 | } 13 | 14 | // Save accessions csv 15 | def saveAsFinalAcc(filename) { 16 | if (filename.endsWith(".csv")){ 17 | filename = filename.tokenize("/").last() 18 | return "${filename}" 19 | } 20 | return null 21 | } 22 | 23 | process GET_DB_ACCESSIONS { 24 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "*.csv" 25 | publishDir file(params.output_dir), mode: "copy", overwrite: true, saveAs: { filename -> saveAsLog(filename) } 26 | label "download_env" 27 | disk 10.GB 28 | 29 | output: 30 | path "accessions.csv", emit: "csv" 31 | path "${task.process}.log", emit: "log" 32 | 33 | script: 34 | """ 35 | export GCP_SQL_DB_HOST="${params.db_host}" 36 | export GCP_SQL_DB_NAME="${params.db_name}" 37 | export GCP_SQL_DB_USERNAME="${params.db_username}" 38 | 39 | get-db-accessions.py \\ 40 | --organisms "${params.organisms}" \\ 41 | --max-srx ${params.max_samples} \\ 42 | 2>&1 | tee ${task.process}.log 43 | """ 44 | } 45 | 46 | -------------------------------------------------------------------------------- /workflows/reads.nf: -------------------------------------------------------------------------------- 1 | // Input workflow for processing paired-end reads 2 | workflow READS_WF{ 3 | main: 4 | // load csv and extract accessions 5 | ch_reads = Channel 6 | .fromPath(params.reads, checkIfExists: true) 7 | .splitCsv(header: true, sep: ",") 8 | .map { row -> 9 | def req_columns = ["sample", "fastq_1", "fastq_2"] 10 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 11 | if (miss_columns) { 12 | error "Missing columns in the input CSV file: ${miss_columns}" 13 | } 14 | return [row.sample, file(row.fastq_1), file(row.fastq_2)] 15 | }.groupTuple() 16 | .map { sample, fastq_1, fastq_2 -> 17 | return [sample, fastq_1.flatten(), fastq_2.flatten()] 18 | } 19 | 20 | // merge reads by sample 21 | MERGE_READS(ch_reads) 22 | 23 | emit: 24 | fastq = MERGE_READS.out 25 | } 26 | 27 | // Merge reads by sample; account for any differences in compression; check sequence formatting 28 | process MERGE_READS { 29 | conda "envs/read_qc.yml" 30 | 31 | input: 32 | tuple val(sample), path("*_read1.fq.gz"), path("*_read2.fq.gz") 33 | 34 | output: 35 | tuple val(sample), path("${sample}_R1.fq"), path("${sample}_R2.fq") 36 | 37 | script: 38 | """ 39 | seqkit seq *_read1.fq.gz > ${sample}_R1.fq 40 | seqkit seq *_read2.fq.gz > ${sample}_R2.fq 41 | """ 42 | 43 | stub: 44 | """ 45 | touch ${sample}_R1.fq ${sample}_R2.fq 46 | """ 47 | } 48 | -------------------------------------------------------------------------------- /bin/csv-merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import argparse 7 | import logging 8 | import pandas as pd 9 | 10 | # logging 11 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 12 | 13 | # argparse 14 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 15 | argparse.RawDescriptionHelpFormatter): 16 | pass 17 | 18 | desc = 'Merge csv files' 19 | epi = """DESCRIPTION: 20 | Merge multiple csv files into a single table. 
21 | """ 22 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 23 | formatter_class=CustomFormatter) 24 | parser.add_argument('csv_files', type=str, nargs='+', 25 | help='CSV files') 26 | parser.add_argument('--sample', type=str, default=None, 27 | help='Sample name') 28 | parser.add_argument('--outfile', type=str, default='merged.csv', 29 | help='Output file') 30 | 31 | # functions 32 | def main(args): 33 | # read in files 34 | tables = [pd.read_csv(f) for f in args.csv_files] 35 | # merge 36 | df = pd.concat(tables, ignore_index=True) 37 | # add sample name, if provided 38 | if args.sample: 39 | # add sample name 40 | df['sample'] = args.sample 41 | # reorder columns 42 | cols = ['sample'] + [c for c in df.columns if c != 'sample'] 43 | df = df[cols] 44 | # write 45 | df.to_csv(args.outfile, index=False) 46 | logging.info(f'Output written to: {args.outfile}') 47 | 48 | ## script main 49 | if __name__ == '__main__': 50 | args = parser.parse_args() 51 | main(args) -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | // Subworkflows 2 | include { DB_ACC_WF } from './workflows/db_acc.nf' 3 | include { STAR_PARAMS_WF } from './workflows/star_params.nf' 4 | include { STAR_FULL_WF } from './workflows/star_full.nf' 5 | include { SRA_STAT } from './lib/utils.nf' 6 | // util functions 7 | include { readAccessions; addStats; } from './lib/utils.groovy' 8 | 9 | // Main workflow 10 | workflow { 11 | if (params.accessions == "" || params.accessions == true) { 12 | // Obtain accessions from SRA 13 | println "No accessions provided. Accessions will be obtained from SRA." 14 | ch_accessions = DB_ACC_WF() 15 | } else { 16 | // Use the provided accessions 17 | println "Using provided accessions." 18 | ch_accessions = Channel.fromPath(params.accessions, checkIfExists: true) 19 | } 20 | 21 | // read accessions file 22 | ch_accessions = readAccessions(ch_accessions) 23 | 24 | // run sra-stat on accessions 25 | ch_sra_stat = SRA_STAT(ch_accessions) 26 | ch_accessions = addStats(ch_accessions, ch_sra_stat) 27 | 28 | // filter out any accessions with max SRA file size greater than the user-specified size 29 | ch_accessions = ch_accessions.filter { it[3] <= params.max_sra_size } 30 | 31 | // determine best STAR parameters on a subset of reads 32 | ch_star_params = STAR_PARAMS_WF(ch_accessions, ch_sra_stat) 33 | 34 | // run STAR on all reads with selected parameters 35 | if (! params.define){ 36 | STAR_FULL_WF(ch_accessions, ch_star_params) 37 | } 38 | } 39 | 40 | // On complete 41 | workflow.onComplete { 42 | println "Pipeline completed at: $workflow.complete" 43 | println "Execution status: ${ workflow.success ? 
'OK' : 'failed' }" 44 | } 45 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use micromamba base image 2 | FROM mambaorg/micromamba:1.5.7 3 | 4 | # Use bash shell 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | # Set working directory 8 | WORKDIR /app 9 | 10 | # Set user to root for installation 11 | USER root 12 | 13 | # Install OS-level packages (if needed) 14 | RUN apt-get update -y \ 15 | && apt-get install -y build-essential procps curl \ 16 | && apt-get clean \ 17 | && apt-get purge \ 18 | && rm -rf /var/lib/apt/lists/* /tmp/* 19 | 20 | # variables 21 | ARG BASE_DIR="docker/sc-recounter-run" 22 | 23 | # Copy environment file into container 24 | COPY --chown=$MAMBA_USER:$MAMBA_USER ${BASE_DIR}/environment.yml /tmp/environment.yml 25 | 26 | # Create conda environment with micromamba 27 | RUN micromamba create -n sc-recounter-run -f /tmp/environment.yml --quiet \ 28 | && micromamba clean --all --yes \ 29 | && rm -rf /opt/conda/pkgs/* 30 | 31 | # Activate environment by default 32 | ARG MAMBA_DOCKERFILE_ACTIVATE=1 33 | ENV ENV_NAME=sc-recounter-run 34 | 35 | # Copy Nextflow pipeline and the Python runner script 36 | COPY --chown=$MAMBA_USER:$MAMBA_USER main.nf nextflow.config . 37 | COPY --chown=$MAMBA_USER:$MAMBA_USER bin/ ./bin/ 38 | COPY --chown=$MAMBA_USER:$MAMBA_USER config/ ./config/ 39 | COPY --chown=$MAMBA_USER:$MAMBA_USER data/ ./data/ 40 | COPY --chown=$MAMBA_USER:$MAMBA_USER lib/ ./lib/ 41 | COPY --chown=$MAMBA_USER:$MAMBA_USER workflows/ ./workflows/ 42 | 43 | # Copy runner scripts 44 | COPY bin/db_utils.py ${BASE_DIR}/entrypoint.sh ${BASE_DIR}/cleanup.py ./ 45 | 46 | # Create a directory for the mamba cache 47 | RUN mkdir -p /.cache/mamba/ /app/.nextflow/ /scratch/ \ 48 | && chmod -R ugo+rwx /.cache/mamba/ /app/.nextflow/ /scratch/ 49 | 50 | # Set the NXF_HOME environment variable 51 | ENV NXF_HOME=/app/.nextflow 52 | 53 | # Set user to mamba 54 | ENTRYPOINT ["bash", "entrypoint.sh"] -------------------------------------------------------------------------------- /data/gcp/star_indices.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,gs://arc-ctc-references/STAR/star_refData_2020_hg38/ 3 | mouse,gs://arc-ctc-references/STAR/star_refData_2020_mm10/ 4 | Macaca_mulatta,gs://arc-ctc-references/STAR/star_refData_2019_mmul10/MMUL-10_scRecount/ 5 | Anopheles_gambiae,gs://arc-ctc-references/STAR/Anopheles_gambiae/AgamP4/ 6 | Arabidopsis_thaliana,gs://arc-ctc-references/STAR/Arabidopsis_thaliana/TAIR10/ 7 | Bos_taurus,gs://arc-ctc-references/STAR/Bos_taurus/ARS-UCD1.3/ 8 | Caenorhabditis_elegans,gs://arc-ctc-references/STAR/Caenorhabditis_elegans/WBcel235/ 9 | Callithrix_jacchus,gs://arc-ctc-references/STAR/Callithrix_jacchus/mCalJac1.pat.X/ 10 | Canis_lupus_familiaris,gs://arc-ctc-references/STAR/Canis_lupus_familiaris/ROS_Cfam_1.0/ 11 | Danio_rerio,gs://arc-ctc-references/STAR/Danio_rerio/GRCz11/ 12 | Drosophila_melanogaster,gs://arc-ctc-references/STAR/Drosophila_melanogaster/BDGP6.46/ 13 | Equus_caballus,gs://arc-ctc-references/STAR/Equus_caballus/EquCab3.0/ 14 | Gallus_gallus,gs://arc-ctc-references/STAR/Gallus_gallus/bGalGal1.mat.broiler.GRCg7b/ 15 | Gorilla_gorilla,gs://arc-ctc-references/STAR/Gorilla_gorilla/gorGor4/ 16 | Heterocephalus_glaber,gs://arc-ctc-references/STAR/Heterocephalus_glaber/Naked_mole-rat_maternal/ 17 | 
Oryctolagus_cuniculus,gs://arc-ctc-references/STAR/Oryctolagus_cuniculus/OryCun2.0/ 18 | Oryza_sativa,gs://arc-ctc-references/STAR/Oryza_sativa/IRGSP-1.0/ 19 | Ovis_aries,gs://arc-ctc-references/STAR/Ovis_aries/ARS-UI_Ramb_v2.0/ 20 | Pan_troglodytes,gs://arc-ctc-references/STAR/Pan_troglodytes/Pan_tro_3.0/ 21 | Rattus_norvegicus,gs://arc-ctc-references/STAR/Rattus_norvegicus/mRatBN7.2/ 22 | Saccharomyces_cerevisiae,gs://arc-ctc-references/STAR/Saccharomyces_cerevisiae/R64-1-1/ 23 | Schistosoma_mansoni,gs://arc-ctc-references/STAR/Schistosoma_mansoni/Smansoni_v7/ 24 | Solanum_lycopersicum,gs://arc-ctc-references/STAR/Solanum_lycopersicum/SL3.0/ 25 | Sus_scrofa,gs://arc-ctc-references/STAR/Sus_scrofa/Sscrofa11.1/ 26 | Xenopus_tropicalis,gs://arc-ctc-references/STAR/Xenopus_tropicalis/UCB_Xtro_10.0/ 27 | Zea_mays,gs://arc-ctc-references/STAR/Zea_mays/Zm-B73-REFERENCE-NAM-5.0/ 28 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/main.nf: -------------------------------------------------------------------------------- 1 | workflow { 2 | // find target MTX files to add to the database 3 | FIND_MTX() 4 | 5 | // list target MTX files 6 | mtx_files = FIND_MTX.out.csv 7 | .splitCsv( header: true ) 8 | .map { row -> 9 | tuple( row["batch"], row["srx"], row["matrix_path"] ) 10 | } 11 | .groupTuple() 12 | 13 | // aggregate mtx files as h5ad 14 | MTX_TO_H5AD( mtx_files ) 15 | 16 | // add the h5ad files to the database 17 | H5AD_TO_DB( MTX_TO_H5AD.out.h5ad.buffer( size: params.h5ad_batch_size, remainder: true ) ) 18 | } 19 | 20 | process H5AD_TO_DB { 21 | publishDir file(params.log_dir), mode: "copy", overwrite: true 22 | label "process_medium" 23 | maxForks 1 24 | 25 | input: 26 | path "?.h5ad" 27 | 28 | output: 29 | path "h5ad_to_db.log", emit: log 30 | 31 | script: 32 | """ 33 | h5ad-to-db.py \\ 34 | --threads ${task.cpus} \\ 35 | --db-uri ${params.db_uri} \\ 36 | *.h5ad 2>&1 | tee h5ad_to_db.log 37 | """ 38 | } 39 | 40 | process MTX_TO_H5AD { 41 | publishDir file(params.log_dir) , mode: "copy", overwrite: true, pattern: "*.log" 42 | label "process_high" 43 | maxForks 4 44 | 45 | input: 46 | tuple val(batch), val(srx), val(mtx_path) 47 | 48 | output: 49 | path "data.h5ad", emit: h5ad 50 | path "mtx_to_h5ad_batch-${batch}.log", emit: log 51 | 52 | script: 53 | """ 54 | mtx-to-h5ad.py \\ 55 | --threads ${task.cpus} \\ 56 | --missing-metadata "${params.missing_metadata}" \\ 57 | --srx "$srx" \\ 58 | --path "$mtx_path" \\ 59 | 2>&1 | tee mtx_to_h5ad_batch-${batch}.log 60 | """ 61 | } 62 | 63 | process FIND_MTX { 64 | publishDir file(params.log_dir), mode: "copy", overwrite: true, pattern: "*.log" 65 | label "process_low" 66 | 67 | output: 68 | path "mtx_files.csv", emit: csv 69 | path "find_mtx.log", emit: log 70 | 71 | script: 72 | """ 73 | find-mtx.py \\ 74 | --feature-type ${params.feature_type} \\ 75 | --max-datasets ${params.max_datasets} \\ 76 | --batch-size ${params.mtx_batch_size} \\ 77 | --db-uri ${params.db_uri} \\ 78 | ${params.input_dir} \\ 79 | 2>&1 | tee find_mtx.log 80 | """ 81 | } -------------------------------------------------------------------------------- /scripts/gcp-loader-tahoe100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os, gc, sys, argparse, tempfile 3 | from glob import glob 4 | import scanpy as sc 5 | import gcsfs 6 | 7 | def main(input_dir, output_dir, temp_dir): 8 | h5ad_files = sorted(glob(os.path.join(input_dir, '*.h5ad.gz'))) 9 | if not 
h5ad_files: 10 | print("No .h5ad.gz files found in the input directory.") 11 | sys.exit(1) 12 | else: 13 | print(f"Found {len(h5ad_files)} .h5ad.gz files.", file=sys.stderr) 14 | 15 | to_keep = [ 16 | "sample", "gene_count", "tscp_count", "mread_count", "drugname_drugconc", 17 | "drug", "cell_line", "sublibrary", "BARCODE", "pcnt_mito", "S_score", 18 | "G2M_score", "phase", "pass_filter", "cell_name" 19 | ] 20 | 21 | fs = gcsfs.GCSFileSystem() 22 | os.makedirs(temp_dir, exist_ok=True) 23 | 24 | for infile in h5ad_files: 25 | print(f"Reading {infile}...", file=sys.stderr) 26 | adata = sc.read_h5ad(infile) 27 | adata.obs = adata.obs[to_keep] 28 | adata.obs['plate'] = os.path.basename(infile).split('_')[0] 29 | 30 | print(f"Writing temporary file...", file=sys.stderr) 31 | tmp_name = os.path.join(temp_dir, os.path.basename(infile)) 32 | adata.write_h5ad(tmp_name, compression='gzip') 33 | 34 | out_path = os.path.join(output_dir, os.path.basename(infile)) 35 | print(f"Uploading to {output_dir}...", file=sys.stderr) 36 | fs.put(tmp_name, out_path) 37 | 38 | print("Deleting anndata object and temporary file...", file=sys.stderr) 39 | del adata 40 | gc.collect() 41 | os.remove(tmp_name) 42 | 43 | if __name__ == '__main__': 44 | parser = argparse.ArgumentParser(description="Process and upload h5ad files to GCP.") 45 | parser.add_argument('-i', '--input', required=True, help="Input directory containing h5ad.gz files") 46 | parser.add_argument('-o', '--output', required=True, help="Output GCP directory") 47 | parser.add_argument('-t', '--temp', required=True, help="Temporary directory") 48 | args = parser.parse_args() 49 | main(args.input, args.output, args.temp) 50 | 51 | # example 52 | # ./gcp-loader-tahoe100.py -t /scratch/multiomics/nickyoungblut/gcp-loader/ -i /processed_datasets/scRecount/tahoe -o gs://arc-ctc-tahoe100/2025-02-25/h5ad -------------------------------------------------------------------------------- /workflows/download.nf: -------------------------------------------------------------------------------- 1 | include { readAccessions; } from '../lib/download.groovy' 2 | include { joinReads; addStats; } from '../lib/utils.nf' 3 | include { SRA_STAT } from '../lib/utils.nf' 4 | 5 | workflow DOWNLOAD_WF { 6 | take: 7 | ch_accessions 8 | 9 | main: 10 | // Run prefetch & fastq-dump 11 | ch_fqdump = FASTQ_DUMP(ch_accessions) 12 | 13 | /// Merge logs 14 | FQDUMP_LOG_MERGE(ch_fqdump.log.collect()) 15 | 16 | // Join R1 and R2 channels, which will filter out empty R2 records 17 | ch_fastq = joinReads(ch_fqdump.R1, ch_fqdump.R2) 18 | 19 | emit: 20 | fastq = ch_fastq 21 | } 22 | 23 | process FQDUMP_LOG_MERGE { 24 | publishDir file(params.output_dir) / "logs", mode: "copy", overwrite: true 25 | label "download_env" 26 | 27 | input: 28 | path "*_log.csv" 29 | 30 | output: 31 | path "fq-dump_summary.csv" 32 | 33 | script: 34 | """ 35 | csv-merge.py --outfile fq-dump_summary.csv *_log.csv 36 | """ 37 | 38 | stub: 39 | """ 40 | touch fq-dump_summary.csv 41 | """ 42 | } 43 | 44 | process FASTQ_DUMP { 45 | label "download_env" 46 | 47 | input: 48 | tuple val(sample), val(accession), val(metadata), val(sra_file_size_gb) 49 | 50 | output: 51 | tuple val(sample), val(accession), val(metadata), path("reads/read_1.fastq"), emit: "R1" 52 | tuple val(sample), val(accession), val(metadata), path("reads/read_2.fastq"), emit: "R2", optional: true 53 | path "reads/fq-dump_log.csv", emit: "log" 54 | 55 | script: 56 | """ 57 | export GCP_SQL_DB_HOST="${params.db_host}" 58 | export 
GCP_SQL_DB_NAME="${params.db_name}" 59 | export GCP_SQL_DB_USERNAME="${params.db_username}" 60 | 61 | fq-dump.py \\ 62 | --sample ${sample} \\ 63 | --accession ${accession} \\ 64 | --threads ${task.cpus} \\ 65 | --bufsize 10MB \\ 66 | --curcache 50MB \\ 67 | --mem 5GB \\ 68 | --temp TMP_FILES \\ 69 | --min-read-length ${params.min_read_len} \\ 70 | --maxSpotId ${params.max_spots} \\ 71 | --outdir reads \\ 72 | ${accession} 73 | 74 | # remove the temporary files 75 | rm -rf TMP_FILES 76 | """ 77 | 78 | stub: 79 | """ 80 | mkdir -p reads 81 | touch reads/${accession}_1.fastq reads/${accession}_2.fastq 82 | """ 83 | } 84 | -------------------------------------------------------------------------------- /data/star_indices.csv: -------------------------------------------------------------------------------- 1 | organism,star_index 2 | human,/large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 3 | mouse,/large_storage/goodarzilab/public/scRecount/genomes/star2.7.11_refData_2020_mm10 4 | Macaca_mulatta,/large_storage/goodarzilab/public/scRecount/genomes/MMUL-10_scRecount/MMUL-10_scRecount 5 | Anopheles_gambiae,/scratch/multiomics/nickyoungblut/star_refs/Anopheles_gambiae/star 6 | Arabidopsis_thaliana,/scratch/multiomics/nickyoungblut/star_refs/Arabidopsis_thaliana/star 7 | Bos_taurus,/scratch/multiomics/nickyoungblut/star_refs/Bos_taurus/star 8 | Caenorhabditis_elegans,/scratch/multiomics/nickyoungblut/star_refs/Caenorhabditis_elegans/star 9 | Callithrix_jacchus,/scratch/multiomics/nickyoungblut/star_refs/Callithrix_jacchus/star 10 | Canis_lupus_familiaris,/scratch/multiomics/nickyoungblut/star_refs/Canis_lupus_familiaris/star 11 | Danio_rerio,/scratch/multiomics/nickyoungblut/star_refs/Danio_rerio/star 12 | Drosophila_melanogaster,/scratch/multiomics/nickyoungblut/star_refs/Drosophila_melanogaster/star 13 | Equus_caballus,/scratch/multiomics/nickyoungblut/star_refs/Equus_caballus/star 14 | Gallus_gallus,/scratch/multiomics/nickyoungblut/star_refs/Gallus_gallus/star 15 | Heterocephalus_glaber,/scratch/multiomics/nickyoungblut/star_refs/Heterocephalus_glaber/star 16 | Oryctolagus_cuniculus,/scratch/multiomics/nickyoungblut/star_refs/Oryctolagus_cuniculus/star 17 | Oryza_sativa,/scratch/multiomics/nickyoungblut/star_refs/Oryza_sativa/star 18 | Ovis_aries,/scratch/multiomics/nickyoungblut/star_refs/Ovis_aries/star 19 | Pan_troglodytes,/scratch/multiomics/nickyoungblut/star_refs/Pan_troglodytes/star 20 | Gorilla_gorilla,/scratch/multiomics/nickyoungblut/star_refs/Gorilla_gorilla/star 21 | Rattus_norvegicus,/scratch/multiomics/nickyoungblut/star_refs/Rattus_norvegicus/star 22 | Saccharomyces_cerevisiae,/scratch/multiomics/nickyoungblut/star_refs/Saccharomyces_cerevisiae/star 23 | Schistosoma_mansoni,/scratch/multiomics/nickyoungblut/star_refs/Schistosoma_mansoni/star 24 | Solanum_lycopersicum,/scratch/multiomics/nickyoungblut/star_refs/Solanum_lycopersicum/star 25 | Sus_scrofa,/scratch/multiomics/nickyoungblut/star_refs/Sus_scrofa/star 26 | Troglodytes_gorilla,/scratch/multiomics/nickyoungblut/star_refs/Troglodytes_gorilla/star 27 | Xenopus_tropicalis,/scratch/multiomics/nickyoungblut/star_refs/Xenopus_tropicalis/star 28 | Zea_mays,/scratch/multiomics/nickyoungblut/star_refs/Zea_mays/star -------------------------------------------------------------------------------- /scripts/gcp-upload/bin/agg-obs-metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 
6 | import argparse 7 | from uuid import uuid4 8 | from pathlib import Path 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | from pypika import Query, Table, Criterion 13 | ## package 14 | from db_utils import db_connect 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # classes 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # functions 24 | def parse_arguments() -> argparse.Namespace: 25 | """ 26 | Parse command-line arguments. 27 | """ 28 | desc = 'Publish database results as parquet files.' 29 | epi = """DESCRIPTION: 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument( 33 | 'csv_files', type=str, help="csv files", nargs='+' 34 | ) 35 | parser.add_argument( 36 | '--feature-type', default='GeneFull_Ex50pAS', 37 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto'], 38 | help='Feature type to process' 39 | ) 40 | return parser.parse_args() 41 | 42 | def merge_csv_files(csv_files: List[str], feature_type: str): 43 | """ 44 | Load all CSV files into memory and write them together 45 | """ 46 | outdir = os.path.join("metadata_TMP", feature_type) 47 | os.makedirs(outdir, exist_ok=True) 48 | outfile = os.path.join(outdir, f"{uuid4()}.csv.gz") 49 | 50 | # Load all dataframes into a list 51 | data = [] 52 | for csv_file in csv_files: 53 | logging.info(f"Processing {csv_file}...") 54 | df = pd.read_csv(csv_file) 55 | data.append(df) 56 | 57 | # Concatenate all dataframes and write to file 58 | pd.concat(data, axis=0, ignore_index=True).to_csv(outfile, index=False, compression='gzip') 59 | logging.info(f"Saved merged csv to {outfile}") 60 | 61 | def main(): 62 | """Main function to run the TileDB loader workflow.""" 63 | args = parse_arguments() 64 | 65 | # merge csv files 66 | merge_csv_files(args.csv_files, args.feature_type) 67 | 68 | if __name__ == "__main__": 69 | #from dotenv import load_dotenv 70 | #load_dotenv(override=True) 71 | main() -------------------------------------------------------------------------------- /scripts/gcp-upload/config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = "tmp/work" 22 | process { 23 | errorStrategy = "terminate" 24 | maxRetries = 0 25 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 26 | } 27 | } 28 | slurm { 29 | workDir = getWorkDir() 30 | conda.cacheDir = getCondaCacheDir() 31 | cleanup = true 32 | executor.queueSize = 300 33 | process { 34 | executor = "slurm" 35 | queue = "cpu_batch" 36 | errorStrategy = "retry" 37 | maxRetries = 3 38 | resourceLimits = [ cpus: 24, memory: 900.GB, time: 72.h ] 39 | } 40 | } 41 | dev { 42 | params { 43 | input_dir = "/processed_datasets/scRecount/scRecounter/prod3" 44 | log_dir = "tmp/logs" 45 | output_dir = 
"gs://arc-ctc-nextflow/gcp-loader/output/" 46 | max_datasets = 3 47 | db_name = "sragent-prod" 48 | update_db = false 49 | } 50 | } 51 | report { 52 | report { 53 | enabled = true 54 | overwrite = true 55 | file = "${params.log_dir}/nf-report/${params.timestamp}.html" 56 | } 57 | } 58 | trace { 59 | trace { 60 | enabled = true 61 | overwrite = true 62 | file = "${params.log_dir}/nf-trace/${params.timestamp}.txt" 63 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 64 | } 65 | } 66 | } 67 | 68 | 69 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = "tmp/work" 22 | process { 23 | errorStrategy = "terminate" 24 | maxRetries = 0 25 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 26 | } 27 | } 28 | slurm { 29 | workDir = getWorkDir() 30 | conda.cacheDir = getCondaCacheDir() 31 | cleanup = true 32 | executor.queueSize = 30 33 | process { 34 | executor = "slurm" 35 | queue = "cpu_batch" 36 | errorStrategy = "retry" // "terminate" 37 | maxRetries = 2 38 | resourceLimits = [ cpus: 24, memory: 900.GB, time: 72.h ] 39 | } 40 | } 41 | dev { 42 | params { 43 | input_dir = "/processed_datasets/scRecount/scRecounter/tmp/prod_tmp" 44 | db_uri = "/scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod_tmp" 45 | mtx_batch_size = 8 46 | h5ad_batch_size = 4 47 | max_datasets = 50 48 | } 49 | } 50 | report { 51 | report { 52 | enabled = true 53 | overwrite = true 54 | file = "${params.log_dir}/nf-report/${params.timestamp}.html" 55 | } 56 | } 57 | trace { 58 | trace { 59 | enabled = true 60 | overwrite = true 61 | file = "${params.log_dir}/nf-trace/${params.timestamp}.txt" 62 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 63 | } 64 | } 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /bin/subsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import gzip 8 | import argparse 9 | import logging 10 | from time import sleep 11 | from subprocess import Popen, PIPE 12 | 13 | 14 | # logging 15 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Subsample reads' 23 | epi = """DESCRIPTION: 24 | Subsample reads from a fastq file. 25 | Just taking the first N reads from the head of the file. 26 | gzip input fastq files are supported. 
27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('fastq_file', type=str, nargs='+', 31 | help='file(s) to subsample') 32 | parser.add_argument('--num-seqs', type=int, default=100000, 33 | help='Number of sequences to subsample') 34 | parser.add_argument('--out-file', type=str, default='subsampled.fastq', 35 | help='Output file') 36 | 37 | # functions 38 | def subsample(infile: str, num_seqs: int, outF, is_gzip: bool=False) -> None: 39 | # use gzip if file is gzipped 40 | if is_gzip: 41 | _open = gzip.open 42 | else: 43 | _open = open 44 | # subsample 45 | with _open(infile, 'r') as inF: 46 | for ii,line in enumerate(inF, 1): 47 | # decode if gzip 48 | if is_gzip: 49 | line = line.decode('utf-8') 50 | # write 51 | outF.write(line) 52 | if ii / 4 >= num_seqs: 53 | return None 54 | 55 | def main(args): 56 | # divide num_seqs by number of files 57 | num_files = len(args.fastq_file) 58 | num_seqs = int(args.num_seqs / num_files) 59 | 60 | # loop through each file 61 | with open(args.out_file, 'w') as outF: 62 | for i, infile in enumerate(args.fastq_file, 1): 63 | logging.info(f'Processing file {i}/{num_files}: {infile}') 64 | # subsample 65 | try: 66 | subsample(infile, num_seqs, outF) 67 | except UnicodeDecodeError: 68 | subsample(infile, num_seqs, outF, is_gzip=True) 69 | 70 | # status 71 | logging.info(f'Output written to: {args.out_file}') 72 | 73 | ## script main 74 | if __name__ == '__main__': 75 | args = parser.parse_args() 76 | main(args) -------------------------------------------------------------------------------- /scripts/tiledb-loader/README.md: -------------------------------------------------------------------------------- 1 | tiledb loader 2 | ============= 3 | 4 | A simple Nextflow pipeline for efficiently loading single-cell data into a TileDB-SOMA database. 
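
Under the hood, loading an h5ad file into the TileDB-SOMA database follows the standard `tiledbsoma.io` append pattern (register the incoming AnnData against the experiment, resize, then ingest), the same calls used by this repo's loader scripts. A minimal sketch is shown below; the paths and file names are illustrative only, and the `RNA` measurement and `obs_id`/`var_id` field names mirror the repo's loader code rather than defining an official interface (see the workflow outline that follows for where this step fits).

```python
import scanpy as sc
import tiledbsoma
import tiledbsoma.io

db_uri = "/path/to/tiledb-soma"          # existing TileDB-SOMA experiment (illustrative path)
adata = sc.read_h5ad("SRX123456.h5ad")   # h5ad produced by the MTX -> h5ad step (illustrative file)

# register the new AnnData against the experiment
rd = tiledbsoma.io.register_anndatas(
    db_uri, [adata],
    measurement_name="RNA",
    obs_field_name="obs_id",
    var_field_name="var_id",
)

# grow the experiment to fit the new obs/var, then ingest
with tiledbsoma.Experiment.open(db_uri) as exp:
    tiledbsoma.io.resize_experiment(exp.uri, nobs=rd.get_obs_shape(), nvars=rd.get_var_shapes())
tiledbsoma.io.from_anndata(db_uri, adata, measurement_name="RNA", registration_mapping=rd)
```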
5 | 6 | Workflow: 7 | * Find new datasets (SRX accessions) 8 | * For each batch of datasets: 9 | * Convert MTX to h5ad 10 | * Load h5ad into TileDB-SOMA database 11 | 12 | 13 | 14 | # Dev 15 | 16 | Local run 17 | 18 | ```bash 19 | nextflow run main.nf -profile conda,vm,dev -resume 20 | ``` 21 | 22 | Slurm run 23 | 24 | ```bash 25 | nextflow run main.nf -profile conda,slurm,dev -resume 26 | ``` 27 | 28 | ## Test prod 29 | 30 | ```bash 31 | nextflow run main.nf -profile conda,vm \ 32 | --max_datasets 8 \ 33 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp \ 34 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 35 | ``` 36 | 37 | ```bash 38 | nextflow run main.nf -profile conda,slurm \ 39 | --max_datasets 8 \ 40 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp \ 41 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 42 | ``` 43 | 44 | ## Test scale 45 | 46 | ```bash 47 | rm -rf /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_prod3_tmp 48 | ``` 49 | 50 | ```bash 51 | nextflow run main.nf -profile conda,slurm,report,trace \ 52 | --max_datasets 100 \ 53 | --mtx_batch_size 4 \ 54 | --h5ad_batch_size 4 \ 55 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_4-4 \ 56 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 57 | ``` 58 | 59 | Time: 10m 15s 60 | 61 | ```bash 62 | nextflow run main.nf -profile conda,slurm,report,trace \ 63 | --max_datasets 100 \ 64 | --mtx_batch_size 20 \ 65 | --h5ad_batch_size 4 \ 66 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_20-4 \ 67 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 68 | ``` 69 | 70 | Time: 9m 55s (from-memory) 71 | Time: 11m (from-disk) 72 | 73 | ```bash 74 | nextflow run main.nf -profile conda,slurm,report,trace \ 75 | --max_datasets 100 \ 76 | --mtx_batch_size 50 \ 77 | --h5ad_batch_size 4 \ 78 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_50-4 \ 79 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 80 | ``` 81 | 82 | Time: 10m 25s 83 | 84 | 85 | ```bash 86 | nextflow run main.nf -profile conda,slurm,report,trace \ 87 | --max_datasets 100 \ 88 | --mtx_batch_size 25 \ 89 | --h5ad_batch_size 2 \ 90 | --db_uri /scratch/multiomics/nickyoungblut/tiledb-loader/tiledb_tmp_25-2 \ 91 | --input_dir /processed_datasets/scRecount/scRecounter/prod3 92 | ``` 93 | 94 | Time: `TODO` 95 | 96 | 97 | # Backups 98 | 99 | ```console 100 | ~/tmp/tiledb/db_bkup 101 | /large_storage/multiomics/projects/tiledb_bkup 102 | ``` -------------------------------------------------------------------------------- /scripts/gcp-upload/bin/db-to-parquet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | from uuid import uuid4 8 | from pathlib import Path 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | from pypika import Query, Table, Criterion 13 | ## package 14 | from db_utils import db_connect 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # classes 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # functions 24 | def parse_arguments() -> argparse.Namespace: 25 | """ 26 | Parse command-line arguments. 
27 | """ 28 | desc = 'Publish database results as parquet files.' 29 | epi = """DESCRIPTION: 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument( 33 | '--feature-type', default='GeneFull_Ex50pAS', 34 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto'], 35 | help='Feature type to process' 36 | ) 37 | return parser.parse_args() 38 | 39 | 40 | def load_scbasecamp_metadata(feature_type: str) -> pd.DataFrame: 41 | """ 42 | Load metadata from scBasecamp database. 43 | Args: 44 | feature_type: Feature type to filter on. 45 | Returns: 46 | DataFrame with metadata. 47 | """ 48 | logging.info("Obtaining scbasecamp metadata...") 49 | 50 | # get metadata from scRecounter postgresql database 51 | srx_metadata = Table("scbasecamp_metadata") 52 | stmt = ( 53 | Query 54 | .from_(srx_metadata) 55 | .select("*") 56 | .where(srx_metadata.feature_type == feature_type) 57 | ) 58 | with db_connect() as conn: 59 | metadata = pd.read_sql(str(stmt), conn) 60 | return metadata.drop(columns=['created_at', 'updated_at']) 61 | 62 | def main(): 63 | """Main function to run the TileDB loader workflow.""" 64 | args = parse_arguments() 65 | 66 | # Load metadata 67 | metadata = load_scbasecamp_metadata(feature_type=args.feature_type) 68 | 69 | ## split by organism and save to parquet 70 | for organism, df in metadata.groupby('organism'): 71 | logging.info(f"Processing metadata for {organism}...") 72 | organism_str = organism.replace(" ", "_") 73 | # create directory 74 | out_dir = Path("metadata") / Path(args.feature_type) / Path(organism_str) 75 | out_dir.mkdir(parents=True, exist_ok=True) 76 | # write to parquet 77 | outfile = out_dir / 'sample_metadata.parquet.gz' 78 | df.to_parquet(outfile, index=False, compression='gzip') 79 | logging.info(f"Saved metadata for {organism} to {outfile}") 80 | 81 | if __name__ == "__main__": 82 | main() -------------------------------------------------------------------------------- /data/accessions_all-org.csv: -------------------------------------------------------------------------------- 1 | sample,accession,entrez_id,organism 2 | SRX20288331,SRR24503416,27709479,Anopheles_gambiae 3 | SRX19498702,SRR23613944,26779668,Arabidopsis_thaliana 4 | SRX19992927,SRR24196182,26767425,Bos_taurus 5 | SRX27335695,SRR31980676,36879050,Caenorhabditis_elegans 6 | SRX23995681,SRR28390731,32301733,Callithrix_jacchus 7 | SRX24164706,SRR28565449,32475612,Canis_lupus_familiaris 8 | SRX26737879,SRR31364130,36125471,Danio_rerio 9 | ERX5671941,ERR6032665,15014737,Drosophila_melanogaster 10 | SRX26348969,SRR30946435,35575331,Equus_caballus 11 | SRX21269173,SRR25539693,28706133,Gallus_gallus 12 | ERX6700420,ERR7133201,17777602,Gorilla_gorilla 13 | ERX10138362,ERR10669350,30605311,Heterocephalus_glaber 14 | ERX9511946,ERR9970924,24063361,Oryctolagus_cuniculus 15 | SRX22683047,SRR26990030,30698812,Oryza_sativa 16 | SRX16872040,SRR20852785,23639073,Ovis_aries 17 | ERX3512851,ERR3491595,9149889,Pan_troglodytes 18 | SRX26473837,SRR31090734,34046074,Rattus_norvegicus 19 | ERX4639423,ERR4769579,12367094,Schistosoma_mansoni 20 | SRX15090985,SRR19019104,21568481,Solanum_lycopersicum 21 | SRX23732262,SRR28085873,32021971,Sus_scrofa 22 | ERX5927607,ERR6295317,15346129,Xenopus_tropicalis 23 | SRX19052019,SRR23099821,26222706,Zea_mays 24 | SRX19498738,SRR23614060,26779704,Arabidopsis_thaliana 25 | SRX17163012,SRR21151334,23968863,Bos_taurus 26 | SRX27335698,SRR31980670,36879053,Caenorhabditis_elegans 27 
| ERX9648260,ERR10111034,29618243,Callithrix_jacchus 28 | SRX17820910,SRR21831760,24747069,Canis_lupus_familiaris 29 | SRX22195232,SRR26491369,30150470,Danio_rerio 30 | SRX15014603,SRR18937045,21480053,Drosophila_melanogaster 31 | SRX26348970,SRR30946434,35575332,Equus_caballus 32 | SRX22821312,SRR27139512,30863257,Gallus_gallus 33 | ERX6700421,ERR7133209,17777603,Gorilla_gorilla 34 | ERX10213964,ERR10763189,30604755,Heterocephalus_glaber 35 | ERX9511931,ERR9970495,24063321,Oryctolagus_cuniculus 36 | SRX22985906,SRR27308612,31059128,Oryza_sativa 37 | SRX19482704,SRR23597689,26761938,Ovis_aries 38 | SRX18295180,SRR22321952,25305686,Pan_troglodytes 39 | SRX21325315,SRR25597644,28762465,Rattus_norvegicus 40 | ERX11749473,ERR12372788,31020827,Schistosoma_mansoni 41 | SRX15090986,SRR19019100,21568482,Solanum_lycopersicum 42 | SRX22722562,SRR27030753,30756300,Sus_scrofa 43 | SRX19052018,SRR23099822,26222705,Zea_mays 44 | SRX16110579,SRR20072545,22798499,Arabidopsis_thaliana 45 | SRX16110572,SRR20072552,22798492,Arabidopsis_thaliana 46 | SRX19885394,SRR24084458,27255941,Caenorhabditis_elegans 47 | SRX20684355,SRR24923839,28114027,Caenorhabditis_elegans 48 | SRX24172447,SRR28572865,32483421,Drosophila_melanogaster 49 | ERX5671946,ERR6032679,15014742,Drosophila_melanogaster 50 | SRX23498637,SRR27835303,31746997,Equus_caballus 51 | SRX23498644,SRR27835295,31747004,Equus_caballus 52 | SRX21646415,SRR25926818,29301782,Gallus_gallus 53 | SRX20765161,SRR25009824,28195890,Gallus_gallus 54 | ERX3512833,ERR3491577,9149737,Pan_troglodytes 55 | ERX3512988,ERR3491732,9149904,Pan_troglodytes -------------------------------------------------------------------------------- /bin/upload-final-star-params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | from db_utils import db_connect, db_upsert 12 | 13 | # logging 14 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 15 | 16 | # argparse 17 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 18 | argparse.RawDescriptionHelpFormatter): 19 | pass 20 | 21 | desc = 'Upload final STAR parameters to the scRecounter database' 22 | epi = """DESCRIPTION: 23 | Upload final STAR parameters to the scRecounter database. 
24 | """ 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 26 | formatter_class=CustomFormatter) 27 | parser.add_argument('--sample', type=str, default=None, 28 | help='Sample name') 29 | parser.add_argument('--barcodes', type=str, default=None, 30 | help='Barcodes file path') 31 | parser.add_argument('--star-index', type=str, default=None, 32 | help='STAR index path') 33 | parser.add_argument('--cell-barcode-length', type=int, default=None, 34 | help='Cell barcode length') 35 | parser.add_argument('--umi-length', type=int, default=None, 36 | help='UMI length') 37 | parser.add_argument('--strand', type=str, default=None, 38 | help='Strandness') 39 | parser.add_argument('--outfile', type=str, default="star_params.csv", 40 | help='Output file path') 41 | 42 | # functions 43 | def main(args): 44 | # set pandas display optionqs 45 | pd.set_option('display.max_columns', 30) 46 | pd.set_option('display.width', 300) 47 | 48 | # create dataframe 49 | df = pd.DataFrame({ 50 | 'sample': [args.sample], 51 | 'barcodes': [os.path.basename(args.barcodes)], 52 | 'star_index': [os.path.basename(args.star_index.rstrip("/"))], 53 | 'cell_barcode_length': [args.cell_barcode_length], 54 | 'umi_length': [args.umi_length], 55 | 'strand': [args.strand] 56 | }) 57 | 58 | # write to file 59 | if os.path.exists(args.outfile): 60 | os.remove(args.outfile) 61 | df.to_csv(args.outfile, index=False) 62 | 63 | # upload to the scRecounter database 64 | with db_connect() as conn: 65 | db_upsert(df, "screcounter_star_params", conn) 66 | 67 | # update screcounter log 68 | log_df = pd.DataFrame({ 69 | "sample": [args.sample], 70 | "accession": [""], 71 | "process": ["STAR save params"], 72 | "step": ["Final"], 73 | "status": ["Success"], 74 | "message": ["STAR final parameters saved to database"], 75 | }) 76 | with db_connect() as conn: 77 | db_upsert(log_df, "screcounter_log", conn) 78 | 79 | 80 | ## script main 81 | if __name__ == '__main__': 82 | args = parser.parse_args() 83 | main(args) -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "config/utils.config" 2 | 3 | params { 4 | accessions = "" // CSV of accessions to download 5 | barcodes = "data/barcodes.csv" // CSV listing barcode files 6 | star_indices = "data/star_indices.csv" // CSV listing STAR indices 7 | output_dir = "results" // Output directory location 8 | max_samples = 3 // Max number of samples to process, if no accessions are provided 9 | max_accessions = 1 // Max number of accessions per sample to use for STAR parameter determination 10 | max_spots = 1000000 // Max number of spots (read-pairs) for STAR param assessment 11 | fallback_max_spots = 200000000 // Max number of spots (read-pairs) if fasterq-dump fails 12 | min_read_len = 26 // Minimum read length for R1 & R2 (shorter read files will be ignored) 13 | max_sra_size = 300 // Max SRA file size in GB (determined via sra-stat); all larger will be filtered 14 | organisms = "human,mouse" // Organisms to process if pulling from the scRecounter SQL database 15 | define = false // Just define the STAR parameters for each sample 16 | fasterq_tmp = "TEMP" // Temporary directory for fasterq-dump 17 | db_host = "35.243.133.29" // scRecounter SQL database host (GCP_SQL_DB_HOST) 18 | db_name = "sragent-prod" // scRecounter SQL database name (GCP_SQL_DB_NAME) 19 | db_username = "postgres" // scRecounter SQL database username (GCP_SQL_DB_USERNAME) 20 | 
} 21 | 22 | 23 | //-- Extra configs --// 24 | includeConfig "config/process.config" 25 | includeConfig "config/profiles.config" 26 | 27 | //-- Functions --// 28 | // Remove trailing forward slashes in a string 29 | def fmtPath(path_str) { 30 | return path_str.replaceAll(/\/+$/, '') 31 | } 32 | 33 | // Limit to the max resources of the available machine 34 | def check_max(obj, type){ 35 | if(type == 'memory'){ 36 | if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1){ 37 | return params.max_memory as nextflow.util.MemoryUnit 38 | } 39 | } else if(type == 'time'){ 40 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1){ 41 | return params.max_time as nextflow.util.Duration 42 | } 43 | } else if(type == 'cpus'){ 44 | if (obj > params.max_cpus as int){ 45 | return params.max_cpus as int 46 | } 47 | } 48 | return obj 49 | } 50 | 51 | def getWorkDir() { 52 | def userGroup = "id -gn".execute().text.trim() 53 | def userName = "whoami".execute().text.trim() 54 | def workDir = "/scratch/$userGroup/$userName/nextflow-work/scRecounter" 55 | return workDir 56 | } 57 | 58 | def getCondaCacheDir() { 59 | def userName = "whoami".execute().text.trim() 60 | cacheDir = "/home/$userName/nextflow/conda-cache/scRecounter" 61 | return cacheDir 62 | } -------------------------------------------------------------------------------- /scripts/gcp-upload/README.md: -------------------------------------------------------------------------------- 1 | gcp-loader 2 | ========== 3 | 4 | A simple Nextflow pipeline for efficiently loading single-cell data as h5ad files onto GCP 5 | 6 | 7 | # Dev 8 | 9 | Local run 10 | 11 | ```bash 12 | nextflow run main.nf -profile conda,vm,dev --feature_type GeneFull -resume 13 | ``` 14 | 15 | Slurm run 16 | 17 | ```bash 18 | nextflow run main.nf -profile conda,slurm,dev -resume 19 | ``` 20 | 21 | ## prod 22 | 23 | ### GeneFull_Ex50pAS 24 | 25 | ### CZI 26 | 27 | ```bash 28 | nextflow run main.nf \ 29 | -profile conda,slurm \ 30 | --feature_type GeneFull_Ex50pAS \ 31 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 32 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 33 | ``` 34 | 35 | ### SRA 36 | 37 | ```bash 38 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull_Ex50pAS 39 | ``` 40 | 41 | ### Clean up 42 | 43 | ```bash 44 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 45 | ``` 46 | 47 | ### Velocyto 48 | 49 | ### CZI 50 | 51 | ```bash 52 | nextflow run main.nf \ 53 | -profile conda,slurm \ 54 | --feature_type Velocyto \ 55 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 56 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 57 | ``` 58 | 59 | ### SRA 60 | 61 | ```bash 62 | nextflow run main.nf -profile conda,slurm --feature_type Velocyto 63 | ``` 64 | 65 | **HERE** 66 | 67 | ### Clean up 68 | 69 | ```bash 70 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 71 | ``` 72 | 73 | ### Gene ==> REDO 74 | 75 | ### CZI 76 | 77 | ```bash 78 | nextflow run main.nf \ 79 | -profile conda,slurm \ 80 | --feature_type Gene \ 81 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 82 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 83 | ``` 84 | 85 | ### SRA 86 | 87 | ```bash 88 | nextflow run main.nf -profile conda,slurm --feature_type Gene 89 | ``` 90 | 91 | ### Clean up 92 | 93 | ```bash 94 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 95 | ``` 96 | 97 | ### GeneFull 98 | 99 | ### CZI 100 | 101 | ```bash 102 | 
nextflow run main.nf \ 103 | -profile conda,slurm \ 104 | --feature_type GeneFull \ 105 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 106 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 107 | ``` 108 | 109 | ### SRA 110 | 111 | ```bash 112 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull 113 | ``` 114 | 115 | ### Clean up 116 | 117 | ```bash 118 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 119 | ``` 120 | 121 | ### GeneFull_ExonOverIntron 122 | 123 | ### CZI 124 | 125 | ```bash 126 | nextflow run main.nf \ 127 | -profile conda,slurm \ 128 | --feature_type GeneFull_ExonOverIntron \ 129 | --organisms "Mus musculus,Homo sapiens,Macaca mulatta" \ 130 | --input_dir /processed_datasets/scRecount/cellxgene/counted_SRXs 131 | ``` 132 | 133 | ### SRA 134 | 135 | ```bash 136 | nextflow run main.nf -profile conda,slurm --feature_type GeneFull_ExonOverIntron 137 | ``` 138 | 139 | ### Clean up 140 | 141 | ```bash 142 | rm -rf /scratch/multiomics/nickyoungblut/nextflow-work/gcp-loader/ 143 | ``` 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /docker/sc-recounter-run/README.md: -------------------------------------------------------------------------------- 1 | sc-recounter-run 2 | ================ 3 | 4 | The container for running the pipeline on GCP Cloud Run Jobs. 5 | 6 | ## Setup 7 | 8 | Env vars 9 | 10 | ```bash 11 | IMG_NAME=sc-recounter-run 12 | IMG_VERSION=0.1.9 13 | REGION="us-east1" 14 | GCP_PROJECT_ID="c-tc-429521" 15 | SERVICE_ACCOUNT_EMAIL="nick-nextflow@c-tc-429521.iam.gserviceaccount.com" 16 | SERVICE_ACCOUNT_JSON="c-tc-429521-6f6f5b8ccd93.json" 17 | ``` 18 | 19 | ### Docker 20 | 21 | Build 22 | 23 | > from the base directory of the repository 24 | 25 | ```bash 26 | docker build \ 27 | --file docker/${IMG_NAME}/Dockerfile \ 28 | --build-arg CONDA_ENV_YAML=docker/${IMG_NAME}/environment.yml \ 29 | --platform linux/amd64 \ 30 | --tag ${IMG_NAME}:${IMG_VERSION} \ 31 | . 
32 | ``` 33 | 34 | Run the image (`-help`) 35 | 36 | ```bash 37 | docker run -it --rm \ 38 | -u $(id -u):$(id -g) \ 39 | -v ${PWD}:/data \ 40 | -v ${HOME}/.gcp/:/.gcp \ 41 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 42 | --platform linux/amd64 \ 43 | ${IMG_NAME}:${IMG_VERSION} \ 44 | -help 45 | ``` 46 | 47 | Run the image (`-profile`) 48 | 49 | ```bash 50 | docker run -it --rm \ 51 | -u $(id -u):$(id -g) \ 52 | -v ${PWD}:/data \ 53 | -v ${HOME}/.gcp/:/.gcp \ 54 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 55 | --platform linux/amd64 \ 56 | ${IMG_NAME}:${IMG_VERSION} \ 57 | -profile docker,gcp,gcp_dev,dev,no_acc_dev 58 | ``` 59 | 60 | Run with bash entrypoint 61 | 62 | ```bash 63 | docker run -it --rm \ 64 | -u $(id -u):$(id -g) \ 65 | -v ${PWD}:/data \ 66 | -v ${HOME}/.gcp/:/.gcp \ 67 | --env GOOGLE_APPLICATION_CREDENTIALS=/.gcp/${SERVICE_ACCOUNT_JSON} \ 68 | --entrypoint /bin/bash \ 69 | --platform linux/amd64 \ 70 | ${IMG_NAME}:${IMG_VERSION} 71 | ``` 72 | 73 | ### GCP Artifact Registry 74 | 75 | Create (if needed) 76 | 77 | ```bash 78 | DESCRIPTION="Run the scRecounter nextflow pipeline" 79 | gcloud artifacts repositories create ${IMG_NAME} \ 80 | --repository-format=docker \ 81 | --project=${GCP_PROJECT_ID} \ 82 | --location=${REGION} \ 83 | --description="${DESCRIPTION}" \ 84 | --async 85 | ``` 86 | 87 | Push 88 | 89 | ```bash 90 | docker tag ${IMG_NAME}:${IMG_VERSION} \ 91 | ${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 92 | && docker push ${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} 93 | ``` 94 | 95 | ### GCP Cloud Run Jobs 96 | 97 | Create/update the job 98 | 99 | ```bash 100 | JOB_NAME="${IMG_NAME}" 101 | gcloud beta run jobs update ${JOB_NAME} \ 102 | --service-account=${SERVICE_ACCOUNT_EMAIL} \ 103 | --project=${GCP_PROJECT_ID} \ 104 | --region=${REGION} \ 105 | --image=${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 106 | --set-env-vars=TZ=America/Los_Angeles \ 107 | --cpu=2 \ 108 | --memory=2Gi \ 109 | --task-timeout=4320m \ 110 | --max-retries=0 \ 111 | --args="docker","gcp","report","trace" 112 | ``` 113 | 114 | Non-human/mouse genomes 115 | 116 | ```bash 117 | JOB_NAME="${IMG_NAME}-all-org" 118 | gcloud beta run jobs update ${JOB_NAME} \ 119 | --service-account=${SERVICE_ACCOUNT_EMAIL} \ 120 | --project=${GCP_PROJECT_ID} \ 121 | --region=${REGION} \ 122 | --image=${REGION}-docker.pkg.dev/${GCP_PROJECT_ID}/${IMG_NAME}/${IMG_NAME}:${IMG_VERSION} \ 123 | --set-env-vars=TZ=America/Los_Angeles \ 124 | --cpu=2 \ 125 | --memory=2Gi \ 126 | --task-timeout=4320m \ 127 | --max-retries=0 \ 128 | --args="docker","gcp","all_org","report","trace" 129 | ``` 130 | 131 | -------------------------------------------------------------------------------- /bin/format-star-params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | 12 | # logging 13 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 14 | 15 | # argparse 16 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 17 | argparse.RawDescriptionHelpFormatter): 18 | pass 19 | 20 | desc = 'Set STAR parameters for each sample.' 
21 | epi = """DESCRIPTION: 22 | The script reads the STAR summary CSV file and determines 23 | the STAR parameters, based on the number of valid barcodes 24 | for each parameter set among the test STAR runs. 25 | """ 26 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 27 | formatter_class=CustomFormatter) 28 | parser.add_argument('star_summary_csv', type=str, 29 | help='Path to the STAR summary CSV file') 30 | parser.add_argument('--sample', type=str, default=None, 31 | help='Sample name') 32 | parser.add_argument('--accession', type=str, default=None, 33 | help='Accession name') 34 | parser.add_argument('--strand', type=str, default=None, 35 | help='Strandness') 36 | parser.add_argument('--barcodes-name', type=str, default=None, 37 | help='Barcodes name') 38 | parser.add_argument('--barcodes-file', type=str, default=None, 39 | help='Barcodes file path') 40 | parser.add_argument('--cell-barcode-length', type=int, default=None, 41 | help='Cell barcode length') 42 | parser.add_argument('--umi-length', type=int, default=None, 43 | help='UMI length') 44 | parser.add_argument('--organism', type=str, default=None, 45 | help='Organism') 46 | parser.add_argument('--star-index', type=str, default=None, 47 | help='STAR index path') 48 | parser.add_argument('--outfile', type=str, default="star_params.csv", 49 | help='Output file path') 50 | 51 | # functions 52 | def main(args): 53 | # set pandas display optionqs 54 | pd.set_option('display.max_columns', 30) 55 | pd.set_option('display.width', 300) 56 | 57 | # create param table 58 | star_params = { 59 | "sample" : args.sample, 60 | "accession" : args.accession, 61 | "strand" : args.strand, 62 | "barcodes_name" : args.barcodes_name, 63 | "barcodes_file" : args.barcodes_file, 64 | "cell_barcode_length" : args.cell_barcode_length, 65 | "umi_length" : args.umi_length, 66 | "organism" : args.organism, 67 | "star_index" : args.star_index 68 | } 69 | # convert to dataframe 70 | star_params = pd.DataFrame([star_params]) 71 | 72 | # print to stderr 73 | #print("#-- raw dataframe --#", file=sys.stderr) 74 | #star_params.to_csv(sys.stderr, index=False) 75 | 76 | # read star summary 77 | star_summary = pd.read_csv(args.star_summary_csv, header=None) 78 | star_summary["sample"] = args.sample 79 | star_summary["accession"] = args.accession 80 | # pivot 81 | star_summary = star_summary.pivot(index=["sample", "accession"], columns=0, values=1) 82 | star_summary["sample"] = star_summary.index.get_level_values('sample') 83 | star_summary["accession"] = star_summary.index.get_level_values('accession') 84 | star_summary.reset_index(drop=True, inplace=True) 85 | 86 | # merge dataframes on sample and accession 87 | star_params = star_params.merge(star_summary, on=["sample", "accession"], how="inner") 88 | 89 | # print to stderr 90 | #print("#-- final dataframe --#", file=sys.stderr) 91 | #star_params.to_csv(sys.stderr, index=False) 92 | 93 | # write to file 94 | star_params.to_csv(args.outfile, index=False) 95 | 96 | 97 | ## script main 98 | if __name__ == '__main__': 99 | args = parser.parse_args() 100 | main(args) -------------------------------------------------------------------------------- /bin/star-summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | ## batteries 4 | from __future__ import print_function 5 | import os 6 | import re 7 | import sys 8 | import argparse 9 | import logging 10 | from typing import List, Dict, Any, Tuple 11 | import pandas as pd 12 | from 
db_utils import db_connect, db_upsert 13 | 14 | # logging 15 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Summarize STAR summary files' 23 | epi = """DESCRIPTION: 24 | Summarize STAR summary files into a single table. The summary files 25 | are generated by the STAR aligner and contain information about the 26 | alignment statistics. The script reads in all summary files and 27 | concatenates them into a single table. The table is then written to 28 | a file and upserted into the database. 29 | """ 30 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 31 | formatter_class=CustomFormatter) 32 | parser.add_argument('summary_csv', type=str, nargs='+', 33 | help='STAR summary csv file(s)') 34 | parser.add_argument('--sample', type=str, default="", 35 | help='Sample name') 36 | parser.add_argument('--outfile', type=str, default="Summary.csv", 37 | help='Output file') 38 | 39 | # functions 40 | def main(args): 41 | # set pandas display optionqs 42 | pd.set_option('display.max_columns', 50) 43 | pd.set_option('display.max_rows', 100) 44 | pd.set_option('display.width', 300) 45 | 46 | # read in all summary csv files and concatenate 47 | df = [] 48 | regex = re.compile(r"_summary.csv$") 49 | for infile in args.summary_csv: 50 | x = pd.read_csv(infile, header=None) 51 | x.columns = ["category", "value"] 52 | x["feature"] = regex.sub("", os.path.basename(infile)) 53 | df.append(x) 54 | df = pd.concat(df) 55 | 56 | # status 57 | logging.info(f"Number of rows in the raw table: {df.shape[0]}") 58 | 59 | # format category 60 | for x in ["Gene", "GeneFull", "GeneFull_Ex50pAS", "GeneFull_ExonOverIntron", "Velocyto"]: 61 | regex = re.compile(f" {x} ") 62 | df["category"] = df["category"].apply(lambda x: regex.sub(" feature ", x)) 63 | 64 | # pivot table 65 | df = df.pivot(index='feature', columns='category', values='value').reset_index() 66 | 67 | # format columns: no spaces and lowercase 68 | df.columns = df.columns.str.replace(r'\W', '_', regex=True).str.lower() 69 | 70 | # coerce columns to numeric 71 | for col in df.columns.to_list(): 72 | if col != "feature": 73 | df[col] = pd.to_numeric(df[col], errors='coerce') 74 | 75 | # float columns to integer 76 | cols_to_convert = ["estimated_number_of_cells", "number_of_reads", "umis_in_cells"] 77 | for col in cols_to_convert: 78 | if col in df.columns: 79 | df[col] = df[col].fillna(0).replace([float('inf'), -float('inf')], 0).astype(int) 80 | 81 | # add sample name 82 | df["sample"] = args.sample 83 | 84 | # status 85 | logging.info(f"Number of rows after formattings: {df.shape[0]}") 86 | 87 | # upsert results to database 88 | logging.info("Updating screcounter_star_results...") 89 | with db_connect() as conn: 90 | db_upsert(df, "screcounter_star_results", conn) 91 | 92 | # write output table 93 | outdir = os.path.dirname(args.outfile) 94 | if outdir != "": 95 | os.makedirs(outdir, exist_ok=True) 96 | df.to_csv(args.outfile, index=False) 97 | 98 | # update screcounter log 99 | logging.info("Updating screcounter_log...") 100 | log_df = pd.DataFrame({ 101 | "sample": [args.sample], 102 | "accession": [""], 103 | "process": ["STAR-full"], 104 | "step": ["Final"], 105 | "status": ["Success"], 106 | "message": ["STAR summary table generated"] 107 | }) 108 | with db_connect() as conn: 109 | db_upsert(log_df, "screcounter_log", conn) 110 | 111 | 112 | ## script 
main 113 | if __name__ == '__main__': 114 | args = parser.parse_args() 115 | main(args) 116 | 117 | -------------------------------------------------------------------------------- /scripts/gcp-find-soft-delete.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | from datetime import datetime 7 | import pandas as pd 8 | from google.cloud import storage 9 | 10 | 11 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 12 | pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | """ 16 | Parse command-line arguments. 17 | Returns: 18 | argparse.Namespace containing arguments. 19 | """ 20 | desc = 'List all files in a bucket that are designated as soft-delete' 21 | epi = """DESCRIPTION: 22 | 23 | """ 24 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 25 | parser.add_argument('gcs_bucket', type=str, 26 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/)') 27 | # parser.add_argument('--min-date-time', type=str, default='2025-01-13_00-00-00', 28 | # help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 29 | # parser.add_argument('--max-date-time', type=str, default='2025-01-15_00-00-00', 30 | # help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 31 | return parser.parse_args() 32 | 33 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 34 | """ 35 | Parse a GCP bucket path. 36 | Args: 37 | gs_path: GCP bucket path starting with gs:// 38 | Returns: 39 | A tuple of (bucket_name, prefix). 40 | """ 41 | if not gs_path.startswith("gs://"): 42 | raise ValueError("Path must start with 'gs://'") 43 | parts = gs_path[5:].split("/", 1) 44 | bucket_name = parts[0] 45 | prefix = parts[1] if len(parts) > 1 else "" 46 | return bucket_name, prefix.rstrip("/") + "/" 47 | 48 | def list_soft_deleted_files(bucket: storage.Bucket) -> List[Dict[str, str]]: 49 | """ 50 | List all files in a GCP bucket that are designated as soft-deleted 51 | Args: 52 | bucket: A GCP bucket object. 53 | Returns: 54 | A list of dictionaries containing the name and generation of soft-deleted files. 55 | """ 56 | # List all blobs in the bucket 57 | blobs = bucket.list_blobs(versions=True) 58 | 59 | # Dictionary to track the latest generation of each object 60 | latest_generations = {} 61 | 62 | # First pass: determine the latest generation for each object 63 | print("First pass: determine the latest generation for each object", file=sys.stderr) 64 | for blob in blobs: 65 | if blob.name not in latest_generations: 66 | latest_generations[blob.name] = blob.generation 67 | else: 68 | latest_generations[blob.name] = max(latest_generations[blob.name], blob.generation) 69 | 70 | ## status 71 | print(f"Num blobs: {len(latest_generations)}", file=sys.stderr) 72 | 73 | # Second pass: collect non-current versions 74 | print("Second pass: collect non-current versions", file=sys.stderr) 75 | soft_deleted_files = [] 76 | blobs = bucket.list_blobs(versions=True) 77 | for blob in blobs: 78 | try: 79 | if blob.generation < latest_generations[blob.name]: 80 | soft_deleted_files.append({"name": blob.name, "generation": blob.generation}) 81 | except KeyError: 82 | pass 83 | 84 | return soft_deleted_files 85 | 86 | def main(args: argparse.Namespace) -> None: 87 | """ 88 | Main function that: 89 | 1. 
Lists all files in a GCP bucket that are designated as soft-deleted 90 | 91 | Args: 92 | args: An argparse.Namespace holding command-line arguments. 93 | """ 94 | # Format arg date/time strings 95 | #min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 96 | #max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 97 | 98 | # Parse GCP bucket path 99 | bucket_name, path_prefix = parse_gs_path(args.gcs_bucket) 100 | 101 | # Initialize GCP client and bucket 102 | client = storage.Client() 103 | bucket = client.bucket(bucket_name) 104 | 105 | # list soft-deleted files 106 | soft_del_files = list_soft_deleted_files(bucket) 107 | print(soft_del_files) 108 | 109 | 110 | if __name__ == "__main__": 111 | from dotenv import load_dotenv 112 | load_dotenv() 113 | args = parse_args() 114 | main(args) -------------------------------------------------------------------------------- /bin/sra-stat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from time import sleep 10 | from shutil import which 11 | from typing import Dict, List 12 | from subprocess import Popen, PIPE 13 | import xml.etree.ElementTree as ET 14 | import pandas as pd 15 | 16 | # logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # argparse 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 21 | argparse.RawDescriptionHelpFormatter): 22 | pass 23 | 24 | desc = 'Run sra-tools sra-stat' 25 | epi = """DESCRIPTION: 26 | Run sra-tools sra-stat with handling of errors and formatting of the output 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('accession', type=str, help='SRA accession') 31 | parser.add_argument('--tries', type=int, default=5, 32 | help='Number of tries to download') 33 | parser.add_argument('--outfile', type=str, default='sra-stat.csv', 34 | help='Output file') 35 | 36 | # functions 37 | def run_cmd(cmd: str) -> tuple: 38 | """ 39 | Run sub-command and return returncode, output, and error. 40 | Args: 41 | cmd: Command to run 42 | Returns: 43 | tuple: (returncode, output, error) 44 | """ 45 | logging.info(f'Running: {cmd}') 46 | p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) 47 | output, err = p.communicate() 48 | return p.returncode, output, err 49 | 50 | def run_sra_stat(accession: str, tries: int=5) -> pd.DataFrame: 51 | """ 52 | Run prefetch with error handling. 53 | Args: 54 | accession: SRA accession 55 | tries: Number of tries 56 | Returns: 57 | 58 | """ 59 | cmd = f'sra-stat --xml --quick {accession}' 60 | for i in range(tries): 61 | logging.info(f'Attempt: {i+1}/{tries}') 62 | rc,output,err = run_cmd(cmd) 63 | if rc == 0: 64 | return output 65 | else: 66 | logging.error('Download failed') 67 | logging.error(err) 68 | # sleep prior to next attempt 69 | sleep_time = 10 * (i + 1) 70 | logging.info(f'Sleeping for {sleep_time} seconds...') 71 | sleep(sleep_time) 72 | return None 73 | 74 | def parse_sra_stats(xml_string: str) -> Dict: 75 | """Parse SRA statistics XML and return key metrics. 
76 | 77 | Args: 78 | xml_string: XML string containing SRA run statistics 79 | 80 | Returns: 81 | Dictionary containing parsed statistics 82 | """ 83 | # Parse XML string 84 | root = ET.fromstring(xml_string) 85 | 86 | # Get run-level attributes 87 | stats = { 88 | 'accession': root.get('accession'), 89 | 'spot_count': int(root.get('spot_count')), 90 | 'base_count': int(root.get('base_count')) 91 | } 92 | 93 | # Get file size 94 | size_elem = root.find('Size') 95 | if size_elem is not None: 96 | file_size = int(size_elem.get('value')) 97 | file_size_units = size_elem.get('units') 98 | if file_size and file_size_units: 99 | # convert to Gb 100 | if file_size_units == 'bytes': 101 | file_size = file_size / 1e9 102 | elif file_size_units == 'kilobytes': 103 | file_size = file_size / 1e6 104 | elif file_size_units == 'megabytes': 105 | file_size = file_size / 1e3 106 | stats['file_size_gb'] = file_size 107 | else: 108 | stats['file_size_gb'] = 10 # default size (Gb), if no found 109 | 110 | # convert to pandas dataframe 111 | return pd.DataFrame(stats, index=[0]) 112 | 113 | def main(args): 114 | # check for prefetch in path 115 | for exe in ['sra-stat']: 116 | if not which(exe): 117 | logging.error(f'{exe} not found in PATH') 118 | sys.exit(1) 119 | 120 | # run sra-state 121 | data = run_sra_stat(args.accession, args.tries) 122 | if not data: 123 | logging.error('sra-stat failed') 124 | sys.exit(1) 125 | 126 | # parse sra-stat output 127 | stats = parse_sra_stats(data) 128 | 129 | # write to file 130 | stats.to_csv(args.outfile, index=False) 131 | logging.info(f'Output written to: {args.outfile}') 132 | 133 | ## script main 134 | if __name__ == '__main__': 135 | args = parser.parse_args() 136 | main(args) -------------------------------------------------------------------------------- /lib/star_params.groovy: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonSlurper 2 | 3 | def expandStarParams(ch_fastq, ch_star_params_json) { 4 | def processedSamples = [] 5 | 6 | // read the JSON file with the STAR parameters and join with the fastq channel 7 | ch_params = ch_fastq.join(ch_star_params_json, by: [0,1]) 8 | .map{ sample, accession, metadata, read1, read2, json_file -> 9 | processedSamples << [sample, accession] 10 | def params = new JsonSlurper().parseText(json_file.text) 11 | def barcodes_file = params.barcodes_file 12 | def star_index = params.star_index 13 | def cell_barcode_length = params.cell_barcode_length 14 | def umi_length = params.umi_length 15 | def strand = params.strand 16 | return [sample, accession, 17 | barcodes_file, star_index, 18 | cell_barcode_length, umi_length, strand] 19 | } 20 | 21 | // status on number of parameter combinations 22 | ch_params.ifEmpty{ 23 | println "WARNING: No valid parameter set found for the following samples:" 24 | processedSamples.each { sampleInfo -> 25 | println "- Sample: ${sampleInfo[0]}, Accession: ${sampleInfo[1]}" 26 | } 27 | } 28 | return ch_params 29 | } 30 | 31 | def makeParamSets(ch_subsample, ch_barcodes, ch_star_indices) { 32 | // pairwise combine the subsample, barcodes and star indices channels 33 | ch_params = ch_subsample 34 | .combine(Channel.of("Forward", "Reverse")) 35 | .combine(ch_barcodes) 36 | .combine(ch_star_indices) 37 | .map { sample, accession, metadata, r1, r2, strand, barcodes_name, cb_len, umi_len, barcodes_file, organism, star_index -> 38 | if (metadata["organism"] != "" & metadata["organism"] != organism) { 39 | return null 40 | } 41 | def params = [ 42 | 
sample: sample, 43 | accession: accession, 44 | strand: strand, 45 | barcodes_name: barcodes_name, 46 | cell_barcode_length: cb_len, 47 | umi_length: umi_len, 48 | barcodes_file: barcodes_file, 49 | organism: organism, 50 | star_index: star_index 51 | ] 52 | return [sample, accession, metadata, r1, r2, barcodes_file, star_index, params] 53 | } 54 | .filter { it != null } 55 | 56 | // status on number of parameter combinations 57 | ch_params 58 | .ifEmpty("No valid parameter set found") 59 | .count().view{ count -> "Param sets to test across all SRX: ${count}" } 60 | return ch_params 61 | } 62 | 63 | def validateRequiredColumns(row, required) { 64 | // check if all required columns are present in the input CSV file 65 | def missing = required.findAll { !row.containsKey(it) } 66 | if (missing) { 67 | error "Missing columns in the input CSV file: ${missing}" 68 | } 69 | } 70 | 71 | def loadBarcodes(params) { 72 | // load the barcodes from the input CSV file 73 | ch_barcodes = Channel 74 | .fromPath(params.barcodes, checkIfExists: true) 75 | .splitCsv(header: true) 76 | .map { row -> 77 | def req_columns = ["name", "cell_barcode_length", "umi_length", "file_path"] 78 | validateRequiredColumns(row, req_columns) 79 | // remove special characters 80 | row.name = row.name.replaceAll("\\s", "_") 81 | return [ 82 | row.name, 83 | row.cell_barcode_length.toInteger(), 84 | row.umi_length.toInteger(), 85 | row.file_path 86 | ] 87 | } 88 | // status on number of barcodes 89 | ch_barcodes 90 | .ifEmpty("No barcodes found in the input CSV file") 91 | .count().view{ count -> "Number of input barcodes: ${count}" } 92 | return ch_barcodes 93 | } 94 | 95 | def loadStarIndices(params) { 96 | // load the STAR indices from the input CSV file 97 | ch_indices = Channel 98 | .fromPath(params.star_indices, checkIfExists: true) 99 | .splitCsv(header: true) 100 | .map { row -> 101 | def req_columns = ["organism", "star_index"] 102 | validateRequiredColumns(row, req_columns) 103 | // remove special characters 104 | row.organism = row.organism.replaceAll("\\s", "_") 105 | return [row.organism, row.star_index] 106 | } 107 | // status on number of star indices 108 | ch_indices 109 | .ifEmpty("No star indices found in the input CSV file") 110 | .count().view{ count -> "Number of input star indices: ${count}" } 111 | return ch_indices 112 | } -------------------------------------------------------------------------------- /scripts/gcp-upload/main.nf: -------------------------------------------------------------------------------- 1 | workflow { 2 | // find target MTX files to add to the database 3 | FIND_MTX() 4 | 5 | // list target MTX files 6 | mtx_files = FIND_MTX.out.csv 7 | .splitCsv( header: true ) 8 | .map{ row -> 9 | tuple(row.srx, file(row.matrix_path), file(row.features_path), file(row.barcodes_path)) 10 | } 11 | 12 | // group Velocyto MTX files by SRX 13 | if( params.feature_type == "Velocyto"){ 14 | mtx_files = mtx_files.groupTuple().map{ group -> 15 | tuple(group[0], group[1], group[2][0], group[3][0]) 16 | } 17 | } 18 | 19 | // convert to h5ad and publish 20 | MTX_TO_H5AD( mtx_files, Channel.fromPath(params.tissue_categories) ) 21 | 22 | // write parquet after all MTX_TO_H5AD jobs complete 23 | if( params.update_db ){ 24 | DB_TO_PARQUET( MTX_TO_H5AD.out.h5ad.collect() ) 25 | } 26 | 27 | // aggregate obs metadata 28 | AGG_OBS_METADATA( MTX_TO_H5AD.out.csv.collate(100) ) 29 | } 30 | 31 | process AGG_OBS_METADATA { 32 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: 
"metadata_TMP/${params.feature_type}/*.csv.gz" 33 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 34 | label "process_low" 35 | 36 | input: 37 | path csv_files 38 | 39 | output: 40 | path "metadata_TMP/${params.feature_type}/*.csv.gz", emit: obs_meta 41 | path "agg-obs-metadata.log", emit: log 42 | 43 | script: 44 | """ 45 | agg-obs-metadata.py \\ 46 | --feature-type ${params.feature_type} \\ 47 | ${csv_files} 2>&1 | tee agg-obs-metadata.log 48 | """ 49 | } 50 | 51 | process DB_TO_PARQUET { 52 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "metadata/${params.feature_type}/*/sample_metadata.parquet.gz" 53 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 54 | label "process_low" 55 | 56 | input: 57 | path csv_files 58 | 59 | output: 60 | path "metadata/${params.feature_type}/*/sample_metadata.parquet.gz", emit: samp_meta 61 | path "db-to-parquet.log", emit: log 62 | 63 | script: 64 | """ 65 | export GCP_SQL_DB_HOST="${params.db_host}" 66 | export GCP_SQL_DB_NAME="${params.db_name}" 67 | export GCP_SQL_DB_USERNAME="${params.db_username}" 68 | 69 | db-to-parquet.py \\ 70 | --feature-type ${params.feature_type} \\ 71 | 2>&1 | tee db-to-parquet.log 72 | """ 73 | } 74 | 75 | process MTX_TO_H5AD { 76 | publishDir file(params.output_dir), mode: "copy", overwrite: true, pattern: "h5ad/${params.feature_type}/*/*.h5ad.gz" 77 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 78 | errorStrategy { task.attempt <= maxRetries ? 'retry' : 'ignore' } 79 | label "process_low" 80 | maxForks 200 81 | 82 | input: 83 | tuple val(srx), path(mtx_path), path(features_path), path(barcodes_path) 84 | each path(tissue_categories) 85 | 86 | output: 87 | path "h5ad/${params.feature_type}/*/${srx}.h5ad.gz", emit: h5ad 88 | path "metadata/${srx}.csv.gz", emit: csv 89 | path "mtx-to-h5ad_${srx}.log", emit: log 90 | 91 | script: 92 | def update_db = params.update_db ? "--update-database" : "" 93 | """ 94 | export GCP_SQL_DB_HOST="${params.db_host}" 95 | export GCP_SQL_DB_NAME="${params.db_name}" 96 | export GCP_SQL_DB_USERNAME="${params.db_username}" 97 | 98 | mtx-to-h5ad.py ${update_db} \\ 99 | --feature-type ${params.feature_type} \\ 100 | --missing-metadata "${params.missing_metadata}" \\ 101 | --tissue-categories "${tissue_categories}" \\ 102 | --srx ${srx} \\ 103 | --matrix ${mtx_path} \\ 104 | --publish-path "${params.output_dir}" \\ 105 | 2>&1 | tee mtx-to-h5ad_${srx}.log 106 | """ 107 | } 108 | 109 | process FIND_MTX { 110 | publishDir file(params.log_dir) / params.feature_type, mode: "copy", overwrite: true, pattern: "*.log" 111 | label "process_low" 112 | 113 | output: 114 | path "mtx_files.csv", emit: csv 115 | path "find-mtx.log", emit: log 116 | 117 | script: 118 | def organisms = params.organisms != "" ? "--organisms \"${params.organisms}\"" : "" 119 | def redo_processed = params.redo_processed.toString() == "true" ? 
"--redo-processed" : "" 120 | """ 121 | export GCP_SQL_DB_HOST="${params.db_host}" 122 | export GCP_SQL_DB_NAME="${params.db_name}" 123 | export GCP_SQL_DB_USERNAME="${params.db_username}" 124 | 125 | find-mtx.py ${organisms} ${redo_processed} \\ 126 | --feature-type ${params.feature_type} \\ 127 | --max-datasets ${params.max_datasets} \\ 128 | ${params.input_dir} \\ 129 | 2>&1 | tee find-mtx.log 130 | """ 131 | } -------------------------------------------------------------------------------- /config/profiles.config: -------------------------------------------------------------------------------- 1 | profiles { 2 | conda { 3 | conda.enabled = true 4 | conda.useMamba = false 5 | docker.enabled = false 6 | singularity.enabled = false 7 | podman.enabled = false 8 | shifter.enabled = false 9 | charliecloud.enabled = false 10 | } 11 | docker { 12 | docker.enabled = true 13 | docker.sudo = false 14 | docker.runOptions = "-u \$(id -u):\$(id -g) --platform=linux/amd64" 15 | singularity.enabled = false 16 | podman.enabled = false 17 | shifter.enabled = false 18 | charliecloud.enabled = false 19 | } 20 | vm { 21 | workDir = getWorkDir() 22 | conda.cacheDir = getCondaCacheDir() 23 | process { 24 | errorStrategy = "terminate" 25 | maxRetries = 0 26 | resourceLimits = [ cpus: 24, memory: 96.GB, time: 72.h ] 27 | } 28 | } 29 | slurm { 30 | executor.queueSize = 30 31 | process { 32 | executor = "slurm" 33 | queue = "cpu_batch" 34 | errorStrategy = "retry" // "terminate" 35 | maxRetries = 1 36 | resourceLimits = [ cpus: 24, memory: 800.GB, time: 72.h ] 37 | } 38 | } 39 | gcp { 40 | workDir = "gs://arc-ctc-nextflow/scRecounter/prod/work" 41 | fusion.enabled = false 42 | wave.enabled = false 43 | executor { 44 | queueSize = 200 45 | pollInterval = "15 sec" 46 | } 47 | params { 48 | barcodes = "data/gcp/barcodes.csv" 49 | star_indices = "data/gcp/star_indices.csv" 50 | fasterq_tmp = "/tmp/TEMP" 51 | } 52 | process { 53 | executor = "google-batch" 54 | errorStrategy = "retry" 55 | maxRetries = 2 56 | scratch = true 57 | resourceLimits = [ cpus: 36, memory: 700.GB, time: 120.h ] 58 | } 59 | google { 60 | project = "c-tc-429521" 61 | location = "us-east1" 62 | batch { 63 | serviceAccountEmail = "nick-nextflow@c-tc-429521.iam.gserviceaccount.com" 64 | spot = true 65 | maxSpotAttempts = 3 66 | bootDiskSize = 150.GB 67 | } 68 | storage { 69 | multiplier = 2.0 70 | } 71 | } 72 | } 73 | dev { 74 | params { 75 | min_read_len = 20 76 | db_name = "sragent-test" 77 | fallback_max_spots = 10000000 78 | } 79 | } 80 | vm_dev { 81 | params { 82 | barcodes = "data/barcodes_n2.csv" 83 | star_indices = "data/star_indices.csv" 84 | } 85 | } 86 | slurm_dev { 87 | params { 88 | barcodes = "data/barcodes_n2.csv" 89 | star_indices = "data/star_indices.csv" 90 | } 91 | } 92 | gcp_dev { 93 | workDir = "gs://arc-ctc-nextflow/scRecounter/dev/work/" 94 | params { 95 | //barcodes = "data/gcp/barcodes_n2.csv" 96 | barcodes = "data/gcp/barcodes.csv" 97 | star_indices = "data/gcp/star_indices.csv" 98 | } 99 | } 100 | acc_dev { 101 | params { 102 | accessions = "data/accessions_small_n2.csv" 103 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/acc-n2" 104 | } 105 | } 106 | acc_dev_problems { 107 | params { 108 | accessions = "data/accessions_problems.csv" 109 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/acc-problems" 110 | } 111 | } 112 | acc_all_org { 113 | params { 114 | accessions = "data/accessions_all-org.csv" 115 | star_indices = "data/star_indices_all-org.csv" 116 | output_dir = 
"/scratch/multiomics/nickyoungblut/scRecounter/acc_all-org" 117 | define = true 118 | } 119 | } 120 | no_acc_dev { 121 | params { 122 | accessions = "" 123 | output_dir = "gs://arc-ctc-nextflow/scRecounter/dev/results/no-acc" 124 | } 125 | } 126 | all_org { 127 | params { 128 | organisms = "Macaca mulatta,Anopheles gambiae,Arabidopsis thaliana,Bos taurus,Caenorhabditis elegans,Callithrix jacchus,Canis lupus familiaris,Danio rerio,Drosophila melanogaster,Equus caballus,Gallus gallus,Gorilla gorilla,Heterocephalus glaber,Oryctolagus cuniculus,Oryza sativa,Ovis aries,Pan troglodytes,Rattus norvegicus,Saccharomyces cerevisiae,Schistosoma mansoni,Solanum lycopersicum,Sus scrofa,Xenopus tropicalis,Zea mays" 129 | } 130 | } 131 | report { 132 | report { 133 | enabled = true 134 | overwrite = true 135 | file = "${params.output_dir}/nf-report/${params.timestamp}.html" 136 | } 137 | } 138 | trace { 139 | trace { 140 | enabled = true 141 | overwrite = true 142 | file = "${params.output_dir}/nf-trace/${params.timestamp}.txt" 143 | fields = "task_id,hash,native_id,name,status,exit,submit,container,cpus,time,disk,memory,attempt,submit,duration,realtime,%cpu,peak_rss,peak_vmem,rchar,wchar,workdir,scratch" 144 | } 145 | } 146 | } 147 | 148 | 149 | -------------------------------------------------------------------------------- /lib/utils.groovy: -------------------------------------------------------------------------------- 1 | def readStarParams(star_params_file){ 2 | // read the input CSV file and check if all required columns are present 3 | return Channel 4 | .fromPath(star_params_file, checkIfExists: true) 5 | .splitCsv(header: true, sep: ',') 6 | .map { row -> 7 | def req_columns = ["sample", "fastq_1", "fastq_2", "barcodes_file", "star_index", 8 | "cell_barcode_length", "umi_length", "strand"] 9 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 10 | if (miss_columns) { 11 | error "Missing columns in the input CSV file: ${miss_columns}" 12 | } 13 | // remove special characters from the sample name 14 | row.sample = row.sample.replaceAll("\\s", "_") 15 | return [row.sample, row.fastq_1, row.fastq_2, row.barcodes_file, row.star_index, 16 | row.cell_barcode_length, row.umi_length, row.strand] 17 | } 18 | } 19 | 20 | def readAccessions(accessions_input){ 21 | // read the input accessions CSV file and check if all required columns are present 22 | ch_acc = accessions_input 23 | .splitCsv(header: true, sep: ",") 24 | .map { row -> 25 | def req_columns = ["sample", "accession"] 26 | def miss_columns = req_columns.findAll { !row.containsKey(it) } 27 | if (miss_columns) { 28 | error "Missing columns in the input CSV file: ${miss_columns}" 29 | } 30 | // remove special characters from the sample name 31 | row.sample = row.sample.replaceAll("\\s", "_") 32 | def result = [row.sample, row.accession] 33 | // add optional, metadata columns 34 | def metadata = [:] 35 | ["organism", "tech_10x"].each { col -> 36 | metadata[col] = row.containsKey(col) ? 
row[col].replaceAll("\\s", "_") : "" 37 | } 38 | result << metadata 39 | return result 40 | } 41 | 42 | // print srx values 43 | ch_acc 44 | .map{ sample, accession, metadata -> sample } 45 | .distinct() 46 | .collect() 47 | .map{ it.join(',') } 48 | .view{ "SRX accessions: ${it}" } 49 | 50 | return ch_acc 51 | } 52 | 53 | def addStats(ch_accessions, ch_sra_stat){ 54 | // add file size information to the accessions 55 | ch_stats = ch_sra_stat 56 | .map{ sample,acc,csv -> csv } 57 | .splitCsv(header: true, sep: ",") 58 | .map{ row -> [row.accession, row.file_size_gb.toDouble()] } 59 | .join(ch_sra_stat.map{ sample,acc,csv -> [acc, sample] }, by: [0]) 60 | .map{ acc, size, sample -> [sample, acc, size] } 61 | return ch_accessions.join(ch_stats, by: [0,1]) // sample, acc, metadata, size 62 | } 63 | 64 | def joinReads(ch_read1, ch_read2){ 65 | // extract metadata to prevent incorrect joining 66 | ch_metadata = ch_read1.map{ sample,accession,metadata,fastq -> [sample,accession,metadata] } 67 | 68 | // join the read1 and read2 channels 69 | return ch_read1 70 | .map{ sample,accession,metadata,fastq -> [sample,accession,fastq] } 71 | .join( 72 | ch_read2.map{ sample,accession,metadata,fastq -> [sample,accession,fastq] }, 73 | by: [0,1] 74 | ) 75 | .join( 76 | ch_metadata, by: [0,1] 77 | ) 78 | .map{ 79 | sample,accession,fastq1,fastq2,metadata -> [sample,accession,metadata,fastq1,fastq2] 80 | } 81 | } 82 | 83 | def saveAsLog(filename, sample=null, accession=null) { 84 | if (filename.endsWith(".log")) { 85 | def basename = filename.tokenize("/")[-1] 86 | def path = "logs" 87 | if (sample){ 88 | path = "${path}/${sample}" 89 | } 90 | if (accession) { 91 | path = "${path}/${accession}" 92 | } 93 | path = "${path}/${basename}" 94 | return path 95 | } 96 | return null 97 | } 98 | 99 | def subsampleByGroup(ch_accessions, max_per_group, seed) { 100 | ch_accessions 101 | .groupTuple() 102 | .map { samples, accessions, meta, sra_stat -> 103 | accessions = accessions.toList() 104 | meta = meta.toList() 105 | sra_stat = sra_stat.toList() 106 | 107 | if (accessions) { // Ensure lists are not empty 108 | def indices = (0..<accessions.size()).toList() 109 | Collections.shuffle(indices, new Random(seed)) 110 | 111 | // reorder each list with the shuffled indices 112 | def shuffledAcc = indices.collect { accessions[it] } 113 | def shuffledMeta = indices.collect { meta[it] } 114 | def shuffledStat = indices.collect { sra_stat[it] } 115 | 116 | // keep at most max_per_group accessions per sample 117 | def n = Math.min(max_per_group, shuffledAcc.size()) 118 | accessions = shuffledAcc.size() > 0 ? shuffledAcc[0..<n] : [] 119 | meta = shuffledMeta.size() > 0 ? shuffledMeta[0..<n] : [] 120 | sra_stat = shuffledStat.size() > 0 ? shuffledStat[0..<n] : [] 121 | } 122 | 123 | [samples, accessions, meta, sra_stat] 124 | } 125 | .flatMap { samples, accessions, meta, sra_stat -> 126 | def flattened = [] 127 | for (int i = 0; i < accessions.size(); i++) { 128 | flattened << [samples, accessions[i], meta[i], sra_stat[i]] 129 | } 130 | flattened 131 | } 132 | } -------------------------------------------------------------------------------- /scripts/tiledb-loader-tahoe100.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import gc 6 | import sys 7 | import logging 8 | import argparse 9 | from glob import glob 10 | from typing import List, Set, Tuple, Optional 11 | ## 3rd party 12 | import pandas as pd 13 | import tiledbsoma 14 | import tiledbsoma.io 15 | import scanpy as sc 16 | 17 | # format logging 18 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 19 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 20 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 21 | logging.getLogger("tiledb").setLevel(logging.WARNING) 22 | 23 | # classes 24 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 25 | pass 26 | 27 | # functions 28 | def parse_arguments() -> argparse.Namespace: 29 | """ 30 | Parse command-line arguments. 31 | """ 32 | desc = 'Convert Tahoe-100 dataset to TileDB format.' 
33 | epi = """DESCRIPTION: 34 | Test example: 35 | ./scripts/tiledb-loader-tahoe.py --db-uri ~/dev/nextflow/scRecounter/tmp/tiledb/srx3/tiledb-soma ~/dev/nextflow/scRecounter/tmp/tiledb/srx3/ 36 | 37 | Production (scRecounter): 38 | ./scripts/tiledb-loader-tahoe.py --h5ad-ext h5ad.gz --db-uri /processed_datasets/scRecount/tahoe/tiledb-soma /processed_datasets/scRecount/tahoe/ 39 | """ 40 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 41 | parser.add_argument( 42 | 'base_dir', type=str, help='Base directory to search for input data files' 43 | ) 44 | parser.add_argument( 45 | '--db-uri', type=str, default="tiledb-soma", 46 | help='URI of existing TileDB database, or it will be created if it does not exist' 47 | ) 48 | parser.add_argument( 49 | '--h5ad-ext', type=str, default="h5ad", 50 | help='File extension (suffix) for h5ad files' 51 | ) 52 | parser.add_argument( 53 | '--max-datasets', type=int, default=None, 54 | help='Maximum number of datasets to process' 55 | ) 56 | return parser.parse_args() 57 | 58 | 59 | def find_matrix_files( 60 | base_dir: str, 61 | file_ext: str, 62 | max_datasets: Optional[int]=None 63 | ) -> List[tuple]: 64 | """ 65 | Recursively find matrix.mtx.gz files and extract SRX IDs. 66 | Args: 67 | base_dir: Base directory to search 68 | max_datasets: Maximum number of datasets to process 69 | Returns: 70 | List of tuples (matrix_path, srx_id) 71 | """ 72 | logging.info(f"Searching for new data files in {base_dir}...") 73 | h5ad_files = glob(f"{base_dir}/*{file_ext}") 74 | if max_datasets: 75 | h5ad_files = h5ad_files[:max_datasets] 76 | 77 | logging.info(f" Found {len(h5ad_files)} new data files to process.") 78 | return h5ad_files 79 | 80 | 81 | def append_to_database(db_uri: str, adata: sc.AnnData) -> None: 82 | """ 83 | Append an AnnData object to the TileDB database. 84 | Args: 85 | db_uri: URI of the TileDB database 86 | adata: AnnData object to append 87 | """ 88 | logging.info(" Appending data...") 89 | 90 | # Register AnnData objects 91 | rd = tiledbsoma.io.register_anndatas( 92 | db_uri, 93 | [adata], 94 | measurement_name="RNA", 95 | obs_field_name="obs_id", 96 | var_field_name="var_id", 97 | ) 98 | 99 | # Apply resize 100 | with tiledbsoma.Experiment.open(db_uri) as exp: 101 | tiledbsoma.io.resize_experiment( 102 | exp.uri, 103 | nobs=rd.get_obs_shape(), 104 | nvars=rd.get_var_shapes() 105 | ) 106 | 107 | # Ingest new data into the db 108 | tiledbsoma.io.from_anndata( 109 | db_uri, 110 | adata, 111 | measurement_name="RNA", 112 | registration_mapping=rd, 113 | ) 114 | 115 | def create_tiledb(db_uri: str, adata: sc.AnnData) -> None: 116 | """ 117 | Create a new tiledb database. 
118 | Args: 119 | db_uri: URI of the TileDB database 120 | adata: AnnData object to append 121 | """ 122 | logging.info(f" Creating new database...") 123 | tiledbsoma.io.from_anndata( 124 | db_uri, 125 | adata, 126 | measurement_name="RNA", 127 | ) 128 | 129 | def load_tiledb(h5ad_files: List[str], db_uri: str) -> None: 130 | """ 131 | Load all h5ad files into TileDB-SOMA database 132 | Args: 133 | h5ad_files: List of h5ad files to load 134 | db_uri: URI of the TileDB database 135 | """ 136 | for infile in h5ad_files: 137 | logging.info(f"Processing {infile}...") 138 | 139 | # load anndata object 140 | adata = sc.read_h5ad(infile) 141 | ## format obs and var 142 | if not "obs_id" in adata.obs.columns: 143 | adata.obs["obs_id"] = adata.obs.index 144 | if not "var_id" in adata.var.columns: 145 | adata.var["var_id"] = adata.var.index 146 | 147 | # add to database 148 | if not os.path.exists(db_uri): 149 | create_tiledb(db_uri, adata) 150 | else: 151 | append_to_database(db_uri, adata) 152 | 153 | # clear memory 154 | del adata 155 | gc.collect() 156 | 157 | def main(): 158 | """Main function to run the TileDB loader workflow.""" 159 | args = parse_arguments() 160 | 161 | # Find all matrix files and their corresponding SRX IDs 162 | h5ad_files = find_matrix_files( 163 | args.base_dir, 164 | args.h5ad_ext, 165 | max_datasets=args.max_datasets 166 | ) 167 | 168 | # Load data into memory and append to TileDB 169 | load_tiledb(h5ad_files, args.db_uri) 170 | 171 | 172 | if __name__ == "__main__": 173 | from dotenv import load_dotenv 174 | load_dotenv(override=True) 175 | main() 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /bin/parallel-fastq-dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys, os, shutil, tempfile, subprocess, argparse, logging 3 | 4 | __version__ = "0.6.7" 5 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG) 6 | 7 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 8 | 9 | desc = "parallel fastq-dump wrapper, extra args will be passed through" 10 | epi = """DESCRIPTION: 11 | Example: parallel-fastq-dump --sra-id SRR2244401 --threads 4 --outdir out/ --split-files --gzip 12 | """ 13 | 14 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 15 | parser.add_argument("-s","--sra-id", help="SRA id", action="append") 16 | parser.add_argument("-t","--threads", help="number of threads", default=1, type=int) 17 | parser.add_argument("-O","--outdir", help="output directory", default=".") 18 | parser.add_argument("-T","--tmpdir", help="temporary directory", default=None) 19 | parser.add_argument("-N","--minSpotId", help="Minimum spot id", default=1, type=int) 20 | parser.add_argument("-X","--maxSpotId", help="Maximum spot id", default=None, type=int) 21 | parser.add_argument("-V","--version", help="shows version", action="store_true", default=False) 22 | 23 | def pfd(args: argparse.Namespace, srr_id: str, extra_args: list[str]) -> None: 24 | """Parallel fastq-dump. 25 | Args: 26 | args: Parsed command-line arguments. 27 | srr_id: Identifier for the SRA run. 28 | extra_args: Additional arguments to pass to fastq-dump. 
29 | """ 30 | tmp_dir = tempfile.TemporaryDirectory(prefix="pfd_", dir=args.tmpdir) 31 | logging.info(f"tempdir: {tmp_dir.name}") 32 | n_spots = get_spot_count(srr_id) 33 | logging.info(f"{srr_id} spots: {n_spots}") 34 | start = max(args.minSpotId, 1) 35 | end = min(args.maxSpotId, n_spots) if args.maxSpotId is not None else n_spots 36 | blocks = split_blocks(start, end, args.threads) 37 | logging.info(f"blocks: {blocks}") 38 | ps = [] 39 | for i in range(args.threads): 40 | d = os.path.join(tmp_dir.name, str(i)) 41 | os.mkdir(d) 42 | cmd = ["fastq-dump","-N",str(blocks[i][0]),"-X",str(blocks[i][1]),"-O",d]+extra_args+[srr_id] 43 | logging.info(f"CMD: {' '.join(cmd)}") 44 | p = subprocess.Popen(cmd) 45 | ps.append(p) 46 | wfd = {} 47 | for i,p in enumerate(ps): 48 | exit_code = p.wait() 49 | if exit_code != 0: 50 | logging.warning(f"fastq-dump error! exit code: {exit_code}") 51 | sys.exit(1) 52 | tmp_path = os.path.join(tmp_dir.name, str(i)) 53 | for fo in os.listdir(tmp_path): 54 | if fo not in wfd: wfd[fo] = open(os.path.join(args.outdir, fo), "wb") 55 | with open(os.path.join(tmp_path, fo), "rb") as fd: 56 | shutil.copyfileobj(fd, wfd[fo]) 57 | os.remove(os.path.join(tmp_path, fo)) 58 | for fd in wfd.values(): fd.close() 59 | 60 | def split_blocks(start: int, end: int, n_pieces: int) -> list[list[int]]: 61 | """Split a range of spot IDs into smaller blocks. 62 | Args: 63 | start: The first spot ID. 64 | end: The last spot ID. 65 | n_pieces: Number of blocks to split into. 66 | Returns: 67 | A list of lists, where each sub-list is [block_start, block_end]. 68 | """ 69 | total = end - start + 1 70 | avg = total // n_pieces 71 | out = [] 72 | last = start 73 | for i in range(n_pieces): 74 | out.append([last, last + avg - 1]) 75 | last += avg 76 | if i == n_pieces - 1: out[i][1] += total % n_pieces 77 | return out 78 | 79 | def get_spot_count(sra_id: str) -> int: 80 | """Get spot count using sra-stat. 81 | Args: 82 | sra_id: Identifier for the SRA run. 83 | Returns: 84 | Total number of spots in the specified SRA. 85 | """ 86 | cmd = ["sra-stat","--meta","--quick",sra_id] 87 | logging.info(f"CMD: {' '.join(cmd)}") 88 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 89 | stdout, stderr = p.communicate() 90 | txt = stdout.decode().rstrip().split("\n") 91 | total = 0 92 | try: 93 | for l in txt: total += int(l.split("|")[2].split(":")[0]) 94 | except IndexError: 95 | msg = "sra-stat output parsing error!\n--sra-stat STDOUT--\n{}\n--sra-stat STDERR--\n{}" 96 | raise IndexError(msg.format("\n".join(txt), stderr.decode().rstrip())) 97 | return total 98 | 99 | def partition(f, l: list) -> tuple[list, list]: 100 | """Partition a list into two groups based on a predicate. 101 | Args: 102 | f: A function that returns True or False for a given element. 103 | l: The list to be partitioned. 104 | Returns: 105 | A tuple of two lists: (matching, not_matching). 106 | """ 107 | r = ([],[]) 108 | for i in l: r[0 if f(i) else 1].append(i) 109 | return r 110 | 111 | def is_sra_file(path: str) -> bool: 112 | """Check if a file path is potentially an SRA file. 113 | Args: 114 | path: File path. 115 | Returns: 116 | True if the file is recognized as SRA-related, otherwise False. 
117 | """ 118 | f = os.path.basename(path) 119 | if f.lower().endswith(".sra"): return True 120 | if any(x in f.upper() for x in ["SRR","ERR","DRR"]): return True 121 | return False 122 | 123 | def main() -> None: 124 | """Main entry point to parse arguments and run parallel fastq-dump.""" 125 | args, extra = parser.parse_known_args() 126 | if args.version: 127 | print(f"parallel-fastq-dump : {__version__}") 128 | subprocess.Popen(["fastq-dump","-V"]).wait() 129 | sys.exit(0) 130 | elif args.sra_id: 131 | extra_srrs, extra_args = partition(is_sra_file, extra) 132 | args.sra_id.extend(extra_srrs) 133 | logging.info(f"SRR ids: {args.sra_id}") 134 | logging.info(f"extra args: {extra_args}") 135 | if not os.path.isdir(args.outdir) and args.outdir != ".": 136 | os.makedirs(args.outdir) 137 | if args.tmpdir and not os.path.isdir(args.tmpdir) and args.tmpdir != ".": 138 | os.makedirs(args.tmpdir) 139 | for si in args.sra_id: pfd(args, si, extra_args) 140 | else: 141 | parser.print_help() 142 | sys.exit(1) 143 | 144 | if __name__ == "__main__": 145 | main() -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/mtx-to-h5ad.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | import concurrent.futures 8 | from pathlib import Path 9 | from itertools import chain, repeat 10 | from typing import List, Set, Tuple, Optional 11 | ## 3rd party 12 | import numpy as np 13 | import scipy.sparse 14 | import pandas as pd 15 | import tiledbsoma 16 | import tiledbsoma.io 17 | import scanpy as sc 18 | from pypika import Query, Table 19 | ## package 20 | from db_utils import db_connect 21 | 22 | # format logging 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 25 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 26 | logging.getLogger("tiledb").setLevel(logging.WARNING) 27 | 28 | # classes 29 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 30 | pass 31 | 32 | # functions 33 | def parse_arguments() -> argparse.Namespace: 34 | """ 35 | Parse command-line arguments. 36 | """ 37 | desc = 'Convert mtx files to h5ad.' 38 | epi = """DESCRIPTION: 39 | Convert mtx files to h5ad in parallel. 40 | """ 41 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 42 | parser.add_argument( 43 | '--srx', type=str, help="SRX accessions", required=True 44 | ) 45 | parser.add_argument( 46 | '--path', type=str, help="Path to matrix.mtx.gz files", required=True 47 | ) 48 | parser.add_argument( 49 | '--missing-metadata', type=str, default="error", 50 | choices=["error", "skip", "allow"], 51 | help="How do handle missing metadata?" 52 | ) 53 | parser.add_argument( 54 | '--threads', type=int, default=8, help="Number of threads to use" 55 | ) 56 | return parser.parse_args() 57 | 58 | 59 | def load_matrix_as_anndata( 60 | srx_id: str, 61 | matrix_path: str, 62 | missing_metadata: str="error", 63 | ) -> sc.AnnData: 64 | """ 65 | Load a matrix.mtx.gz file as an AnnData object. 
66 | Args: 67 | srx_id: SRX accession 68 | matrix_path: Path to matrix.mtx.gz file 69 | missing_metadata: How to handle missing metadata 70 | Returns: 71 | AnnData object 72 | """ 73 | # get metadata from scRecounter postgresql database 74 | srx_metadata = Table("srx_metadata") 75 | stmt = ( 76 | Query 77 | .from_(srx_metadata) 78 | .select( 79 | srx_metadata.lib_prep, 80 | srx_metadata.tech_10x, 81 | srx_metadata.organism, 82 | srx_metadata.tissue, 83 | srx_metadata.disease, 84 | srx_metadata.purturbation, 85 | srx_metadata.cell_line, # TODO: add cell_prep 86 | srx_metadata.czi_collection_id, 87 | srx_metadata.czi_collection_name, 88 | ) 89 | .where(srx_metadata.srx_accession == srx_id) 90 | ) 91 | metadata = None 92 | with db_connect() as conn: 93 | metadata = pd.read_sql(str(stmt), conn) 94 | 95 | ## if metadata is not found, return None 96 | if metadata is None or metadata.shape[0] == 0: 97 | if missing_metadata == "allow": 98 | logging.warning( 99 | f" Metadata not found for SRX accession {srx_id}, but `--missing-metadata allow` used" 100 | ) 101 | pass 102 | elif missing_metadata == "skip": 103 | logging.warning( 104 | f" Metadata not found for SRX accession {srx_id}, but `--missing-metadata skip` used" 105 | ) 106 | return None 107 | elif missing_metadata == "error": 108 | raise ValueError(f" Metadata not found for SRX accession {srx_id}") 109 | else: 110 | raise ValueError(f" Invalid value for `--missing-metadata`") 111 | if metadata.shape[0] > 1: 112 | raise ValueError(f"Multiple metadata entries found for SRX accession {srx_id}") 113 | 114 | # load count matrix 115 | adata = sc.read_10x_mtx( 116 | os.path.dirname(matrix_path), 117 | var_names="gene_ids", 118 | make_unique=True 119 | ) 120 | 121 | # calculate total counts 122 | if scipy.sparse.issparse(adata.X): 123 | adata.obs["gene_count"] = (adata.X > 0).sum(axis=1).A1 124 | adata.obs["umi_count"] = adata.X.sum(axis=1).A1 125 | else: 126 | adata.obs["gene_count"] = (adata.X > 0).sum(axis=1) 127 | adata.obs["umi_count"] = adata.X.sum(axis=1) 128 | adata.obs["barcode"] = adata.obs.index 129 | 130 | # append SRX to barcode to create a global-unique index 131 | adata.obs.index = adata.obs.index + f"_{srx_id}" 132 | 133 | # add metadata to adata 134 | adata.obs["SRX_accession"] = srx_id 135 | for col in metadata.columns: 136 | try: 137 | adata.obs[col] = str(metadata[col].values[0]) 138 | except IndexError: 139 | adata.obs[col] = None 140 | 141 | return adata 142 | 143 | def mtx_to_h5ad( 144 | matrix_files: str, 145 | missing_metadata: str="error", 146 | threads: int=8 147 | ) -> sc.AnnData: 148 | """ 149 | Convert a list of matrix.mtx.gz files to a single h5ad file. 
150 | """ 151 | logging.info("Loading mtx files to h5ad...") 152 | 153 | # paralle load mtx files 154 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 155 | adata = list(executor.map( 156 | lambda x: load_matrix_as_anndata( 157 | x[0], x[1], missing_metadata=missing_metadata 158 | ), 159 | matrix_files 160 | )) 161 | ## filter out empty objects 162 | adata = [a for a in adata if a is not None] 163 | 164 | ## concat 165 | adata = sc.concat(adata, join="outer") 166 | 167 | ## write to h5ad 168 | adata.write_h5ad(f"data.h5ad") 169 | logging.info(f"Saved h5ad file to data.h5ad") 170 | 171 | def parse_arg(arg: str) -> List[str]: 172 | """Parse a comma-separated argument into a list.""" 173 | return [x.strip() for x in arg.lstrip("[").rstrip("]").split(",")] 174 | 175 | def main(): 176 | """Main function to run the TileDB loader workflow.""" 177 | args = parse_arguments() 178 | 179 | # parse args 180 | mtx_files = list(zip(parse_arg(args.srx), parse_arg(args.path))) 181 | logging.info(f"mtx file count: {len(mtx_files)}") 182 | 183 | # create h5ad files 184 | mtx_to_h5ad( 185 | mtx_files, 186 | threads=args.threads, 187 | missing_metadata=args.missing_metadata 188 | ) 189 | 190 | if __name__ == "__main__": 191 | from dotenv import load_dotenv 192 | load_dotenv(override=True) 193 | main() -------------------------------------------------------------------------------- /bin/get-db-accessions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from typing import List 10 | ## 3rd party 11 | import pandas as pd 12 | import psycopg2 13 | import pandas as pd 14 | from pypika import Query, Table, Criterion 15 | from psycopg2.extras import execute_values 16 | from psycopg2.extensions import connection 17 | ## pipeline 18 | from db_utils import db_connect, db_upsert 19 | 20 | 21 | # logging 22 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 23 | 24 | # argparse 25 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 26 | argparse.RawDescriptionHelpFormatter): 27 | pass 28 | 29 | desc = 'Get SRA accessions from the scRecounter database' 30 | epi = """DESCRIPTION: 31 | Get SRA accessions from the scRecounter database. 32 | Write out the accessions csv table: 33 | - sample: SRX accession 34 | - accession: SRR accession 35 | - organism: organism name 36 | """ 37 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 38 | formatter_class=CustomFormatter) 39 | parser.add_argument('--max-srx', type=int, default=5, 40 | help='Max number of srx records to return') 41 | parser.add_argument('--database', type=str, default=["sra", "gds"], nargs="+", 42 | help='Only return records from these databases') 43 | parser.add_argument('--organisms', type=str, default="human,mouse", 44 | help='Organisms to filter by; comma-separated list') 45 | parser.add_argument('--outfile', type=str, default="accessions.csv", 46 | help='Output file name') 47 | 48 | # functions 49 | def db_get_unprocessed_records( 50 | conn: connection, 51 | process: str, 52 | database: List[str], 53 | max_srx: int=3, 54 | organisms: List[str] = ["human", "mouse"] 55 | ) -> pd.DataFrame: 56 | """ 57 | Get all suitable unprocessed SRX records, limiting by unique srx_accession values. 58 | Args: 59 | conn: Connection to the database. 60 | database: Name of the database to query. 
61 | max_srx: Maximum number of SRX records to return. 62 | Returns: 63 | dataframe of unprocessed SRX records. 64 | """ 65 | # init tables 66 | srx_metadata = Table("srx_metadata") 67 | srx_srr = Table("srx_srr") 68 | scr_log = Table("screcounter_log") 69 | 70 | # subquery to get srx_accessions 71 | ## find already-processed records in sc-recounter log 72 | nontarget_srx = ( 73 | Query 74 | .from_(scr_log) 75 | .select(scr_log.sample) 76 | .where( 77 | Criterion.all([ 78 | scr_log.process == process, 79 | scr_log.step == "Final", 80 | scr_log.status == "Success" 81 | ]) 82 | ) 83 | .distinct() 84 | ) 85 | 86 | # status 87 | num_nontarget = pd.read_sql(str(nontarget_srx), conn).shape[0] 88 | logging.info(f"No. of non-target records: {num_nontarget}") 89 | 90 | ## find unprocessed records 91 | target_srx = ( 92 | Query 93 | .from_(srx_metadata) 94 | .left_join(nontarget_srx) 95 | .on(srx_metadata.srx_accession == nontarget_srx.sample) 96 | .select(srx_metadata.srx_accession) 97 | .where( 98 | Criterion.all([ 99 | nontarget_srx.sample.isnull(), # filters out already processed records 100 | srx_metadata.database.isin(database), 101 | srx_metadata.srx_accession != "", 102 | (srx_metadata.srx_accession.like("SRX%") | srx_metadata.srx_accession.like("ERX%")), 103 | srx_metadata.is_illumina == "yes", 104 | srx_metadata.is_single_cell == "yes", 105 | srx_metadata.is_paired_end == "yes", 106 | srx_metadata.lib_prep == "10x_Genomics", 107 | srx_metadata.organism.isin(organisms), 108 | srx_metadata.czi_collection_id.isnull() | srx_metadata.czi_collection_id.isin(["", "NaN", "None"]), 109 | #~srx_metadata.tech_10x.isin(["other", "not_applicable"]) # TODO: comment to make the query more permissive 110 | ]) 111 | ) 112 | .distinct() 113 | .limit(max_srx) 114 | ) 115 | 116 | # status 117 | #df_target = pd.read_sql(str(target_srx), conn) 118 | #print(f"No. of target records: {df_target.shape[0]}") 119 | 120 | # main query to obtain the SRR for each SRX and then format the output 121 | stmt = ( 122 | Query 123 | .from_(srx_metadata) 124 | .inner_join(srx_srr) 125 | .on(srx_metadata.srx_accession == srx_srr.srx_accession) 126 | .where( 127 | srx_metadata.srx_accession.isin(target_srx) 128 | ) 129 | .select( 130 | srx_metadata.srx_accession.as_("sample"), 131 | srx_srr.srr_accession.as_("accession"), 132 | srx_metadata.organism.as_("organism"), 133 | srx_metadata.tech_10x.as_("tech_10x"), 134 | ) 135 | .distinct() 136 | ) 137 | 138 | # fetch as pandas dataframe 139 | return pd.read_sql(str(stmt), conn) 140 | 141 | def main(args): 142 | # parse organisms 143 | args.organisms = args.organisms.split(",") 144 | 145 | # set process name; used to determine which records have been processed 146 | process = "Get db accessions" 147 | 148 | # get unprocessed records 149 | with db_connect() as conn: 150 | df = db_get_unprocessed_records( 151 | conn, process, args.database, max_srx=args.max_srx, organisms=args.organisms 152 | ) 153 | 154 | # remove spaces from organism 155 | df["organism"] = df["organism"].str.replace(" ", "_") 156 | 157 | # log number of records 158 | num_unique_srx = df["sample"].nunique() 159 | logging.info(f"No. of target SRX accessions: {num_unique_srx}") 160 | num_unique_acc = df["accession"].nunique() 161 | logging.info(f"No. of target SRR accessions: {num_unique_acc}") 162 | srr_per_srx = df.groupby("sample")["accession"].count() 163 | logging.info(f"No. 
of target SRR per SRX: {srr_per_srx.to_dict()}") 164 | 165 | ## write out records 166 | df.to_csv(args.outfile, index=False) 167 | 168 | # write to log table in scRecounter database 169 | ## convert df 170 | df["process"] = process 171 | df["step"] = "Final" 172 | df["status"] = "Success" 173 | df["message"] = "Obtained database accession for processing" 174 | 175 | ## filter columns 176 | df = df[["sample", "accession", "process", "step", "status", "message"]] 177 | 178 | ## upsert log to database 179 | logging.info("Updating scRecounter log table...") 180 | with db_connect() as conn: 181 | db_upsert(df, "screcounter_log", conn) 182 | 183 | ## script main 184 | if __name__ == '__main__': 185 | args = parser.parse_args() 186 | main(args) 187 | 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scRecounter 2 | ========== 3 | 4 | A Nextflow pipeline to re-process single-cell RNA-seq data from the Sequence Read Archive. 5 | 6 | # Workflow 7 | 8 | * **User provides:** 9 | * A table of samples & associated accessions 10 | * Alternatively, the pipeline can pull accessions from the scRecounter SQL database 11 | * Associated files required: 12 | * A table of barcodes to use for cell barcode and UMI identification 13 | * A table of STAR index directories to use for mapping 14 | * **Pipeline:** 15 | * Load accessions from provided table or SQL database 16 | * For each accession: 17 | * Use `fastq-dump` to download a subset of reads as fastq files from the SRA 18 | * Determine the "best" STAR parameters by mapping the reads using various parameter combinations 19 | * Parameters: version of cell barcodes, cell barcode length, UMI length, strand, STAR reference index 20 | * The STAR parameters are selected based on the fraction of valid barcodes 21 | * Download all reads with `fasterq-dump` 22 | * If download fails, try again with `fastq-dump` using a max of `fallback_max_spots` reads (see `nextflow.config`). 23 | * Map the reads with STARsolo using the "best" STAR parameters 24 | 25 | # Manuscript 26 | 27 | **scBaseCamp: An AI agent-curated, uniformly processed, and continually expanding single cell data repository**. 28 | Nicholas D Youngblut, Christopher Carpenter, Jaanak Prashar, Chiara Ricci-Tam, Rajesh Ilango, Noam Teyssier, 29 | Silvana Konermann, Patrick Hsu, Alexander Dobin, David P Burke, Hani Goodarzi, Yusuf H Roohani. 30 | bioRxiv 2025.02.27.640494; doi: [https://doi.org/10.1101/2025.02.27.640494](https://doi.org/10.1101/2025.02.27.640494) 31 | 32 | # Installation 33 | 34 | ## Conda & mamba install 35 | 36 | `mamba` is needed to run the pipeline. 37 | It is a faster drop-in replacement for `conda`. 38 | `mamba` can be installed via `conda`. 39 | You can use `conda` instead of `mamba` if you prefer. 40 | 41 | ## Nextflow install 42 | 43 | It is easiest to install Nextflow using `mamba` (or `conda`). 44 | 45 | ```bash 46 | mamba create -n nextflow_env -c bioconda nextflow 47 | ``` 48 | 49 | Make sure to activate the environment before running the pipeline: 50 | 51 | ```bash 52 | mamba activate nextflow_env 53 | ``` 54 | 55 | All other dependencies will be installed by Nextflow. 
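To double-check the setup before moving on, you can activate the environment and print the Nextflow version (a minimal sanity check, not a required step):

```bash
# activate the environment created above
mamba activate nextflow_env

# confirm that Nextflow is available
nextflow -version
```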
56 | 57 | 58 | ## Pipeline install 59 | 60 | ### Clone the repo 61 | 62 | ```bash 63 | git clone https://github.com/ArcInstitute/scRecounter.git \ 64 | && cd scRecounter 65 | ``` 66 | 67 | ### Pipeline conda environments (if running locally) 68 | 69 | The pipeline uses conda environments to manage dependencies. 70 | Nextflow will automatically create the environments as long as `mamba` is installed. 71 | 72 | **Note:** it can take a while to create the environments, even with `mamba`. 73 | 74 | ### Pipeline Docker containers (if running on GCP) 75 | 76 | The pipeline defaults to using custom Docker containers hosted on Google Artifact Registry. 77 | 78 | You can build the Docker containers yourself. See [./docker/README.md](./docker/README.md) for details. 79 | Be sure to update the [profiles.config](./config/profiles.config) file to point to the new containers. 80 | 81 | # Usage 82 | 83 | ## Input 84 | 85 | ### Accessions table 86 | 87 | Lists the samples and their associated SRA experiment accessions. 88 | 89 | > This table is not required if the pipeline is pulling accessions from the scRecounter SQL database. 90 | To pull accessions from the database, do not provide `--accessions` via the command line. 91 | 92 | Example: 93 | 94 | | sample | accession | organism | 95 | |-------------|-------------|----------| 96 | | SRX22716300 | SRR27024456 | human | 97 | | SRX25994842 | SRR30571763 | mouse | 98 | 99 | > `organism` is optional. It will determine the STAR index to use for mapping. Otherwise all indexes will be used for parameter selection. 100 | 101 | ### Barcode table 102 | 103 | Lists all of the possible barcodes that will be used to determine the cell barcode and UMI for the samples. 104 | 105 | Example: 106 | 107 | | name | cell_barcode_length | umi_length | file_path | 108 | |------------------|---------------------|------------|--------------------------------------------------------------------------| 109 | | 737K-arc-v1 | 16 | 12 | /large_storage/goodarzilab/public/scRecount/genomes/737K-arc-v1.txt | 110 | | 737K-august-2016 | 16 | 12 | /large_storage/goodarzilab/public/scRecount/genomes/737K-august-2016.txt | 111 | | 3M-february-2018 | 16 | 10 | /large_storage/goodarzilab/public/scRecount/genomes/3M-february-2018.txt | 112 | 113 | 114 | ### STAR index table 115 | 116 | Lists the STAR index files that will be used to map the reads. 117 | 118 | Example: 119 | 120 | | Organism | Star Index Path | 121 | |----------|-----------------------------------------------------------------------------------| 122 | | human | /large_storage/goodarzilab/public/scRecount/genomes/star_refData_2020_hg38 | 123 | | mouse | /large_storage/goodarzilab/public/scRecount/genomes/star2.7.11_refData_2020_mm10 | 124 | 125 | 126 | > If `organism` is provided in the `Accessions` table, the STAR index will be selected based on the `organism` column. 127 | Thus, it reduces the number of parameter combinations that need to be tested. 
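Putting the three input tables together, a local test run that supplies them explicitly might look like the following sketch. The `--accessions`, `--barcodes`, and `--star_indices` parameter names mirror the params set in [profiles.config](./config/profiles.config), and the CSV paths below are the small example tables shipped in `data/`; adjust both to your own files:

```bash
nextflow run main.nf \
  -work-dir tmp/work \
  -profile conda,vm \
  --accessions data/accessions_small_n2.csv \
  --barcodes data/barcodes_n2.csv \
  --star_indices data/star_indices.csv \
  --output_dir results/
```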
128 | 129 | ## Nextflow run 130 | 131 | ### Test runs 132 | 133 | Local run with provided accessions: 134 | 135 | ```bash 136 | nextflow run main.nf \ 137 | -work-dir tmp/work \ 138 | -profile conda,trace,report,vm,vm_dev,dev,acc_dev 139 | ``` 140 | 141 | Local run with provided accessions (problematic datasets) 142 | 143 | ```bash 144 | nextflow run main.nf \ 145 | -work-dir tmp/work \ 146 | -profile conda,trace,report,vm,vm_dev,dev,acc_dev_problems 147 | ``` 148 | 149 | With conda, accessions pulled from scRecounter database: 150 | 151 | ```bash 152 | nextflow run main.nf \ 153 | -work-dir tmp/work \ 154 | -profile conda,trace,report,vm,vm_dev,dev,no_acc_dev 155 | ``` 156 | 157 | GCP run with provided accessions: 158 | 159 | ```bash 160 | nextflow run main.nf \ 161 | -profile docker,trace,report,gcp,gcp_dev,dev,acc_dev 162 | ``` 163 | 164 | GCP run with accessions pulled from scRecounter SQL database: 165 | 166 | ```bash 167 | nextflow run main.nf \ 168 | -profile docker,trace,report,gcp,gcp_dev,dev,no_acc_dev 169 | ``` 170 | 171 | ### Characterize datasets 172 | 173 | Use just a small subset of reads in the dataset to identify library prep method, species, etc. 174 | 175 | ```bash 176 | nextflow run /home/nickyoungblut/dev/nextflow/scRecounter/main.nf \ 177 | -work-dir gs://arc-ctc-nextflow/scRecounter/work \ 178 | -profile docker,gcp \ 179 | -ansi-log false \ 180 | --max_spots 100000 \ 181 | --output_dir gs://arc-ctc-nextflow/scRecounter/results/ \ 182 | --accessions TMP/SRX22716300.csv 183 | ``` 184 | 185 | ### Deploy to GCP Cloud Run 186 | 187 | See [./docker/sc-recounter-run/README.md](./docker/sc-recounter-run/README.md) for details. 188 | 189 | 190 | # Contributing 191 | 192 | Feel free to fork the repository and submit a pull request. 193 | However, the top priority is to keep SRAgent functioning 194 | for the ongoing scBaseCamp project. -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/h5ad-to-db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import gc 6 | import logging 7 | import argparse 8 | import concurrent.futures 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | import tiledbsoma 13 | import tiledbsoma.io 14 | import scanpy as sc 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 19 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 20 | logging.getLogger("tiledb").setLevel(logging.WARNING) 21 | 22 | # classes 23 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 24 | pass 25 | 26 | # functions 27 | def parse_arguments() -> argparse.Namespace: 28 | """ 29 | Parse command-line arguments. 30 | """ 31 | desc = 'Add scRNA-seq data to a TileDB database.' 32 | epi = """DESCRIPTION: 33 | """ 34 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 35 | parser.add_argument( 36 | 'h5ad_files', type=str, nargs="+", help='Path to the h5ad file(s) to load.' 37 | ) 38 | parser.add_argument( 39 | '--db-uri', type=str, help='URI of the TileDB database.', required=True 40 | ) 41 | parser.add_argument( 42 | '--from-disk', action='store_true', default=False, help='Load from disk instead of memory.' 
43 | ) 44 | parser.add_argument( 45 | '--threads', type=int, default=8, help='Number of threads to use.' 46 | ) 47 | return parser.parse_args() 48 | 49 | def append_to_database_from_mem(db_uri: str, adata: sc.AnnData) -> None: 50 | """ 51 | Append an AnnData object to the TileDB database. 52 | Args: 53 | db_uri: URI of the TileDB database 54 | adata: AnnData object to append 55 | """ 56 | logging.info(" Appending data...") 57 | 58 | # Register AnnData objects 59 | rd = tiledbsoma.io.register_anndatas( 60 | db_uri, 61 | [adata], 62 | measurement_name="RNA", 63 | obs_field_name="obs_id", 64 | var_field_name="var_id", 65 | ) 66 | 67 | with tiledbsoma.Experiment.open(db_uri) as exp: 68 | tiledbsoma.io.resize_experiment( 69 | exp.uri, 70 | nobs=rd.get_obs_shape(), 71 | nvars=rd.get_var_shapes() 72 | ) 73 | 74 | # Ingest new data into the db 75 | tiledbsoma.io.from_anndata( 76 | db_uri, 77 | adata, 78 | measurement_name="RNA", 79 | registration_mapping=rd, 80 | ) 81 | 82 | def create_tiledb_from_mem(db_uri: str, adata: sc.AnnData) -> None: 83 | """ 84 | Create a new tiledb database. 85 | Args: 86 | db_uri: URI of the TileDB database 87 | adata: AnnData object to append 88 | """ 89 | logging.info(" Creating new database...") 90 | tiledbsoma.io.from_anndata( 91 | db_uri, 92 | adata, 93 | measurement_name="RNA", 94 | ) 95 | 96 | def load_tiledb_from_mem(h5ad_files: List[str], db_uri: str, threads: int=8) -> None: 97 | """ 98 | Loads `batch_size` files in parallel, then appends them all at once to the database. 99 | Args: 100 | matrix_files: List of tuples (matrix_path, srx_id) 101 | db_uri: URI of the TileDB database 102 | threads: Number of threads to use 103 | """ 104 | # load anndata objects in parallel 105 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 106 | ann_list = executor.map(sc.read_h5ad, h5ad_files) 107 | 108 | # append to database 109 | for i,adata in enumerate(ann_list, 1): 110 | logging.info(f"Processing matrix file {i} of {len(h5ad_files)}") 111 | if not os.path.exists(db_uri): 112 | create_tiledb_from_mem(db_uri, adata) 113 | else: 114 | append_to_database_from_mem(db_uri, adata) 115 | 116 | # status 117 | logging.info("All matrix files processed!") 118 | 119 | def append_to_database_from_disk(db_uri: str, h5ad_files: List[str], threads: int) -> None: 120 | """ 121 | Append a anndata object from h5ad files to the TileDB database. 
122 | Args: 123 | db_uri: URI of the TileDB database 124 | h5ad_files: List of h5ad files to append 125 | threads: Number of threads to use 126 | """ 127 | logging.info(" Appending data...") 128 | 129 | # Register h5ad objects 130 | rd = tiledbsoma.io.register_h5ads( 131 | db_uri, 132 | h5ad_files, 133 | measurement_name="RNA", 134 | obs_field_name="obs_id", 135 | var_field_name="var_id", 136 | ) 137 | 138 | # Resize the experiment 139 | with tiledbsoma.Experiment.open(db_uri) as exp: 140 | tiledbsoma.io.resize_experiment( 141 | exp.uri, 142 | nobs=rd.get_obs_shape(), 143 | nvars=rd.get_var_shapes() 144 | ) 145 | 146 | # Ingest new data into the db 147 | with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: 148 | futures = [ 149 | executor.submit( 150 | tiledbsoma.io.from_h5ad, 151 | db_uri, 152 | h5ad_file, 153 | measurement_name="RNA", 154 | registration_mapping=rd 155 | ) 156 | for h5ad_file in h5ad_files 157 | ] 158 | # Wait for all futures to complete 159 | concurrent.futures.wait(futures) 160 | # Raise any exceptions that occurred 161 | for future in futures: 162 | future.result() 163 | 164 | def create_tiledb_from_disk(db_uri: str, h5ad_file: str) -> None: 165 | """ 166 | Create a new tiledb database. 167 | Args: 168 | db_uri: URI of the TileDB database 169 | h5ad_file: Path to the h5ad file to load 170 | """ 171 | logging.info(" Creating new database...") 172 | tiledbsoma.io.from_h5ad( 173 | db_uri, h5ad_file, measurement_name="RNA", 174 | ) 175 | 176 | def load_tiledb_from_disk(h5ad_files: List[str], db_uri: str, threads: int) -> None: 177 | """ 178 | Load h5ad files from disk and append them to the TileDB database. 179 | The database is created if it does not exist. 180 | Args: 181 | h5ad_files: List of h5ad files to load 182 | db_uri: URI of the TileDB database 183 | threads: Number of threads to use 184 | """ 185 | logging.info("Loading data from disk...") 186 | 187 | # append/create database 188 | if not os.path.exists(db_uri): 189 | create_tiledb_from_disk(db_uri, h5ad_files[0]) 190 | h5ad_files = h5ad_files[1:] 191 | append_to_database_from_disk(db_uri, h5ad_files, threads) 192 | 193 | # status 194 | logging.info("All matrix files processed!") 195 | 196 | 197 | def main(): 198 | """Main function to run the TileDB loader workflow.""" 199 | args = parse_arguments() 200 | 201 | # Load data into memory and append to TileDB 202 | if args.from_disk: 203 | load_tiledb_from_disk(args.h5ad_files, args.db_uri, args.threads) 204 | else: 205 | load_tiledb_from_mem(args.h5ad_files, args.db_uri, args.threads) 206 | 207 | 208 | if __name__ == "__main__": 209 | from dotenv import load_dotenv 210 | load_dotenv(override=True) 211 | main() 212 | 213 | -------------------------------------------------------------------------------- /scripts/gcp2chimera.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from shutil import which, rmtree 6 | from typing import Tuple, List, Dict 7 | from datetime import datetime, timedelta 8 | from google.cloud import storage 9 | from subprocess import run 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 13 | pass 14 | 15 | def parse_args() -> argparse.Namespace: 16 | """ 17 | Parse command-line arguments. 18 | Returns: 19 | argparse.Namespace containing arguments. 
20 | """ 21 | # default min/max datetime 22 | fmt = "%Y-%m-%d_%H-%M-%S" 23 | min_dt = (datetime.now() - timedelta(days=3)).strftime(fmt) 24 | max_dt = (datetime.now() - timedelta(days=2)).strftime(fmt) 25 | 26 | desc = 'Transfer scRecounter output files from GCP to Chimera.' 27 | epi = """DESCRIPTION: 28 | Transfer scRecounter output files from GCP to Chimera. 29 | Example: 30 | ./scripts/gcp2chimera.py \ 31 | --min-date-time 2025-02-18_00-00-00 \ 32 | --max-date-time 2025-02-19_00-00-00 \ 33 | --dest-dir /processed_datasets/scRecount/scRecounter/prod3 \ 34 | --dry-run \ 35 | gs://arc-ctc-screcounter/prod3/ 36 | """ 37 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 38 | parser.add_argument('gcs_dir', type=str, 39 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/prod3/)') 40 | parser.add_argument('--dest-dir', type=str, default="/processed_datasets/scRecount/scRecounter/prod3", 41 | help='Destination location on Chimera') 42 | parser.add_argument('--min-date-time', type=str, default=min_dt, 43 | help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 44 | parser.add_argument('--max-date-time', type=str, default=max_dt, 45 | help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 46 | parser.add_argument('--dry-run', action='store_true', 47 | help='Print commands without executing') 48 | parser.add_argument('--force', action='store_true', 49 | help='Force overwrite of existing directories in the dest-dir') 50 | return parser.parse_args() 51 | 52 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 53 | """ 54 | Parse a GCP bucket path. 55 | Args: 56 | gs_path: GCP bucket path starting with gs:// 57 | Returns: 58 | A tuple of (bucket_name, prefix). 59 | """ 60 | if not gs_path.startswith("gs://"): 61 | raise ValueError("Path must start with 'gs://'") 62 | parts = gs_path[5:].split("/", 1) 63 | bucket_name = parts[0] 64 | prefix = parts[1] if len(parts) > 1 else "" 65 | return bucket_name, prefix.rstrip("/") + "/" 66 | 67 | def list_screcounter_directories( 68 | bucket: storage.bucket.Bucket, 69 | prefix: str, 70 | min_dt: datetime, 71 | max_dt: datetime 72 | ) -> List[str]: 73 | """ 74 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket 75 | under the given prefix, filtered by date/time range. 76 | Args: 77 | bucket: The GCS bucket object. 78 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 79 | min_dt: The minimum datetime (inclusive). 80 | max_dt: The maximum datetime (inclusive). 81 | 82 | Returns: 83 | A list of directory prefixes that fall within the specified date/time range. 84 | """ 85 | print(f"Listing directories under {prefix}...") 86 | num_searched = 0 87 | dir_list = [] 88 | # Delimiter forces listing top-level folders under prefix 89 | iterator = bucket.list_blobs(prefix=prefix, delimiter='/') 90 | for page in iterator.pages: 91 | for folder in page.prefixes: 92 | folder_name = folder.rstrip('/').split('/')[-1] 93 | # Expecting folder_name like SCRECOUNTER_YYYY-MM-DD_hh-mm-ss 94 | if folder_name.startswith("SCRECOUNTER_"): 95 | num_searched += 1 96 | try: 97 | date_str = folder_name.replace("SCRECOUNTER_", "") 98 | dt = datetime.strptime(date_str, "%Y-%m-%d_%H-%M-%S") 99 | if min_dt <= dt <= max_dt: 100 | dir_list.append(folder) 101 | except ValueError: 102 | pass 103 | print(f" Num. 
dirs searched: {num_searched}") 104 | print(f" Num target dirs: {len(dir_list)}") 105 | return dir_list 106 | 107 | 108 | def gsutil_copy( 109 | screcounter_dirs: List[str], dest_dir: str, bucket_name: str, 110 | dry_run: bool=False, force: bool=False 111 | ) -> None: 112 | """ 113 | Use gsutil to copy files from GCP to Chimera. 114 | Args: 115 | screcounter_dirs: A list of GCP bucket directory prefixes. 116 | dest_dir: Destination directory on Chimera. 117 | """ 118 | os.makedirs(dest_dir, exist_ok=True) 119 | 120 | print(f"Copying files to {dest_dir}...", file=sys.stderr) 121 | for src_dir in screcounter_dirs: 122 | src_dir = "gs://" + os.path.join(bucket_name, src_dir) 123 | dest_dir_full = os.path.join(dest_dir, os.path.basename(os.path.dirname(src_dir))) 124 | print(f" Copying {src_dir} to {dest_dir_full}...", file=sys.stderr) 125 | if os.path.exists(dest_dir_full): 126 | msg = f" Destination directory already exists." 127 | if force: 128 | print(f"{msg} Deleting...", file=sys.stderr) 129 | if not dry_run: 130 | rmtree(dest_dir_full) 131 | else: 132 | print(f"{msg} Skipping.", file=sys.stderr) 133 | continue 134 | if not dry_run: 135 | cmd = f"gsutil -m cp -r {src_dir} {dest_dir}" 136 | print(f" CMD: {cmd}", file=sys.stderr) 137 | run(cmd, shell=True, check=True) 138 | 139 | def main(args: argparse.Namespace) -> None: 140 | """ 141 | Main function that: 142 | 1) Parses GCP bucket path. 143 | 2) Lists all SCRECOUNTER directories in the bucket (non-recursive). 144 | 3) Filters directories by date range. 145 | 4) For each target directory, use gsutil to copy files from bucket to Chimera. 146 | Args: 147 | args: An argparse.Namespace holding command-line arguments. 148 | """ 149 | # check if gsutil is installed 150 | if which("gsutil") is None: 151 | print("gsutil is not installed. Please install it first.") 152 | sys.exit(1) 153 | 154 | # Format arg date/time strings 155 | min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 156 | max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 157 | 158 | # Parse GCP bucket path 159 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 160 | 161 | # Initialize GCP client and bucket 162 | client = storage.Client() 163 | bucket = client.bucket(bucket_name) 164 | 165 | # list all SCRECOUNTER directories in the bucket, filtered by date/time range 166 | screcounter_dirs = list_screcounter_directories(bucket, path_prefix, min_dt, max_dt) 167 | 168 | # for each directory, copy files to Chimera 169 | gsutil_copy(screcounter_dirs, args.dest_dir, bucket_name, args.dry_run, args.force) 170 | 171 | 172 | if __name__ == "__main__": 173 | from dotenv import load_dotenv 174 | load_dotenv() 175 | args = parse_args() 176 | main(args) -------------------------------------------------------------------------------- /scripts/search-cloud-run-job-logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import json 4 | import argparse 5 | import subprocess 6 | from typing import Optional 7 | from datetime import datetime, timedelta, timezone 8 | import pytz 9 | import pandas as pd 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | desc = 'Search for logs in Cloud Run Jobs that contain a specific keyword.' 16 | epi = """DESCRIPTION: 17 | Search for logs in Cloud Run Jobs. 
18 | Examples: 19 | $ search-cloud-run-job-logs.py --keyword "ALREADY_EXISTS" 20 | $ search-cloud-run-job-logs.py --content 21 | """ 22 | # default datetime of N day ago 23 | default_datetime = (datetime.now(timezone.utc) - timedelta(days=3)).strftime("%Y-%m-%dT%H:%M:%SZ") 24 | 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 26 | parser.add_argument( 27 | "-k", "--keyword", default=None, help="The keyword to search for in Cloud Run Job logs." 28 | ) 29 | parser.add_argument( 30 | "-p", "--project-id", default="c-tc-429521", help="The Google Cloud project ID." 31 | ) 32 | parser.add_argument( 33 | "-n", "--job-name", default="sc-recounter-run", help="The name of the Cloud Run Job." 34 | ) 35 | parser.add_argument( 36 | "--start-datetime", type=str, default=default_datetime, 37 | help="Start datetime for logs in ISO 8601 format." 38 | ) 39 | parser.add_argument( 40 | "--severity", type=str, default="ERROR", 41 | help="The minimum severity level of the logs to retrieve." 42 | ) 43 | parser.add_argument( 44 | "--content", action="store_true", default=False, 45 | help="Print the content of the logs." 46 | ) 47 | parser.add_argument( 48 | "--limit", type=int, default=None, 49 | help="The total maximum number of logs to retrieve. Use None for unlimited." 50 | ) 51 | return parser.parse_args() 52 | 53 | def convert_time(timestamp: str) -> str: 54 | if timestamp == "Unknown": 55 | return timestamp 56 | try: 57 | gmt_time = datetime.fromisoformat(timestamp.replace("Z", "+00:00")) 58 | pct_timezone = pytz.timezone("America/Los_Angeles") 59 | timestamp = gmt_time.astimezone(pct_timezone) 60 | timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S %Z") 61 | except Exception as e: 62 | timestamp_str = f"Error converting time: {e}" 63 | return timestamp_str 64 | 65 | def find_logs( 66 | project_id: str, 67 | start_datetime: str, 68 | job_name: str="sc-recounter-run", 69 | region: str="us-east1", 70 | keyword: Optional[str]=None, 71 | severity: Optional[str]="ERROR", 72 | limit: int=None, 73 | ) -> None: 74 | """ 75 | Find logs in Cloud Run Jobs 76 | """ 77 | next_page_token = None 78 | logs_retrieved = 0 79 | 80 | # add 8 hours to start_datetime to account for the difference between GMT and PCT 81 | start_datetime = (datetime.fromisoformat(start_datetime) + timedelta(hours=8)).strftime("%Y-%m-%dT%H:%M:%SZ") 82 | 83 | 84 | job_info = [] 85 | while True: 86 | # Construct the gcloud command 87 | query = [ 88 | 'resource.type="cloud_run_job"', 89 | f'resource.labels.job_name="{job_name}"', 90 | f'resource.labels.location="{region}"', 91 | f'timestamp>="{start_datetime}"', 92 | ] 93 | if severity: 94 | query.append(f'severity>={severity}') 95 | if keyword: 96 | query.append(f'textPayload:{keyword}') 97 | query = " AND ".join(query) 98 | cmd = f"gcloud logging read '{query}' --project={project_id} --format=json --limit=1000" 99 | if next_page_token: 100 | cmd += f" --page-token={next_page_token}" 101 | 102 | # Execute the gcloud command 103 | print(f"Executing command: {cmd}", file=sys.stderr) 104 | try: 105 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) 106 | logs = json.loads(result.stdout) 107 | if not logs: 108 | print("No more logs found.") 109 | break 110 | 111 | for log in logs: 112 | job_name = log.get("resource", {}).get("labels", {}).get("job_name", "Unknown") 113 | execution_id = log.get("labels", {}).get("run.googleapis.com/execution_name", "Unknown") 114 | timestamp = convert_time(log.get("timestamp", 
"Unknown")) 115 | #print(f"Job Name: {job_name}, Execution ID: {execution_id}, Timestamp: {timestamp}") 116 | job_info.append([job_name, execution_id, timestamp]) 117 | logs_retrieved += 1 118 | 119 | # Stop if we've reached the limit 120 | if limit and logs_retrieved >= limit: 121 | print(f"Reached the limit of {limit} logs.") 122 | return job_info 123 | 124 | # Check if there's a next page token 125 | next_page_token = result.stderr.split("nextPageToken: ")[-1].strip() if "nextPageToken" in result.stderr else None 126 | if not next_page_token: 127 | break 128 | except subprocess.CalledProcessError as e: 129 | print(f"Error executing gcloud command: {e.stderr.strip()}") 130 | break 131 | except json.JSONDecodeError: 132 | print("Failed to parse the JSON response from gcloud.") 133 | break 134 | return job_info 135 | 136 | def get_content( 137 | log_info: pd.DataFrame, 138 | project_id: str, 139 | ) -> None: 140 | for index, row in log_info.iterrows(): 141 | job_name = row["Job Name"] 142 | execution_id = row["Execution ID"] 143 | query = f'labels."run.googleapis.com/execution_name"="{execution_id}"' 144 | cmd = f"gcloud logging read '{query}' --project={project_id} --format=json" 145 | print(f"Executing command: {cmd}", file=sys.stderr) 146 | try: 147 | result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) 148 | logs = json.loads(result.stdout) 149 | print(f"#-- Execution ID: {execution_id} --#") 150 | for log in logs: 151 | print(log.get("textPayload", "")) 152 | except subprocess.CalledProcessError as e: 153 | print(f"Error executing gcloud command: {e.stderr.strip()}") 154 | except json.JSONDecodeError: 155 | print("Failed to parse the JSON response from gcloud.") 156 | 157 | if __name__ == "__main__": 158 | args = parse_args() 159 | 160 | # find the logs 161 | log_info = find_logs( 162 | keyword=args.keyword, 163 | project_id=args.project_id, 164 | start_datetime=args.start_datetime, 165 | job_name=args.job_name, 166 | severity=args.severity, 167 | limit=args.limit, 168 | ) 169 | # convert to a pandas dataframe 170 | log_info = pd.DataFrame(log_info, columns=["Job Name", "Execution ID", "Timestamp"]) 171 | 172 | # get the content of the logs or save general log info to a CSV file 173 | if args.content: 174 | # get the content of the logs 175 | get_content(log_info, args.project_id) 176 | else: 177 | # save to a CSV file 178 | log_info.to_csv(sys.stdout, index=False) 179 | 180 | -------------------------------------------------------------------------------- /scripts/purge-srx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | import pandas as pd 7 | from google.cloud import storage 8 | from psycopg2.extensions import connection 9 | from db_utils import db_connect, db_update 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass 13 | 14 | def parse_args() -> argparse.Namespace: 15 | """ 16 | Parse command-line arguments. 17 | Returns: 18 | argparse.Namespace containing arguments. 19 | """ 20 | desc = 'Purge SRX accessions from the scRecounter system.' 21 | epi = """DESCRIPTION: 22 | Purging: 23 | - Removes SRX records from scRecounter SQL database. 24 | - Removes the SRX directories from the GCP output folder of the scRecounter pipeline. 25 | 26 | Note: only scRecounter is purged, not SRAgent. 
27 | 28 | Examples: 29 | purge-srx.py ERX10024831 ERX10086874 30 | """ 31 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 32 | parser.add_argument('srx_accession', type=str, nargs='+', 33 | help='>=1 SRX accession to purge from the scRecounter system.') 34 | parser.add_argument('--dry-run', action='store_true', default=False, 35 | help='Print actions without executing.') 36 | parser.add_argument('--gcs-dir', type=str, default='gs://arc-ctc-screcounter/prod3/', 37 | help='Base directory in GCP bucket where SCRECOUNTER directories are stored.') 38 | return parser.parse_args() 39 | 40 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 41 | """ 42 | Parse a GCP bucket path. 43 | Args: 44 | gs_path: GCP bucket path starting with gs:// 45 | Returns: 46 | A tuple of (bucket_name, prefix). 47 | """ 48 | if not gs_path.startswith("gs://"): 49 | raise ValueError("Path must start with 'gs://'") 50 | parts = gs_path[5:].split("/", 1) 51 | bucket_name = parts[0] 52 | prefix = parts[1] if len(parts) > 1 else "" 53 | return bucket_name, prefix.rstrip("/") + "/" 54 | 55 | def list_screcounter_directories( 56 | bucket: storage.bucket.Bucket, 57 | prefix: str, 58 | srx_accesions: List[str], 59 | ) -> Dict[str,str]: 60 | """ 61 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket under the given prefix. 62 | Args: 63 | bucket: The GCS bucket object. 64 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 65 | Returns: 66 | A dictionary of {srx_accession: directory_path} for the target SRX accessions. 67 | """ 68 | print(f"Searching for SRX directories...", file=sys.stderr) 69 | srx_dirs = {} 70 | for blob in bucket.list_blobs(prefix=prefix): 71 | blob_dir = os.path.dirname(blob.name) 72 | blob_dir_base = os.path.basename(blob_dir) 73 | blob_dir_parent = os.path.basename(os.path.dirname(blob_dir)) 74 | if blob_dir_parent == "STAR" and blob_dir_base in srx_accesions: 75 | srx_dirs[blob_dir_base] = blob_dir 76 | print(f" Found {len(srx_dirs)} SRX directories", file=sys.stderr) 77 | return srx_dirs 78 | 79 | def purge_accession_tables( 80 | srx_dirs: Dict[str,str], bucket: storage.bucket.Bucket, dry_run: bool=False 81 | ) -> None: 82 | """ 83 | Purge SRX accessions from the accession tables in the GCP bucket. 84 | Args: 85 | srx_dirs: Dictionary of {srx_accession: directory_path} for the target SRX accessions. 86 | bucket: The GCS bucket object. 87 | dry_run: If True, only print actions without executing. 
88 | """ 89 | if len(srx_dirs) == 0: 90 | return None 91 | print(f"Purging accession tables...", file=sys.stderr) 92 | target_parent_dirs = set() 93 | for srx, srx_dir in srx_dirs.items(): 94 | target_parent_dirs.add(os.path.dirname(os.path.dirname(srx_dir))) 95 | 96 | for parent_dir in target_parent_dirs: 97 | for blob in bucket.list_blobs(prefix=parent_dir): 98 | if os.path.basename(blob.name) == "accessions.csv": 99 | # read in accessions file 100 | if not dry_run: 101 | df = pd.read_csv(pd.io.common.StringIO(blob.download_as_text())) 102 | # filter out the SRX accessions 103 | df = df[~df["sample"].isin(srx_dirs.keys())] 104 | # write back to GCP 105 | blob.upload_from_string(df.to_csv(index=False)) 106 | print(f" Purged {blob.name}", file=sys.stderr) 107 | 108 | def delete_srx(srx_accessions: List[str], conn: connection, dry_run: bool=False): 109 | """ 110 | Delete SRX accessions from scRecounter tables 111 | Args: 112 | srx_accessions: list of SRX accessions to delete 113 | conn: database connection 114 | dry_run: if True, only print actions without executing 115 | """ 116 | if len(srx_accessions) == 0: 117 | return None 118 | print("Purging SRX accessions from scRecounter DB tables...", file=sys.stderr) 119 | target_tables = ["screcounter_log", "screcounter_star_params", "screcounter_star_results"] 120 | with db_connect() as conn: 121 | for srx in srx_accessions: 122 | if not dry_run: 123 | for tbl_name in target_tables: 124 | with conn.cursor() as cur: 125 | cur.execute(f"DELETE FROM {tbl_name} WHERE sample = '{srx}'") 126 | conn.commit() 127 | print(f" Deleted: {srx}", file=sys.stderr) 128 | 129 | def delete_srx_star_dirs(srx_dirs: Dict[str,str], bucket: storage.bucket.Bucket, dry_run: bool=False): 130 | """ 131 | Delete SRX directories from the GCP bucket 132 | Args: 133 | srx_dirs: Dictionary of {srx_accession: directory_path} for the target SRX accessions. 134 | bucket: The GCS bucket object. 135 | dry_run: If True, only print actions without executing. 
136 | """ 137 | if len(srx_dirs) == 0: 138 | return None 139 | print(f"Deleting SRX STAR directories...", file=sys.stderr) 140 | for srx_dir in srx_dirs.values(): 141 | print(f" Deleting: {srx_dir}", file=sys.stderr) 142 | if not dry_run: 143 | for blob in bucket.list_blobs(prefix=srx_dir): 144 | blob.delete() 145 | 146 | def main(args: argparse.Namespace) -> None: 147 | """ 148 | - Input: 149 | - >=1 NCBI SRX accession 150 | - Path to GCP directory 151 | - Method 152 | - Recursively search in the GCP directory for folders named the same as the SRX accession 153 | - Delete each target folder 154 | - For each target folder, find the "accessions.csv" file 2 levels up from the target folder and remove the purged SRX rows from it 155 | - Also delete the SRX from all scRecounter tables in the SQL database 156 | """ 157 | print(f"GCP_SQL_DB_NAME: {os.getenv('GCP_SQL_DB_NAME')}", file=sys.stderr) 158 | 159 | # Parse the GCP bucket path 160 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 161 | 162 | # Initialize GCP client and bucket 163 | client = storage.Client() 164 | bucket = client.bucket(bucket_name) 165 | 166 | # Find target SRX directories in the GCP bucket 167 | srx_dirs = list_screcounter_directories(bucket, path_prefix, args.srx_accession) 168 | 169 | # Purge SRX accessions from the accession tables in the GCP bucket 170 | purge_accession_tables(srx_dirs, bucket, dry_run=args.dry_run) 171 | 172 | # Delete SRX STAR directories from the GCP bucket 173 | delete_srx_star_dirs(srx_dirs, bucket, dry_run=args.dry_run) 174 | 175 | # Delete SRX accessions from the scRecounter DB tables 176 | with db_connect() as conn: 177 | delete_srx(args.srx_accession, conn, dry_run=args.dry_run) 178 | 179 | 180 | if __name__ == "__main__": 181 | from dotenv import load_dotenv 182 | load_dotenv() 183 | args = parse_args() 184 | main(args) -------------------------------------------------------------------------------- /scripts/extract-from-result-files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | import argparse 5 | from typing import Tuple, List, Dict 6 | from datetime import datetime 7 | import pandas as pd 8 | from google.cloud import storage 9 | from db_utils import db_connect, db_update 10 | 11 | 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 13 | pass 14 | 15 | def parse_args() -> argparse.Namespace: 16 | """ 17 | Parse command-line arguments. 18 | Returns: 19 | argparse.Namespace containing arguments. 20 | """ 21 | desc = 'Extract data from STAR results in scRecounter output directory' 22 | epi = """DESCRIPTION: 23 | 24 | """ 25 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 26 | parser.add_argument('gcs_dir', type=str, 27 | help='GCP bucket path to work directory (e.g., gs://arc-ctc-screcounter/prod3/)') 28 | parser.add_argument('--min-date-time', type=str, default='2025-01-13_00-00-00', 29 | help='Minimum date/time (YYYY-MM-DD_hh-mm-ss)') 30 | parser.add_argument('--max-date-time', type=str, default='2025-01-15_00-00-00', 31 | help='Maximum date/time (YYYY-MM-DD_hh-mm-ss)') 32 | return parser.parse_args() 33 | 34 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 35 | """ 36 | Parse a GCP bucket path. 37 | Args: 38 | gs_path: GCP bucket path starting with gs:// 39 | Returns: 40 | A tuple of (bucket_name, prefix). 
41 | """ 42 | if not gs_path.startswith("gs://"): 43 | raise ValueError("Path must start with 'gs://'") 44 | parts = gs_path[5:].split("/", 1) 45 | bucket_name = parts[0] 46 | prefix = parts[1] if len(parts) > 1 else "" 47 | return bucket_name, prefix.rstrip("/") + "/" 48 | 49 | def list_screcounter_directories( 50 | bucket: storage.bucket.Bucket, 51 | prefix: str, 52 | min_dt: datetime, 53 | max_dt: datetime 54 | ) -> List[str]: 55 | """ 56 | List directories named 'SCRECOUNTER_YYYY-MM-DD_hh-mm-ss' in the bucket 57 | under the given prefix, filtered by date/time range. 58 | Args: 59 | bucket: The GCS bucket object. 60 | prefix: The prefix (subfolder) in which to look for SCRECOUNTER directories. 61 | min_dt: The minimum datetime (inclusive). 62 | max_dt: The maximum datetime (inclusive). 63 | 64 | Returns: 65 | A list of directory prefixes that fall within the specified date/time range. 66 | """ 67 | dir_list = [] 68 | # Delimiter forces listing top-level folders under prefix 69 | iterator = bucket.list_blobs(prefix=prefix, delimiter='/') 70 | for page in iterator.pages: 71 | for folder in page.prefixes: 72 | folder_name = folder.rstrip('/').split('/')[-1] 73 | # Expecting folder_name like SCRECOUNTER_YYYY-MM-DD_hh-mm-ss 74 | if folder_name.startswith("SCRECOUNTER_"): 75 | try: 76 | date_str = folder_name.replace("SCRECOUNTER_", "") 77 | dt = datetime.strptime(date_str, "%Y-%m-%d_%H-%M-%S") 78 | if min_dt <= dt <= max_dt: 79 | dir_list.append(folder) 80 | except ValueError: 81 | pass 82 | return dir_list 83 | 84 | def find_summary_files( 85 | bucket: storage.bucket.Bucket, 86 | directory_prefix: str 87 | ) -> List[str]: 88 | """ 89 | Recursively find all Summary.csv files within a given SCRECOUNTER directory. 90 | 91 | Args: 92 | bucket: The GCS bucket object. 93 | directory_prefix: The prefix for the specific SCRECOUNTER directory. 94 | 95 | Returns: 96 | A list of blob paths (strings) for Summary.csv files meeting criteria. 97 | """ 98 | valid_parents = {"Velocyto", "GeneFull_ExonOverIntron", "GeneFull_Ex50pAS", "GeneFull", "Gene"} 99 | summary_blobs = [] 100 | for blob in bucket.list_blobs(prefix=directory_prefix): 101 | if blob.name.endswith("Summary.csv"): 102 | # The parent directory is right before the filename in the path 103 | path_parts = blob.name.split('/') 104 | if len(path_parts) > 1: 105 | parent_dir = path_parts[-2] 106 | if parent_dir in valid_parents: 107 | summary_blobs.append(blob.name) 108 | return summary_blobs 109 | 110 | def read_and_merge_summary_files( 111 | bucket: storage.bucket.Bucket, 112 | file_paths: List[str] 113 | ) -> List[pd.DataFrame]: 114 | """ 115 | Read multiple Summary.csv files into dataframes and merge them. 116 | 117 | Args: 118 | bucket: The GCS bucket object. 119 | file_paths: A list of blob paths for Summary.csv files. 120 | 121 | Returns: 122 | A merged pandas DataFrame of all summary data. 
123 | """ 124 | rename_idx = { 125 | "Gene": "gene", 126 | "GeneFull": "gene_full", 127 | "GeneFull_ExonOverIntron": "gene_ex_int", 128 | "GeneFull_Ex50pAS": "gene_ex50", 129 | "Velocyto": "velocyto" 130 | } 131 | 132 | dfs = [] 133 | for path in file_paths: 134 | # read CSV file from GCS 135 | blob = bucket.blob(path) 136 | data_str = blob.download_as_text() 137 | df = pd.read_csv(pd.io.common.StringIO(data_str)) 138 | # format 139 | df.columns = ["Category", "Value"] 140 | df = df[df["Category"] == "Reads With Valid Barcodes"] 141 | df = df.set_index("Category").transpose() 142 | ## add file path info 143 | p = os.path.dirname(path) 144 | df["feature"] = rename_idx[os.path.basename(p)] 145 | df["sample"] = os.path.basename(os.path.dirname(p)) 146 | # add to list 147 | dfs.append(df) 148 | 149 | print("No. of tables: ", len(dfs), file=sys.stderr) 150 | return dfs 151 | 152 | def main(args: argparse.Namespace) -> None: 153 | """ 154 | Main function that: 155 | 1) Parses GCP bucket path. 156 | 2) Lists all SCRECOUNTER directories in the bucket (non-recursive). 157 | 3) Filters directories by date range. 158 | 4) For each directory, recursively searches for 'Summary.csv' files 159 | in allowed parent subdirectories. 160 | 5) Merges summary data and upserts into a database. 161 | 162 | Args: 163 | args: An argparse.Namespace holding command-line arguments. 164 | """ 165 | # Format arg date/time strings 166 | min_dt = datetime.strptime(args.min_date_time, "%Y-%m-%d_%H-%M-%S") 167 | max_dt = datetime.strptime(args.max_date_time, "%Y-%m-%d_%H-%M-%S") 168 | 169 | # Parse GCP bucket path 170 | bucket_name, path_prefix = parse_gs_path(args.gcs_dir) 171 | 172 | # Initialize GCP client and bucket 173 | client = storage.Client() 174 | bucket = client.bucket(bucket_name) 175 | 176 | # list all SCRECOUNTER directories in the bucket, filtered by date/time range 177 | screcounter_dirs = list_screcounter_directories(bucket, path_prefix, min_dt, max_dt) 178 | 179 | # for each directory, find and merge Summary.csv files 180 | merged_df = [] 181 | for directory in screcounter_dirs: 182 | print(f"Processing directory: {directory}", file=sys.stderr) 183 | summary_paths = find_summary_files(bucket, directory) 184 | if summary_paths: 185 | merged_df += read_and_merge_summary_files(bucket, summary_paths) 186 | # check if any valid data was found before concatenating 187 | if len(merged_df) == 0: 188 | print("No valid data found.", file=sys.stderr) 189 | return None 190 | 191 | # concat all dataframes 192 | merged_df = pd.concat(merged_df, ignore_index=True).rename( 193 | columns={"Reads With Valid Barcodes": "reads_with_valid_barcodes"} 194 | ) 195 | 196 | print(f"No. 
of records found: {merged_df.shape[0]}", file=sys.stderr) 197 | 198 | # Upsert data into database 199 | print("Updating data...", file=sys.stderr) 200 | with db_connect() as conn: 201 | db_update(merged_df, "screcounter_star_results", conn) 202 | 203 | 204 | if __name__ == "__main__": 205 | from dotenv import load_dotenv 206 | load_dotenv() 207 | args = parse_args() 208 | main(args) -------------------------------------------------------------------------------- /docker/sc-recounter-run/cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import argparse 4 | from typing import Tuple, List, Dict 5 | import pandas as pd 6 | from google.cloud import storage 7 | from db_utils import db_connect, db_upsert 8 | 9 | # argparse 10 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 11 | argparse.RawDescriptionHelpFormatter): 12 | pass 13 | 14 | desc = 'Clean up after a scRecounter production run' 15 | epi = """DESCRIPTION: 16 | Examples: 17 | python cleanup.py gs://arc-ctc-nextflow/scRecounter/prod/work/SCRECOUNTER_2025-01-06_15-46-04/ gs://arc-ctc-screcounter/prod/SCRECOUNTER_2025-01-06_15-46-04/ 18 | """ 19 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 20 | formatter_class=CustomFormatter) 21 | parser.add_argument( 22 | 'work_dir', type=str, 23 | help='GCP bucket path to work directory (e.g., gs://bucket-name/path/to/folder)' 24 | ) 25 | parser.add_argument( 26 | 'output_dir', type=str, 27 | help='GCP bucket path to output directory (e.g., gs://bucket-name/path/to/folder)' 28 | ) 29 | 30 | 31 | # functions 32 | def list_bucket_contents(bucket_name: str, prefix: str) -> Tuple[List[str], Dict[str, int]]: 33 | """ 34 | List directories and files in a GCP bucket path 35 | Args: 36 | bucket_name: GCP bucket name 37 | prefix: GCP bucket prefix 38 | Returns: 39 | Tuple of (directories list, files list) in the bucket path 40 | """ 41 | client = storage.Client() 42 | bucket = client.bucket(bucket_name) 43 | blobs = bucket.list_blobs(prefix=prefix, delimiter='/') 44 | 45 | directories = [] 46 | files = {} 47 | 48 | # Get directories (prefixes) 49 | for page in blobs.pages: 50 | directories.extend(page.prefixes) 51 | directories = [d.rstrip('/') for d in directories] 52 | 53 | # Create a new iterator for blobs 54 | blobs = bucket.list_blobs(prefix=prefix, delimiter='/') 55 | 56 | # Get files 57 | for blob in blobs: 58 | if not blob.name.endswith('/'): # Skip directory markers 59 | num_rows = 0 60 | if blob.name.split('/')[-1] == "accessions.csv": 61 | # get the number of rows 62 | blob.download_to_filename('/tmp/accessions.csv') 63 | with open('/tmp/accessions.csv', 'r') as f: 64 | num_rows = pd.read_csv(f).shape[0] 65 | os.remove('/tmp/accessions.csv') 66 | files[os.path.basename(blob.name)] = num_rows 67 | return directories, files 68 | 69 | def delete_bucket_path(bucket_name: str, path: str) -> None: 70 | """ 71 | Delete all objects in a GCP bucket path 72 | Args: 73 | bucket_name: GCP bucket name 74 | """ 75 | client = storage.Client() 76 | bucket = client.bucket(bucket_name) 77 | blobs = bucket.list_blobs(prefix=path) 78 | 79 | for blob in blobs: 80 | blob.delete() 81 | 82 | def parse_gs_path(gs_path: str) -> Tuple[str, str]: 83 | """ 84 | Parse a GCP bucket path 85 | Args: 86 | gs_path: GCP bucket path 87 | Returns: 88 | Tuple of bucket name and prefix 89 | """ 90 | if not gs_path.startswith("gs://"): 91 | raise ValueError("Path must start with 'gs://'") 92 | parts = 
gs_path[5:].split("/", 1) 93 | bucket_name = parts[0] 94 | prefix = parts[1] if len(parts) > 1 else "" 95 | return bucket_name, prefix.rstrip("/") + "/" 96 | 97 | def clean_output_dir(output_dir: str) -> None: 98 | """ 99 | Delete the contents of the output directory, 100 | if it only contains 'nf-report', 'nf-trace'. 101 | Args: 102 | output_dir: GCP bucket path to output directory 103 | """ 104 | # parse the bucket path 105 | bucket_name, path_prefix = parse_gs_path(output_dir) 106 | 107 | # list directories in the bucket path 108 | directories,files = list_bucket_contents(bucket_name, path_prefix) 109 | directories = [os.path.basename(d) for d in directories] 110 | print(f"Directories found: {', '.join(directories)}") 111 | files_basename = [os.path.basename(f) for f in files] 112 | print(f"Files found: {', '.join(files_basename)}") 113 | 114 | # if accessions.csv in the directory, get the number of lines 115 | if files.get("accessions.csv") == 0: 116 | print("No accessions found. Deleting the bucket path...") 117 | delete_bucket_path(bucket_name, path_prefix) 118 | print(f"Deleted path: {output_dir}") 119 | elif set(directories).issubset({"nf-report", "nf-trace"}): 120 | print("Just Nextflow report and/or trace found. Deleting the bucket path...") 121 | delete_bucket_path(bucket_name, path_prefix) 122 | print(f"Deleted path: {output_dir}") 123 | else: 124 | print("Bucket path contains pipeline results. No deletion performed.") 125 | 126 | def clean_work_dir(work_dir: str) -> None: 127 | """ 128 | Delete the contents of the work directory 129 | Args: 130 | work_dir: GCP bucket path to work directory 131 | """ 132 | # parse the bucket path 133 | bucket_name, path_prefix = parse_gs_path(work_dir) 134 | 135 | print("Deleting the contents of the working directory...") 136 | delete_bucket_path(bucket_name, path_prefix) 137 | print(f"Deleted path: {work_dir}") 138 | 139 | def download_gcs_file( 140 | bucket_name: str, gcs_file_path: str, local_file_path: str="/tmp/temp_file.tsv" 141 | ) -> str: 142 | """ 143 | Download a file from a GCP bucket to a local file 144 | Args: 145 | bucket_name: GCP bucket name 146 | gcs_file_path: GCP bucket path to the file 147 | local_file_path: Local file path 148 | Returns: 149 | Local file path 150 | """ 151 | client = storage.Client() 152 | bucket = client.bucket(bucket_name) 153 | blob = bucket.blob(gcs_file_path) 154 | blob.download_to_filename(local_file_path) 155 | return local_file_path 156 | 157 | def upload_trace(output_dir: str) -> None: 158 | """ 159 | Upload the trace file to the screcounter db 160 | Args: 161 | output_dir: GCP bucket path to output directory 162 | """ 163 | # does nf-trace directory exists in gcp bucket location? 
164 | bucket_name, path_prefix = parse_gs_path(output_dir) 165 | directories,files = list_bucket_contents(bucket_name, path_prefix) 166 | directories = [os.path.basename(d) for d in directories] 167 | 168 | if "nf-trace" in directories: 169 | # list the files in nf-trace directory 170 | trace_dir = path_prefix + "nf-trace/" 171 | trace_files = list_bucket_contents(bucket_name, trace_dir)[1] 172 | # get the most recent based on the name 173 | trace_file = sorted(list(trace_files.keys()))[-1] 174 | # read the trace file as a pandas dataframe 175 | trace_file_path = os.path.join(trace_dir, trace_file) 176 | # read from gcp 177 | local_file_path = download_gcs_file(bucket_name, trace_file_path) 178 | # read the file 179 | if not os.path.exists(local_file_path): 180 | print(f"File not found: {local_file_path}") 181 | return None 182 | trace_df = pd.read_csv(local_file_path, sep="\t") 183 | # remove the local file 184 | os.remove(local_file_path) 185 | # format 186 | ## convert exit column to character 187 | if "exit" in trace_df.columns: 188 | trace_df["exit"] = trace_df["exit"].astype(str) 189 | ## remove second "submit" column 190 | if "submit.1" in trace_df.columns: 191 | trace_df.drop(columns=["submit.1"], inplace=True) 192 | ## rename "%cpu" to cpu_percent 193 | if r"%cpu" in trace_df.columns: 194 | trace_df.rename(columns={r"%cpu": "cpu_percent"}, inplace=True) 195 | # upsert 196 | with db_connect() as conn: 197 | db_upsert(trace_df, "screcounter_trace", conn) 198 | # status update 199 | print(f"Uploaded trace file to screcounter db: {trace_file}") 200 | else: 201 | print("No nf-trace directory found. Skipping trace file db upload.") 202 | 203 | def main(args): 204 | # clean up the work and output directories 205 | clean_work_dir(args.work_dir) 206 | clean_output_dir(args.output_dir) 207 | # upload the trace file to the screcounter db 208 | upload_trace(args.output_dir) 209 | 210 | 211 | if __name__ == "__main__": 212 | args = parser.parse_args() 213 | main(args) -------------------------------------------------------------------------------- /bin/prefetch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | from __future__ import print_function 4 | import os 5 | import re 6 | import sys 7 | import argparse 8 | import logging 9 | from typing import Tuple, Optional 10 | from time import sleep 11 | from shutil import which 12 | from subprocess import Popen, PIPE 13 | import pandas as pd 14 | from db_utils import db_connect, db_upsert, add_to_log 15 | 16 | # logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | 19 | # argparse 20 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 21 | argparse.RawDescriptionHelpFormatter): 22 | pass 23 | 24 | desc = 'Run sra-tools prefetch' 25 | epi = """DESCRIPTION: 26 | Run sra-tools prefetch with handling of errors 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | parser.add_argument('accession', type=str, help='SRA accession') 31 | parser.add_argument('--outdir', type=str, default='prefetch_out', 32 | help='Output directory') 33 | parser.add_argument('--max-size-gb', type=int, default=1000, 34 | help='Max file size in Gb') 35 | parser.add_argument('--tries', type=int, default=3, 36 | help='Number of tries to download') 37 | parser.add_argument('--sample', type=str, default="", 38 | help='Sample name') 39 | parser.add_argument('--gcp-download', 
action='store_true', default=False, 40 | help='Obtain sequence data from SRA GCP mirror') 41 | 42 | # functions 43 | def run_cmd(cmd: str) -> Tuple[int,bytes,bytes]: 44 | """ 45 | Run sub-command and return returncode, output, and error. 46 | Args: 47 | cmd: Command to run 48 | Returns: 49 | (returncode, output, error) 50 | """ 51 | logging.info(f'Running: {cmd}') 52 | p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) 53 | output, err = p.communicate() 54 | return p.returncode, output, err 55 | 56 | def run_vdb_config() -> Tuple[str,str]: 57 | """ 58 | Run vdb-config with error handling. 59 | Returns: 60 | Status and message 61 | """ 62 | cmd = f"vdb-config --report-cloud-identity yes" 63 | rc,output,err = run_cmd(cmd) 64 | if rc != 0: 65 | logging.warning('vdb-config failed') 66 | logging.warning(err) 67 | return "Failure",f'vdb-config failed: {err}' 68 | return "Success","vdb-config successful" 69 | 70 | def prefetch(accession: str, tries: int, max_size_gb: int, outdir: str) -> Tuple[str,str]: 71 | """ 72 | Run prefetch with error handling. 73 | Args: 74 | accession: SRA accession 75 | tries: Number of tries 76 | max_size_gb: Max file size in Gb 77 | outdir: Output directory 78 | Returns: 79 | Status and message 80 | """ 81 | logging.info(f"Downloading {accession}") 82 | cmd = f"prefetch --max-size {max_size_gb}G --output-directory {outdir} {accession}" 83 | err = "" 84 | for i in range(tries): 85 | logging.info(f"Attempt: {i+1}/{tries}") 86 | rc,output,err = run_cmd(cmd) 87 | if rc == 0: 88 | logging.info("Download successful") 89 | # run vdb-validate 90 | sra_dir = os.path.join(outdir, accession) 91 | rc,output,err = run_cmd(f"vdb-validate {sra_dir}") 92 | if rc == 0: 93 | logging.info("Validation successful") 94 | return "Success","Download and validation successful" 95 | else: 96 | logging.warning("Validation failed") 97 | logging.warning(err) 98 | else: 99 | logging.warning("Download failed") 100 | logging.warning(err) 101 | # sleep prior to next attempt 102 | sleep_time = 20 * (i + 1) 103 | logging.info(f"Sleeping for {sleep_time} seconds...") 104 | sleep(sleep_time) 105 | # assume failure 106 | err = err.decode().replace('\n', ' ') 107 | return "Failure",f"Failed to download and validate: {err}" 108 | 109 | def run_vdb_dump(accession: str, min_size: int=1e6) -> Tuple[str,str]: 110 | """ 111 | Run vdb-dump with error handling. 
112 | Args: 113 | accession: SRA accession 114 | min_size: Minimum acceptable file size in bytes 115 | Returns: 116 | Status and message 117 | """ 118 | cmd = f"vdb-dump --info {accession}" 119 | rc,output,err = run_cmd(cmd) 120 | if rc != 0: 121 | logging.warning("Dump failed") 122 | logging.warning(err) 123 | return "Failure",f'vdb-dump failed: {err}' 124 | 125 | # parse the output 126 | regex = re.compile(r' *: ') 127 | data = {} 128 | for line in output.decode().split('\n'): 129 | line = regex.split(line.rstrip(), 1) 130 | if len(line) < 2: 131 | continue 132 | data[line[0]] = line[1] 133 | 134 | # checks 135 | ## keys 136 | for x in ['acc', 'size', 'FMT', 'platf']: 137 | if x not in data: 138 | return "Failure",f'Missing key in vdb-dump output: {x}' 139 | ## accession 140 | if data['acc'] != accession: 141 | return "Failure",f'Accession mismatch: {data["acc"]} != {accession}' 142 | ## size 143 | size = int(data['size'].replace(',', '')) 144 | if size < min_size: 145 | return "Failure",f'File size too small: {size} < {min_size}' 146 | ## format 147 | #fmt = data['FMT'].lower() 148 | #if 'fastq' not in fmt and fmt not in ['sharq', 'sralite', 'sra']: 149 | # return "Failure",f'Invalid format: {data["FMT"]}' 150 | ## platform 151 | if 'illumina' not in data['platf'].lower(): 152 | return "Failure",f'Invalid platform: {data["platf"]}' 153 | # all checks passed 154 | return "Success","Validation successful" 155 | 156 | def write_log(logF, sample: str, accession: str, step: str, msg: str) -> None: 157 | """ 158 | Write log to file. 159 | Args: 160 | logF: Log file handle 161 | sample: Sample name 162 | accession: SRA accession 163 | step: Step name 164 | msg: Message 165 | """ 166 | if len(msg) > 100: 167 | msg = msg[:100] + '...' 168 | logF.write(','.join([sample, accession, step, msg]) + '\n') 169 | 170 | def prefetch_workflow(sample: str, accession: str, log_df: pd.DataFrame, outdir:str, 171 | gcp_download: bool=False, tries: int=3, max_size_gb: float=1000) -> Optional[str]: 172 | """ 173 | Run prefetch workflow. 
174 | Args: 175 | sample: Sample name 176 | accession: SRA accession 177 | log_df: Log dataframe 178 | outdir: Output directory 179 | gcp_download: Use GCP mirror 180 | tries: Number of tries 181 | max_size_gb: Max file size in Gb 182 | """ 183 | # check for prefetch in path 184 | for exe in ['prefetch', 'vdb-dump']: 185 | if not which(exe): 186 | logging.error(f'{exe} not found in PATH') 187 | sys.exit(1) 188 | 189 | # run vdb-config 190 | if gcp_download: 191 | status,msg = run_vdb_config() 192 | add_to_log(log_df, sample, accession, "prefetch", "vdb-config", status, msg) 193 | 194 | # run vdb-dump 195 | status,msg = run_vdb_dump(accession) 196 | add_to_log(log_df, sample, accession, "prefetch", "vdb-dump", status, msg) 197 | if status != "Success": 198 | logging.warning(f'vdb-dump validation failed: {msg}') 199 | return None 200 | 201 | # run prefetch 202 | status,msg = prefetch(accession, tries, max_size_gb, outdir) 203 | add_to_log(log_df, sample, accession, "prefetch", "prefetch", status, msg) 204 | if status != "Success": 205 | logging.warning(f'Failed to download: {msg}') 206 | return None 207 | 208 | # print output file size 209 | sra_file = os.path.join(outdir, accession) 210 | if not os.path.exists(sra_file): 211 | logging.warning(f'File not found: {sra_file}') 212 | return None 213 | file_size = os.path.getsize(sra_file) 214 | logging.info(f"SRA file size: {file_size / 1e9:.3f} GB") 215 | 216 | # return output file 217 | return sra_file 218 | 219 | ## script main 220 | if __name__ == '__main__': 221 | # arg parse 222 | args = parser.parse_args() 223 | 224 | # setup 225 | os.makedirs(args.outdir, exist_ok=True) 226 | log_df = pd.DataFrame( 227 | columns=["sample", "accession", "process", "step", "status", "message"] 228 | ) 229 | 230 | # run workflow 231 | prefetch_workflow( 232 | args.sample, args.accession, log_df, 233 | outdir=args.outdir, 234 | gcp_download=args.gcp_download, 235 | tries=args.tries, 236 | max_size_gb=args.max_size_gb 237 | ) 238 | 239 | # write log 240 | log_file = os.path.join(args.outdir, "prefetch_log.csv") 241 | log_df.to_csv(log_file, index=False) 242 | logging.info(f'Log written to: {log_file}') 243 | 244 | # upsert log to database 245 | with db_connect() as conn: 246 | db_upsert(log_df, "screcounter_log", conn) 247 | -------------------------------------------------------------------------------- /scripts/tiledb-loader/bin/find-mtx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import 3 | ## batteries 4 | import os 5 | import logging 6 | import argparse 7 | from pathlib import Path 8 | from itertools import chain, repeat 9 | from typing import List, Set, Tuple, Optional 10 | ## 3rd party 11 | import pandas as pd 12 | import tiledbsoma 13 | import tiledbsoma.io 14 | import scanpy as sc 15 | 16 | # format logging 17 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 18 | logging.getLogger("tiledbsoma").setLevel(logging.WARNING) 19 | logging.getLogger("tiledbsoma.io").setLevel(logging.WARNING) 20 | logging.getLogger("tiledb").setLevel(logging.WARNING) 21 | 22 | # classes 23 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): 24 | pass 25 | 26 | # functions 27 | def parse_arguments() -> argparse.Namespace: 28 | """ 29 | Parse command-line arguments. 30 | """ 31 | desc = 'Find scRNA-seq count matrix files for TileDB loader.' 
32 | epi = """DESCRIPTION: 33 | """ 34 | parser = argparse.ArgumentParser(description=desc, epilog=epi, formatter_class=CustomFormatter) 35 | parser.add_argument( 36 | 'base_dir', type=str, help='Base directory to search for input data files' 37 | ) 38 | parser.add_argument( 39 | '--feature-type', default='GeneFull_Ex50pAS', 40 | choices=['Gene', 'GeneFull', 'GeneFull_Ex50pAS', 'GeneFull_ExonOverIntron', 'Velocyto', None], 41 | help='Feature type to process' 42 | ) 43 | parser.add_argument( 44 | '--raw', action='store_true', default=False, 45 | help='Use raw count matrix files instead of filtered' 46 | ) 47 | parser.add_argument( 48 | '--db-uri', type=str, default="tiledb_exp", 49 | help='URI of existing TileDB database, or it will be created if it does not exist' 50 | ) 51 | parser.add_argument( 52 | '--batch-size', type=int, default=8, help='batch size for downstream processing' 53 | ) 54 | parser.add_argument( 55 | '--max-datasets', type=int, default=None, 56 | help='Maximum number of datasets to process' 57 | ) 58 | parser.add_argument( # TODO: implement => https://github.com/alexdobin/STAR/blob/master/extras/scripts/soloBasicCellFilter.awk 59 | '--multi-mapper', default='None', choices=['None', 'EM', 'uniform'], 60 | help='Multi-mapper strategy to use' 61 | ) 62 | return parser.parse_args() 63 | 64 | def get_existing_srx_ids(db_uri: str) -> Set[str]: 65 | """ 66 | Read metadata from existing database and return set of SRX IDs. 67 | Args: 68 | db_uri: URI of the TileDB database 69 | Returns: 70 | Set of SRX IDs already in the database 71 | """ 72 | logging.info(f"Checking for existing SRX accessions in {db_uri}...") 73 | 74 | srx = set() 75 | if not os.path.exists(db_uri): 76 | logging.info("Database does not exist yet. No SRX/ERX accessions to obtain.") 77 | else: 78 | with tiledbsoma.open(db_uri) as exp: 79 | try: 80 | metadata = (exp.obs.read(column_names=["SRX_accession"]) 81 | .concat() 82 | .group_by(["SRX_accession"]) 83 | .aggregate([ 84 | ([], 'count_all'), 85 | ]) 86 | .to_pandas()) 87 | srx = set(metadata["SRX_accession"].unique()) 88 | except tiledbsoma._exception.DoesNotExistError: 89 | metadata = (exp.obs.read(column_names=["SRX_accession"]) 90 | .concat() 91 | .to_pandas()) 92 | srx = set(metadata["SRX_accession"].unique()) 93 | # status 94 | logging.info(f" Found {len(srx)} existing SRX/ERX accessions.") 95 | return srx 96 | 97 | def find_matrix_files( 98 | base_dir: str, 99 | feature_type: str, 100 | existing_srx: Set[str], 101 | multi_mapper: str='None', 102 | raw: bool=False, 103 | max_datasets: Optional[int]=None 104 | ) -> List[tuple]: 105 | """ 106 | Recursively find matrix.mtx.gz files and extract SRX/ERX IDs. 
107 | Args: 108 | base_dir: Base directory to search 109 | feature_type: 'Gene' or 'GeneFull' 110 | existing_srx: Set of existing SRX IDs 111 | multi_mapper: 'EM', 'uniform', or 'None' 112 | raw: Use raw count matrix files instead of filtered 113 | max_datasets: Maximum number of datasets to process 114 | Returns: 115 | List of tuples (matrix_path, srx_id) 116 | """ 117 | logging.info(f"Searching for new data files in {base_dir}...") 118 | base_path = Path(base_dir) 119 | subdir = 'raw' if raw else 'filtered' 120 | results = [] 121 | stats = {'found': 0, 'exists': 0, 'permissions': 0, 'mtx_file_missing': 0, 'novel': 0} 122 | 123 | # Determine which matrix file to look for based on multi_mapper 124 | if multi_mapper == 'None': 125 | matrix_filename = 'matrix.mtx.gz' 126 | elif multi_mapper == 'EM': 127 | matrix_filename = 'UniqueAndMult-EM.mtx.gz' 128 | elif multi_mapper == 'uniform': 129 | matrix_filename = 'UniqueAndMult-Uniform.mtx.gz' 130 | else: 131 | raise ValueError(f"Invalid multi-mapper strategy: {multi_mapper}") 132 | 133 | # Walk through directory structure 134 | num_dirs = 0 135 | for srx_dir in chain(base_path.glob('**/SRX*'), base_path.glob('**/ERX*')): 136 | # skip files 137 | if not srx_dir.is_dir(): 138 | continue 139 | else: 140 | stats['found'] += 1 141 | 142 | # status 143 | num_dirs += 1 144 | if num_dirs % 1000 == 0: 145 | logging.info(f" Searched {num_dirs} SRX directories so far...") 146 | 147 | # Check if SRX directory exists in database 148 | if srx_dir.name in existing_srx: 149 | stats['exists'] += 1 150 | continue 151 | 152 | # Find target matrix file in SRX directory 153 | for mtx_file in srx_dir.glob(f'**/{matrix_filename}'): 154 | hit = None 155 | # check for `feature_type/subdir` in file path 156 | for i,x in enumerate(mtx_file.parts): 157 | try: 158 | if feature_type in x and mtx_file.parts[i+1] == subdir: 159 | hit = True 160 | break 161 | except IndexError: 162 | continue 163 | # if target file found, check if it exists, and add to results 164 | if hit: 165 | try: 166 | if not mtx_file.exists(): 167 | stats['mtx_file_missing'] += 1 168 | else: 169 | stats['novel'] += 1 170 | results.append([mtx_file, srx_dir.name]) 171 | except PermissionError: 172 | logging.warning(f"Permission denied for {mtx_file}. Skipping.") 173 | stats['permissions'] += 1 174 | break 175 | 176 | # Check max datasets 177 | if max_datasets and len(results) >= max_datasets: 178 | logging.info(f" Found --max-datasets datasets. Stopping search.") 179 | break 180 | 181 | # Status 182 | logging.info(f" {stats['found']} total SRX directories found (total).") 183 | logging.info(f" {stats['exists']} existing SRX directories found (skipped).") 184 | logging.info(f" {stats['mtx_file_missing']} missing matrix files (skipped).") 185 | logging.info(f" {stats['permissions']} directories with permission errors (skipped).") 186 | logging.info(f" {stats['novel']} novel SRX directories found (final).") 187 | return results 188 | 189 | def make_batch(num_repeats: int, total_numbers: int) -> List[int]: 190 | """ 191 | Bin numbers into batches of num_repeats. 
192 | Args: 193 | num_repeats: Number of repeats per unique number 194 | total_numbers: Total number of unique numbers 195 | Returns: 196 | List of batch numbers 197 | """ 198 | batch_counts = [] 199 | unique_count = int(round(total_numbers / num_repeats + 0.5)) 200 | for i in range(1, unique_count + 1): 201 | batch_counts.extend(repeat(i, num_repeats)) 202 | return batch_counts[:total_numbers] 203 | 204 | def main(): 205 | """Main function to run the TileDB loader workflow.""" 206 | args = parse_arguments() 207 | 208 | # Get existing SRX IDs 209 | existing_srx = get_existing_srx_ids(args.db_uri) 210 | 211 | # Find all matrix files and their corresponding SRX IDs 212 | matrix_files = find_matrix_files( 213 | args.base_dir, args.feature_type, existing_srx, 214 | multi_mapper=args.multi_mapper, 215 | raw=args.raw, 216 | max_datasets=args.max_datasets 217 | ) 218 | 219 | # write as csv 220 | df = pd.DataFrame(matrix_files, columns=['matrix_path', 'srx']) 221 | df["batch"] = make_batch(args.batch_size, df.shape[0]) 222 | df.to_csv('mtx_files.csv', index=False) 223 | logging.info(f"File written: mtx_files.csv") 224 | 225 | if __name__ == "__main__": 226 | from dotenv import load_dotenv 227 | load_dotenv(override=True) 228 | main() -------------------------------------------------------------------------------- /scripts/acc2srr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # import 3 | import os 4 | import io 5 | import csv 6 | import sys 7 | import argparse 8 | from time import sleep 9 | from typing import List, Dict 10 | from urllib.error import HTTPError 11 | from dotenv import load_dotenv 12 | import pandas as pd 13 | from Bio import Entrez 14 | from pysradb.sraweb import SRAweb 15 | 16 | 17 | # argparse 18 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 19 | argparse.RawDescriptionHelpFormatter): 20 | pass 21 | 22 | desc = 'Convert accessions to SRR accessions' 23 | epi = """DESCRIPTION: 24 | Convert SRP, GSE, or other accessions to SRR accessions. 25 | If NCBI_API_KEY is set in the environment, it will be used as the API key. 
26 | """ 27 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 28 | formatter_class=CustomFormatter) 29 | parser.add_argument('accession_file', type=str, 30 | help='Text file with accessions; 1 per line') 31 | parser.add_argument('--email', type=str, default=None, 32 | help='Email address for Entrez') 33 | parser.add_argument('--batch-size', type=int, default=50, 34 | help='Batch size for fetching') 35 | parser.add_argument('--outfile', type=str, default='srr_accessions.csv', 36 | help='Output file name') 37 | 38 | # functions 39 | def load_accessions(accession_file: str) -> List[str]: 40 | """ 41 | Load accessions from file 42 | Args: 43 | accession_file: File with accessions 44 | Returns: 45 | List of accessions 46 | """ 47 | accessions = [] 48 | with open(accession_file) as inF: 49 | for line in inF: 50 | line = line.strip().split(',')[0] 51 | if line == "" or line.startswith("#"): 52 | continue 53 | accessions.append(line) 54 | return accessions 55 | 56 | def esearch_batch(db, accession, batch_size = 50, ntries=3, sleep_time=5) -> List[str]: 57 | """ 58 | Entrez esearch in batches 59 | Args: 60 | db: Database to search 61 | accession: Accession to search 62 | batch_size: Batch size for fetching 63 | ntries: Number of tries before giving up 64 | sleep_time: Sleep time between retries 65 | Returns: 66 | List of unique IDs 67 | """ 68 | print(f"esearch of {db} for: {accession}", file=sys.stderr) 69 | results = [] 70 | # Initial search to get the total count of records 71 | handle = Entrez.esearch(db=db, term=accession, usehistory="y", retmax=1) 72 | record = Entrez.read(handle) 73 | handle.close() 74 | results += record["IdList"] if record["IdList"] else [] 75 | total_records = int(record["Count"]) 76 | print(f" Total records: {total_records}", file=sys.stderr) 77 | 78 | # Retrieve results in batches 79 | for start in range(0, total_records, batch_size): 80 | print(f" Fetching records {start+1}-{min(start+batch_size, total_records)}", file=sys.stderr) 81 | for i in range(ntries): 82 | try: 83 | handle = Entrez.esearch(db=db, term=accession, retstart=start, retmax=batch_size, usehistory="y") 84 | record = Entrez.read(handle) 85 | handle.close() 86 | if "IdList" in record: 87 | results += record["IdList"] 88 | sleep(0.5) # comply with NCBI rate limits 89 | except Exception as e: 90 | print(f" Attempt {i+1}/{ntries}: Error encountered: {e}", file=sys.stderr) 91 | sleep(sleep_time * (i+1)) 92 | 93 | # Return unique IDs 94 | return list(set(results)) 95 | 96 | def efetch_batch(db, idlist, batch_size=20, rettype="runinfo", retmode="text", ntries=3, sleep_time=5 97 | ) -> List[pd.DataFrame]: 98 | """ 99 | Entrez efetch in batches 100 | Args: 101 | db: Database to search 102 | idlist: List of IDs to fetch 103 | batch_size: Batch size for fetching 104 | rettype: Return type 105 | retmode: Return mode 106 | ntries: Number of tries before giving up 107 | sleep_time: Sleep time between retries 108 | Returns: 109 | List of dataframes 110 | """ 111 | print(f"efetch of {db} for: {len(idlist)} IDs", file=sys.stderr) 112 | results = [] 113 | for start in range(0, len(idlist), batch_size): 114 | print(f" Fetching batch {start+1}-{min(start+batch_size, len(idlist))}", file=sys.stderr) 115 | batch_ids = ",".join(idlist[start:start + batch_size]) # Get current batch of IDs 116 | batch_result = None 117 | for i in range(ntries): # Retry logic for each batch 118 | try: 119 | handle = Entrez.efetch(db=db, id=batch_ids, rettype=rettype, retmode=retmode) 120 | batch_result = handle.read() 121 | 
handle.close() 122 | # convert to dataframe 123 | df = pd.read_csv(io.StringIO(batch_result.decode('utf-8'))) 124 | results.append(df) 125 | sleep(0.5) # comply with NCBI rate limits 126 | break # Exit retry loop on success 127 | except HTTPError as e: 128 | print(f" Attempt {i+1}/{ntries}: HTTPError for batch {start}-{start+batch_size}: {e}", file=sys.stderr) 129 | sleep(sleep_time * (i + 1)) # Progressive wait time before retry 130 | continue 131 | if batch_result is None: 132 | print(f" Failed to fetch batch {start}-{start+batch_size}", file=sys.stderr) 133 | return results 134 | 135 | def fetch_srr_from_srp(accession, batch_size=50, ntries=3, sleep_time=5) -> pd.DataFrame: 136 | """ 137 | Fetch SRR accessions from SRP 138 | Args: 139 | accession: SRP accession 140 | batch_size: Batch size for fetching 141 | ntries: Number of tries before giving up 142 | sleep_time: Sleep time between retries 143 | Returns: 144 | Dataframe with SRR accessions 145 | """ 146 | # Search the SRA database for the SRP accession 147 | idlist = esearch_batch("sra", accession, batch_size=batch_size, ntries=ntries, sleep_time=sleep_time) 148 | # get IDs from record 149 | if len(idlist) == 0: 150 | print(f"No records found for accession: {accession}", file=sys.stderr) 151 | return [] 152 | # Fetch run info to get SRR accessions 153 | results = efetch_batch("sra", idlist, batch_size=batch_size, ntries=ntries, sleep_time=sleep_time) 154 | # concat dataframes 155 | df = pd.concat(results) 156 | # return specific columns 157 | to_keep = [ 158 | "Sample", "Run", "Experiment", "SRAStudy", "BioProject", 159 | "spots", "spots_with_mates", "avgLength", "size_MB" 160 | ] 161 | df = df[to_keep].rename(columns={ 162 | "Sample" : "sample", 163 | "Run" : "accession", 164 | "Experiment" : "experiment", 165 | "SRAStudy" : "sra_study", 166 | "BioProject" : "bioproject", 167 | "avgLength" : "avg_length", 168 | "size_MB" : "size_mb" 169 | }) 170 | # getting just unique for for "accession" 171 | return df.drop_duplicates(subset=["accession"]) 172 | 173 | def gse_to_srp(accession: str) -> str: 174 | """ 175 | Use pysradb to convert GSE to SRP 176 | Args: 177 | accession: GSE accession 178 | Returns: 179 | SRP accession 180 | """ 181 | sradb = SRAweb() 182 | df = sradb.gse_to_srp( 183 | [accession], 184 | detailed=False, 185 | sample_attribute=False, 186 | expand_sample_attributes=False, 187 | ) 188 | srp_accession = df["study_accession"].tolist()[0] 189 | print(f"Converted GSE to SRP: {srp_accession}", file=sys.stderr) 190 | return srp_accession 191 | 192 | def gsm_to_srp(accession: str) -> str: 193 | """ 194 | Use pysradb to convert GSM to SRP 195 | Args: 196 | accession: GSM accession 197 | Returns: 198 | SRP accession 199 | """ 200 | sradb = SRAweb() 201 | df = sradb.gsm_to_srp( 202 | [accession], 203 | detailed=False, 204 | sample_attribute=False, 205 | expand_sample_attributes=False, 206 | ) 207 | srp_accession = df["study_accession"].tolist()[0] 208 | print(f"Converted GSM to SRP: {srp_accession}", file=sys.stderr) 209 | return srp_accession 210 | 211 | def convert_to_srp(accession: str) -> str: 212 | """ 213 | Convert GSE or GSM to SRP 214 | Args: 215 | accession: GSE or GSM accession 216 | Returns: 217 | SRP accession 218 | """ 219 | if accession.startswith('GSE'): 220 | try: 221 | return gse_to_srp(accession) 222 | except Exception as e: 223 | print(f"Error converting GSE to SRP: {e}", file=sys.stderr) 224 | return None 225 | elif accession.startswith('GSM'): 226 | try: 227 | return gsm_to_srp(accession) 228 | except 
Exception as e: 229 | print(f"Error converting GSM to SRP: {e}", file=sys.stderr) 230 | return None 231 | else: 232 | print(f"Accession type not recognized: {accession}", file=sys.stderr) 233 | return None 234 | 235 | def fetch_srr_from_accession(accession: str, batch_size: int) -> List[pd.DataFrame]: 236 | """ 237 | Fetch SRR accessions from SRP or GSE 238 | Args: 239 | accession: SRP or GSE accession 240 | batch_size: Batch size for fetching 241 | Returns: 242 | List of dataframes with SRR accession info 243 | """ 244 | print(f"#-- Fetching SRR accessions for: {accession} --#", file=sys.stderr) 245 | if accession.startswith('GSE') or accession.startswith('GSM'): 246 | # convert GSE to SRP 247 | srp_accession = convert_to_srp(accession) 248 | df = fetch_srr_from_srp(srp_accession) 249 | elif accession.startswith('SRP'): 250 | # fetch SRR from SRP 251 | df = fetch_srr_from_srp(accession) 252 | else: 253 | print(f"Accession type not recognized: {accession}", file=sys.stderr) 254 | return None 255 | # add query accession 256 | df["query_accession"] = accession 257 | # move query accession to first column 258 | cols = df.columns.tolist() 259 | cols = cols[-1:] + cols[:-1] 260 | return df[cols] 261 | 262 | def main(args): 263 | # load accessions 264 | accessions = load_accessions(args.accession_file) 265 | 266 | # set email 267 | if args.email: 268 | Entrez.email = args.email 269 | # set API key 270 | if 'NCBI_API_KEY' in os.environ: 271 | Entrez.api_key = os.environ['NCBI_API_KEY'] 272 | 273 | # get SRR accessions 274 | srr_accessions = [] 275 | for accession in accessions: 276 | srr_accessions.append( 277 | fetch_srr_from_accession(accession, batch_size=args.batch_size) 278 | ) 279 | 280 | # concat list of dataframes 281 | srr_accessions = pd.concat(srr_accessions) 282 | 283 | # write table 284 | srr_accessions.to_csv(args.outfile, sep=',', index=False) 285 | print(f"Saved SRR accessions to: {args.outfile}", file=sys.stderr) 286 | 287 | 288 | ## script main 289 | if __name__ == '__main__': 290 | args = parser.parse_args() 291 | load_dotenv() 292 | main(args) --------------------------------------------------------------------------------