├── .coveragerc
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   ├── docs.yml
    │   ├── publish-to-pypi.yml
    │   └── test_linux.yml
├── .gitignore
├── .markdownlint.yaml
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── codecov.yml
├── docs
    ├── Makefile
    ├── make.bat
    ├── requirements.txt
    └── source
    │   ├── 10x.md
    │   ├── _static
    │       └── schematic.svg
    │   ├── advanced_usage.md
    │   ├── api.rst
    │   ├── api
    │       ├── cauchy_combination.rst
    │       ├── create_slice_mean.rst
    │       ├── find_latent_representations.rst
    │       ├── format_sumstats.rst
    │       ├── generate_ldscore.rst
    │       ├── latent_to_gene.rst
    │       ├── quick_mode.rst
    │       ├── report.rst
    │       └── spatial_ldsc.rst
    │   ├── conf.py
    │   ├── data.rst
    │   ├── data_format.md
    │   ├── index.rst
    │   ├── install.rst
    │   ├── quick_mode.md
    │   ├── release.rst
    │   ├── step_by_step.md
    │   └── tutorials.rst
├── pyproject.toml
├── schematic.png
├── src
    └── gsMap
    │   ├── GNN
    │       ├── __init__.py
    │       ├── adjacency_matrix.py
    │       ├── model.py
    │       └── train.py
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── cauchy_combination_test.py
    │   ├── config.py
    │   ├── create_slice_mean.py
    │   ├── diagnosis.py
    │   ├── find_latent_representation.py
    │   ├── format_sumstats.py
    │   ├── generate_ldscore.py
    │   ├── latent_to_gene.py
    │   ├── main.py
    │   ├── report.py
    │   ├── run_all_mode.py
    │   ├── setup.py
    │   ├── spatial_ldsc_multiple_sumstats.py
    │   ├── templates
    │       └── report_template.html
    │   ├── utils
    │       ├── __init__.py
    │       ├── generate_r2_matrix.py
    │       ├── jackknife.py
    │       ├── manhattan_plot.py
    │       └── regression_read.py
    │   └── visualize.py
├── tests
    ├── conftest.py
    ├── test_advanced_usage.py
    ├── test_cli.py
    └── test_docs_cli_parsing.py
└── visualization_web_docs
    ├── Makefile
    ├── make.bat
    ├── requirements.txt
    └── source
        ├── _static
            ├── raw1_add_txt.svg
            ├── raw2_add_txt.svg
            ├── raw3_add_txt.svg
            ├── raw4_add_txt.svg
            ├── raw5_add_txt.svg
            └── schematic.svg
        ├── conf.py
        └── index.rst


/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | source = src/gsMap
 3 | omit =
 4 |     # Exclude the files you mentioned
 5 |     src/gsMap/utils/jackknife.py
 6 |     src/gsMap/format_sumstats.py
 7 |     # Other files you might want to exclude
 8 |     */tests/*
 9 |     */__init__.py
10 |     src/gsMap/templates/*
11 |     # Additional excludes
12 |     src/gsMap/__main__.py
13 |     src/gsMap/setup.py
14 | 
15 | [report]
16 | exclude_lines =
17 |     # Have to re-enable the standard pragma
18 |     pragma: no cover
19 | 
20 |     # Don't complain about missing debug-only code
21 |     def __repr__
22 |     if self\.debug
23 | 
24 |     # Don't complain if tests don't hit defensive assertion code
25 |     raise NotImplementedError
26 |     raise ValueError
27 |     except ImportError
28 |     except Exception
29 |     except:
30 | 
31 |     # Don't complain if non-runnable code isn't run
32 |     if 0:
33 |     if __name__ == .__main__.:
34 |     if False:
35 | 
36 |     # Skip pass statements
37 |     pass
38 | 
39 | [paths]
40 | source =
41 |     src/gsMap
42 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve gsMap
 4 | title: ""
 5 | labels: bug
 6 | assignees: ""
 7 | ---
 8 | 
 9 | **Describe the bug**
10 | A clear and concise description of what the bug is.
11 | 
12 | **To Reproduce**
13 | 
14 | ```bash
15 | # Paste your gsmap script here
16 | ```
17 | 
18 | **Error messages/logs**
19 | 
20 | ```
21 | Paste the error messages or logs here
22 | ```
23 | 
24 | **Environment (please complete the following information):**
25 |  - Python version: [output of python --version]
26 |  - gsMap version: [output of gsmap --version]
27 |  - OS: [e.g. Ubuntu 22.04, macOS 13]
28 | 
29 | **Input data information (if applicable):**
30 | - ST data description: [e.g. 10x Visium, Stero-seq, etc.]
31 | - Data dimensions: [e.g. 10,000 spots × 20,000 genes]
32 | 
33 | **Additional context**
34 | Add any other context about the problem here.
35 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for gsMap
 4 | title: '[FEATURE] '
 5 | labels: 'enhancement'
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Use case**
20 | Describe the use case for this feature. How would it benefit users of gsMap?
21 | 
22 | **Relevant literature or methods**
23 | If applicable, provide references to relevant papers, methods, or algorithms that support this feature.
24 | 
25 | **Additional context**
26 | Add any other context or screenshots about the feature request here.
27 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: docs
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - "docs/**"
 7 |       - "visualization_web_docs/**"
 8 | 
 9 | jobs:
10 |   deploy_docs:
11 |     runs-on: self-hosted
12 | 
13 |     steps:
14 |       - name: Checkout code
15 |         uses: actions/checkout@v4
16 | 
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           python-version: "3.12"
21 | 
22 |       # --- Build 'docs' documentation ---
23 |       - name: Install 'docs' dependencies
24 |         run: |
25 |           python -m pip install '.[doc]'
26 | 
27 |       - name: Build 'docs' documentation
28 |         working-directory: ./docs
29 |         run: |
30 |           make html
31 | 
32 |       - name: Deploy 'docs' to website_docs
33 |         run: |
34 |           DOCS_OUTPUT_DIR="/mnt/website_docs/gsmap"  # Adjust if you want a different subfolder
35 |           mkdir -p "$DOCS_OUTPUT_DIR"
36 |           rsync -avz ./docs/build/html/ "$DOCS_OUTPUT_DIR/"
37 |           echo "Documentation for 'docs' deployed to: $DOCS_OUTPUT_DIR"
38 | 
39 | #      # --- Build 'visualization_web_docs' documentation ---
40 | #      - name: Install 'visualization_web_docs' dependencies
41 | #        working-directory: ./visualization_web_docs
42 | #        run: |
43 | #          python -m pip install -r requirements.txt
44 | #
45 | #      - name: Build 'visualization_web_docs' documentation
46 | #        working-directory: ./visualization_web_docs
47 | #        run: |
48 | #          make html
49 | #
50 | #      - name: Deploy 'visualization_web_docs' to website_docs
51 | #        run: |
52 | #          DOCS_OUTPUT_DIR="/mnt/website_docs/visualization_web_docs" # Deploy to a separate folder
53 | #          mkdir -p "$DOCS_OUTPUT_DIR"
54 | #          rsync -avz ./visualization_web_docs/build/html/ "$DOCS_OUTPUT_DIR/"
55 | #          echo "Documentation for 'visualization_web_docs' deployed to: $DOCS_OUTPUT_DIR"
56 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
 2 | 
 3 | on: push
 4 | 
 5 | jobs:
 6 |   build:
 7 |     name: Build distribution 📦
 8 |     runs-on: ubuntu-latest
 9 | 
10 |     steps:
11 |       - uses: actions/checkout@v4
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v4
14 |         with:
15 |           python-version: "3.x"
16 |       - name: Install pypa/build
17 |         run: >-
18 |           python3 -m
19 |           pip install
20 |           flit
21 |           --user
22 |       - name: Build a binary wheel and a source tarball
23 |         run: |
24 |           flit build
25 |           ls -lh dist
26 |       - name: Store the distribution packages
27 |         if: startsWith(github.ref, 'refs/tags/') # store only on tag pushes
28 |         uses: actions/upload-artifact@v4
29 |         with:
30 |           name: python-package-distributions
31 |           path: dist/
32 | 
33 |   publish-to-pypi:
34 |     name: >-
35 |       Publish Python 🐍 distribution 📦 to PyPI
36 |     if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes
37 |     needs:
38 |       - build
39 |     runs-on: ubuntu-latest
40 | 
41 |     environment:
42 |       name: pypi
43 |       url: https://pypi.org/p/gsMap
44 |     permissions:
45 |       id-token: write # IMPORTANT: mandatory for trusted publishing
46 | 
47 |     steps:
48 |       - name: Download all the dists
49 |         uses: actions/download-artifact@v4
50 |         with:
51 |           name: python-package-distributions
52 |           path: dist/
53 |       - name: Publish distribution 📦 to PyPI
54 |         uses: pypa/gh-action-pypi-publish@release/v1
55 | 
56 |   github-release:
57 |     name: >-
58 |       Sign the Python 🐍 distribution 📦 with Sigstore
59 |       and upload them to GitHub Release
60 |     needs:
61 |       - publish-to-pypi
62 |     runs-on: ubuntu-latest
63 | 
64 |     permissions:
65 |       contents: write # IMPORTANT: mandatory for making GitHub Releases
66 |       id-token: write # IMPORTANT: mandatory for sigstore
67 | 
68 |     steps:
69 |       - name: Download all the dists
70 |         uses: actions/download-artifact@v4
71 |         with:
72 |           name: python-package-distributions
73 |           path: dist/
74 |       - name: Sign the dists with Sigstore
75 |         uses: sigstore/gh-action-sigstore-python@v3.0.0
76 |         with:
77 |           inputs: >-
78 |             ./dist/*.tar.gz
79 |             ./dist/*.whl
80 |       - name: Create GitHub Release
81 |         env:
82 |           GITHUB_TOKEN: ${{ github.token }}
83 |         run: >-
84 |           gh release create
85 |           '${{ github.ref_name }}'
86 |           --repo '${{ github.repository }}'
87 |           --notes ""
88 |       - name: Upload artifact signatures to GitHub Release
89 |         env:
90 |           GITHUB_TOKEN: ${{ github.token }}
91 |         # Upload to GitHub Release using the `gh` CLI.
92 |         # `dist/` contains the built packages, and the
93 |         # sigstore-produced signatures and certificates.
94 |         run: >-
95 |           gh release upload
96 |           '${{ github.ref_name }}' dist/**
97 |           --repo '${{ github.repository }}'
98 | 


--------------------------------------------------------------------------------
/.github/workflows/test_linux.yml:
--------------------------------------------------------------------------------
  1 | name: test
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main, "[0-9]+.[0-9]+.x"]
  6 |   pull_request:
  7 |   schedule:
  8 |     - cron: "0 0 * * *"
  9 |   workflow_dispatch:
 10 | 
 11 | concurrency:
 12 |   group: ${{ github.workflow }}-${{ github.ref }}
 13 |   cancel-in-progress: true
 14 | 
 15 | jobs:
 16 |   test:
 17 |     runs-on: ubuntu-latest
 18 | 
 19 |     defaults:
 20 |       run:
 21 |         shell: bash -e {0} # -e to fail on error
 22 | 
 23 |     strategy:
 24 |       fail-fast: false
 25 |       matrix:
 26 |         python: ["3.10", "3.13"]
 27 | 
 28 |     name: Python ${{ matrix.python }} integration
 29 | 
 30 |     env:
 31 |       PYTHON: ${{ matrix.python }}
 32 |       TEST_DATA_URL: https://yanglab.westlake.edu.cn/data/gsMap/gsMap_test_data.tar.gz
 33 |       TEST_DATA_DIR: ${{ github.workspace }}/test_data
 34 |       WORK_DIR: ${{ github.workspace }}/gsmap_workdir
 35 | 
 36 |     steps:
 37 |       - name: Checkout code
 38 |         uses: actions/checkout@v4
 39 | 
 40 |       - name: Install uv
 41 |         uses: astral-sh/setup-uv@v5
 42 | 
 43 |       - name: "Set up Python"
 44 |         uses: actions/setup-python@v5
 45 |         with:
 46 |           python-version: ${{ matrix.python }}
 47 | 
 48 |       - name: Install dependencies
 49 |         run: |
 50 |           uv pip install --system -e ".[tests]"
 51 | 
 52 |       - name: Create workdir
 53 |         run: |
 54 |           mkdir -p $WORK_DIR
 55 |           echo "Created workdir: $WORK_DIR"
 56 | 
 57 |       - name: Cache test data
 58 |         uses: actions/cache@v3
 59 |         id: cache-test-data
 60 |         with:
 61 |           path: ${{ env.TEST_DATA_DIR }}
 62 |           key: test-data-v1
 63 | 
 64 |       - name: Download and extract test data
 65 |         if: steps.cache-test-data.outputs.cache-hit != 'true'
 66 |         run: |
 67 |           echo "Downloading test data from $TEST_DATA_URL"
 68 |           curl -L $TEST_DATA_URL -o gsMap_test_data.tar.gz
 69 |           tar -xzf gsMap_test_data.tar.gz -C ${{ github.workspace }}
 70 |           rm gsMap_test_data.tar.gz
 71 |           echo "Test data extracted to ${{ github.workspace }}"
 72 |           ls -la $TEST_DATA_DIR
 73 | 
 74 |       - name: Run pytest
 75 |         env:
 76 |           MPLBACKEND: agg
 77 |           DISPLAY: :0
 78 |           COLUMNS: 120
 79 |         run: |
 80 |           python -m pytest --cov=src \
 81 |               --junitxml=junit.xml -o junit_family=legacy \
 82 |               --cov-report=term-missing \
 83 |               --cov-report=xml \
 84 |               --cov-config=.coveragerc \
 85 |               -v -s --color=yes \
 86 |               --run-real-data \
 87 |               --work-dir=$WORK_DIR \
 88 |               --test-data=$TEST_DATA_DIR
 89 | 
 90 |       - uses: codecov/codecov-action@v4
 91 |         with:
 92 |           token: ${{ secrets.CODECOV_TOKEN }}
 93 |           files: ./coverage.xml
 94 |           fail_ci_if_error: false
 95 | 
 96 |       - name: Upload test results to Codecov
 97 |         if: ${{ !cancelled() }}
 98 |         uses: codecov/test-results-action@v1
 99 |         with:
100 |           token: ${{ secrets.CODECOV_TOKEN }}
101 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ### JetBrains+all template
  2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
  3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  4 | 
  5 | # User-specific stuff
  6 | .idea/**/workspace.xml
  7 | .idea/**/tasks.xml
  8 | .idea/**/usage.statistics.xml
  9 | .idea/**/dictionaries
 10 | .idea/**/shelf
 11 | 
 12 | # AWS User-specific
 13 | .idea/**/aws.xml
 14 | 
 15 | # Generated files
 16 | .idea/**/contentModel.xml
 17 | 
 18 | # Sensitive or high-churn files
 19 | .idea/**/dataSources/
 20 | .idea/**/dataSources.ids
 21 | .idea/**/dataSources.local.xml
 22 | .idea/**/sqlDataSources.xml
 23 | .idea/**/dynamic.xml
 24 | .idea/**/uiDesigner.xml
 25 | .idea/**/dbnavigator.xml
 26 | 
 27 | # Gradle
 28 | .idea/**/gradle.xml
 29 | .idea/**/libraries
 30 | 
 31 | # Gradle and Maven with auto-import
 32 | # When using Gradle or Maven with auto-import, you should exclude module files,
 33 | # since they will be recreated, and may cause churn.  Uncomment if using
 34 | # auto-import.
 35 | # .idea/artifacts
 36 | # .idea/compiler.xml
 37 | # .idea/jarRepositories.xml
 38 | # .idea/modules.xml
 39 | # .idea/*.iml
 40 | # .idea/modules
 41 | # *.iml
 42 | # *.ipr
 43 | 
 44 | # CMake
 45 | cmake-build-*/
 46 | 
 47 | # Mongo Explorer plugin
 48 | .idea/**/mongoSettings.xml
 49 | 
 50 | # File-based project format
 51 | *.iws
 52 | 
 53 | # IntelliJ
 54 | out/
 55 | 
 56 | # mpeltonen/sbt-idea plugin
 57 | .idea_modules/
 58 | 
 59 | # JIRA plugin
 60 | atlassian-ide-plugin.xml
 61 | 
 62 | # Cursive Clojure plugin
 63 | .idea/replstate.xml
 64 | 
 65 | # SonarLint plugin
 66 | .idea/sonarlint/
 67 | 
 68 | # Crashlytics plugin (for Android Studio and IntelliJ)
 69 | com_crashlytics_export_strings.xml
 70 | crashlytics.properties
 71 | crashlytics-build.properties
 72 | fabric.properties
 73 | 
 74 | # Editor-based Rest Client
 75 | .idea/httpRequests
 76 | 
 77 | # Android studio 3.1+ serialized cache file
 78 | .idea/caches/build_file_checksums.ser
 79 | 
 80 | ### JupyterNotebooks template
 81 | # gitignore template for Jupyter Notebooks
 82 | # website: http://jupyter.org/
 83 | 
 84 | .ipynb_checkpoints
 85 | */.ipynb_checkpoints/*
 86 | 
 87 | # IPython
 88 | profile_default/
 89 | ipython_config.py
 90 | 
 91 | # Remove previous ipynb_checkpoints
 92 | #   git rm -r .ipynb_checkpoints/
 93 | 
 94 | ### Python template
 95 | # Byte-compiled / optimized / DLL files
 96 | __pycache__/
 97 | *.py[cod]
 98 | *$py.class
 99 | 
100 | # C extensions
101 | *.so
102 | 
103 | # Distribution / packaging
104 | .Python
105 | build/
106 | develop-eggs/
107 | dist/
108 | downloads/
109 | eggs/
110 | .eggs/
111 | lib/
112 | lib64/
113 | parts/
114 | sdist/
115 | var/
116 | wheels/
117 | share/python-wheels/
118 | *.egg-info/
119 | .installed.cfg
120 | *.egg
121 | MANIFEST
122 | 
123 | # PyInstaller
124 | #  Usually these files are written by a python script from a template
125 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
126 | *.manifest
127 | *.spec
128 | 
129 | # Installer logs
130 | pip-log.txt
131 | pip-delete-this-directory.txt
132 | 
133 | # Unit test / coverage reports
134 | htmlcov/
135 | .tox/
136 | .nox/
137 | .coverage
138 | .coverage.*
139 | .cache
140 | nosetests.xml
141 | coverage.xml
142 | *.cover
143 | *.py,cover
144 | .hypothesis/
145 | .pytest_cache/
146 | cover/
147 | 
148 | # Translations
149 | *.mo
150 | *.pot
151 | 
152 | # Django stuff:
153 | *.log
154 | local_settings.py
155 | db.sqlite3
156 | db.sqlite3-journal
157 | 
158 | # Flask stuff:
159 | instance/
160 | .webassets-cache
161 | 
162 | # Scrapy stuff:
163 | .scrapy
164 | 
165 | # Sphinx documentation
166 | docs/_build/
167 | 
168 | # PyBuilder
169 | .pybuilder/
170 | target/
171 | 
172 | # Jupyter Notebook
173 | .ipynb_checkpoints
174 | 
175 | # IPython
176 | profile_default/
177 | ipython_config.py
178 | 
179 | # pyenv
180 | #   For a library or package, you might want to ignore these files since the code is
181 | #   intended to run in multiple environments; otherwise, check them in:
182 | # .python-version
183 | 
184 | # pipenv
185 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
186 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
187 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
188 | #   install all needed dependencies.
189 | #Pipfile.lock
190 | 
191 | # poetry
192 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
193 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
194 | #   commonly ignored for libraries.
195 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
196 | #poetry.lock
197 | 
198 | # pdm
199 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
200 | #pdm.lock
201 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
202 | #   in version control.
203 | #   https://pdm.fming.dev/#use-with-ide
204 | .pdm.toml
205 | 
206 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
207 | __pypackages__/
208 | 
209 | # Celery stuff
210 | celerybeat-schedule
211 | celerybeat.pid
212 | 
213 | # SageMath parsed files
214 | *.sage.py
215 | 
216 | # Environments
217 | .env
218 | .venv
219 | env/
220 | venv/
221 | ENV/
222 | env.bak/
223 | venv.bak/
224 | 
225 | # Spyder project settings
226 | .spyderproject
227 | .spyproject
228 | 
229 | # Rope project settings
230 | .ropeproject
231 | 
232 | # mkdocs documentation
233 | /site
234 | 
235 | # mypy
236 | .mypy_cache/
237 | .dmypy.json
238 | dmypy.json
239 | 
240 | # Pyre type checker
241 | .pyre/
242 | 
243 | # pytype static type analyzer
244 | .pytype/
245 | 
246 | # Cython debug symbols
247 | cython_debug/
248 | 
249 | # vscode
250 | .vscode
251 | 


--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
 1 | # default to true for all rules
 2 | default: true
 3 | 
 4 | # MD007/unordered-list-indent
 5 | MD007:
 6 |   indent: 4
 7 | 
 8 | # MD033/no-inline-html
 9 | MD033: false
10 | 
11 | # MD041/first-line-h1
12 | MD041: false
13 | 
14 | # MD013/line-length
15 | MD013: false
16 | 
17 | # MD024/no-duplicate-heading
18 | MD024:
19 |   # Allow when nested under different parents e.g. CHANGELOG.md
20 |   siblings_only: true
21 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | fail_fast: false
 2 | default_language_version:
 3 |   python: python3
 4 | default_stages:
 5 |   - pre-commit
 6 |   - pre-push
 7 | minimum_pre_commit_version: 2.16.0
 8 | ci:
 9 |   autoupdate_schedule: quarterly
10 | repos:
11 |   - repo: https://github.com/asottile/blacken-docs
12 |     rev: 1.19.1
13 |     hooks:
14 |       - id: blacken-docs
15 | 
16 |   - repo: https://github.com/pre-commit/mirrors-prettier
17 |     rev: v4.0.0-alpha.8
18 |     hooks:
19 |       - id: prettier
20 |         types: [yaml]
21 | 
22 |   - repo: https://github.com/executablebooks/mdformat
23 |     rev: 0.7.22
24 |     hooks:
25 |       - id: mdformat
26 |         additional_dependencies:
27 |           - mdformat-mkdocs
28 |         exclude: |
29 |           (?x)^(
30 |             \.github/.*\.md
31 |           )$
32 | 
33 |   - repo: https://github.com/igorshubovych/markdownlint-cli
34 |     rev: v0.44.0
35 |     hooks:
36 |       - id: markdownlint-fix
37 |         exclude: |
38 |           (?x)^(
39 |             \.github/.*\.md
40 |           )$
41 | 
42 |   - repo: https://github.com/astral-sh/ruff-pre-commit
43 |     rev: v0.11.9
44 |     hooks:
45 |       - id: ruff
46 |         args: [--fix, --exit-non-zero-on-fix]
47 |       - id: ruff-format
48 | 
49 |   - repo: https://github.com/pre-commit/pre-commit-hooks
50 |     rev: v5.0.0
51 |     hooks:
52 |       - id: detect-private-key
53 |       - id: check-ast
54 |       - id: end-of-file-fixer
55 |       - id: mixed-line-ending
56 |         args: [--fix=lf]
57 |       - id: trailing-whitespace
58 |       - id: check-case-conflict
59 | 
60 |   - repo: local
61 |     hooks:
62 |       - id: forbid-to-commit
63 |         name: Don't commit rej files
64 |         entry: |
65 |           Cannot commit .rej files. These indicate merge conflicts that arise during automated template updates.
66 |           Fix the merge conflicts manually and remove the .rej files.
67 |         language: fail
68 |         files: '.*\.rej$'
69 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 JianYang-Lab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # gsMap
  2 | 
  3 | |               |                                                                                                      |                |                                                                                                    |
  4 | | ------------- | ---------------------------------------------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------- |
  5 | | __Version__   | [![PyPI version][pypi-badge]][pypi-url] [![Python][python-badge]][python-url]                        | __Status__     | [![Project Status][status-badge]][status-url] [![Maintenance][maintenance-badge]][maintenance-url] |
  6 | | __Activity__  | [![GitHub commits][commits-badge]][commits-url] [![Last Commit][last-commit-badge]][last-commit-url] | __Quality__    | [![codecov][codecov-badge]][codecov-url] [![Ruff][ruff-badge]][ruff-url]                           |
  7 | | __CI/CD__     | [![Docs][docs-badge]][docs-url] [![test][test-badge]][test-url]                                      | __Community__  | [![GitHub stars][stars-badge]][stars-url] [![GitHub forks][forks-badge]][forks-url]                |
  8 | | __Downloads__ | [![Downloads][downloads-badge]][downloads-url]                                                       | __License__    | [![License: MIT][license-badge]][license-url] [![DOI][doi-badge]][doi-url]                         |
  9 | | __Platform__  | [![Linux][linux-badge]][linux-url]                                                                   | __Contribute__ | [![Issues][issues-badge]][issues-url] [![PRs Welcome][pr-badge]][pr-url]                           |
 10 | 
 11 | ## Introduction
 12 | 
 13 | `gsMap` (genetically informed spatial mapping of cells for complex traits)
 14 | integrates spatial transcriptomics (ST) data with genome-wide association study (GWAS)
 15 | summary statistics to map cells to human complex traits, including diseases,
 16 | in a spatially resolved manner.
 17 | 
 18 | ## Key Features
 19 | 
 20 | - __Spatially-aware High-Resolution Trait Mapping__
 21 | - __Spatial Region Identification__
 22 | - __Putative Causal Genes Identification__
 23 | 
 24 | ![Model Architecture](schematic.png)
 25 | 
 26 | ## Installation
 27 | 
 28 | Install using pip:
 29 | 
 30 | ```bash
 31 | conda create -n gsMap python>=3.10
 32 | conda activate gsMap
 33 | pip install gsMap
 34 | ```
 35 | 
 36 | Install using conda:
 37 | 
 38 | ```bash
 39 | conda create -n gsMap python>=3.10
 40 | conda activate gsMap
 41 | conda install bioconda::gsmap
 42 | ```
 43 | 
 44 | Install from source:
 45 | 
 46 | ```bash
 47 | git clone https://github.com/JianYang-Lab/gsMap
 48 | cd gsMap
 49 | pip install -e .
 50 | ```
 51 | 
 52 | Verify the installation by running the following command:
 53 | 
 54 | ```bash
 55 | gsmap --help
 56 | ```
 57 | 
 58 | ## Usage
 59 | 
 60 | Please check out the documentation and tutorials at [gsMap Documentation](https://yanglab.westlake.edu.cn/gsmap/document/software).
 61 | 
 62 | ## Online Visualization
 63 | 
 64 | To visualize the traits-cell association spatial maps,
 65 | please refer to [gsMap Visualization](https://yanglab.westlake.edu.cn/gsmap/visualize).
 66 | 
 67 | ## Citation
 68 | 
 69 | Song, L., Chen, W., Hou, J., Guo, M. & Yang, J.
 70 | [Spatially resolved mapping of cells associated with human complex traits.](https://doi.org/10.1038/s41586-025-08757-x)
 71 | Nature (2025).
 72 | 
 73 | Please cite the paper and give us a STAR if you find gsMap useful for your research.
 74 | 
 75 | <!-- Badge links -->
 76 | 
 77 | [codecov-badge]: https://codecov.io/gh/JianYang-Lab/gsMap/graph/badge.svg?token=NFZFXZIEUU
 78 | [codecov-url]: https://codecov.io/gh/JianYang-Lab/gsMap
 79 | [commits-badge]: https://img.shields.io/github/commit-activity/m/JianYang-Lab/gsMap
 80 | [commits-url]: https://github.com/JianYang-Lab/gsMap/commits/main
 81 | [docs-badge]: https://github.com/JianYang-Lab/gsMap/actions/workflows/docs.yml/badge.svg
 82 | [docs-url]: https://github.com/JianYang-Lab/gsMap/actions/workflows/docs.yml
 83 | [doi-badge]: https://img.shields.io/badge/DOI-10.1038%2Fs41586--025--08757--x-blue
 84 | [doi-url]: https://doi.org/10.1038/s41586-025-08757-x
 85 | [downloads-badge]: https://static.pepy.tech/badge/gsMap
 86 | [downloads-url]: https://pepy.tech/project/gsMap
 87 | [forks-badge]: https://img.shields.io/github/forks/JianYang-Lab/gsMap
 88 | [forks-url]: https://github.com/JianYang-Lab/gsMap/network/members
 89 | [issues-badge]: https://img.shields.io/github/issues/JianYang-Lab/gsMap
 90 | [issues-url]: https://github.com/JianYang-Lab/gsMap/issues
 91 | [last-commit-badge]: https://img.shields.io/github/last-commit/JianYang-Lab/gsMap
 92 | [last-commit-url]: https://github.com/JianYang-Lab/gsMap/commits/main
 93 | [license-badge]: https://img.shields.io/badge/License-MIT-yellow.svg
 94 | [license-url]: https://opensource.org/licenses/MIT
 95 | [linux-badge]: https://img.shields.io/badge/Linux-%E2%9C%93-success
 96 | [linux-url]: https://github.com/JianYang-Lab/gsMap/actions/workflows/test_linux.yml
 97 | [maintenance-badge]: https://img.shields.io/badge/Maintained%3F-yes-green.svg
 98 | [maintenance-url]: https://github.com/JianYang-Lab/gsMap/graphs/commit-activity
 99 | [pr-badge]: https://img.shields.io/badge/PRs-welcome-brightgreen.svg
100 | [pr-url]: https://github.com/JianYang-Lab/gsMap/pulls
101 | [pypi-badge]: https://img.shields.io/pypi/v/gsMap
102 | [pypi-url]: https://pypi.org/project/gsMap/
103 | [python-badge]: https://img.shields.io/pypi/pyversions/gsMap
104 | [python-url]: https://www.python.org
105 | [ruff-badge]: https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json
106 | [ruff-url]: https://github.com/astral-sh/ruff
107 | [stars-badge]: https://img.shields.io/github/stars/JianYang-Lab/gsMap
108 | [stars-url]: https://github.com/JianYang-Lab/gsMap/stargazers
109 | [status-badge]: https://www.repostatus.org/badges/latest/active.svg
110 | [status-url]: https://www.repostatus.org/#active
111 | [test-badge]: https://github.com/JianYang-Lab/gsMap/actions/workflows/test_linux.yml/badge.svg
112 | [test-url]: https://github.com/JianYang-Lab/gsMap/actions/workflows/test_linux.yml
113 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | codecov:
 2 |   require_ci_to_pass: yes
 3 | 
 4 | coverage:
 5 |   precision: 2
 6 |   round: down
 7 |   range: "70...100"
 8 |   status:
 9 |     project:
10 |       default:
11 |         target: 70%
12 |         threshold: 1%
13 |     patch:
14 |       default:
15 |         target: auto
16 |         threshold: 10%
17 | 
18 | parsers:
19 |   gcov:
20 |     branch_detection:
21 |       conditional: yes
22 |       loop: yes
23 |       method: no
24 |       macro: no
25 | 
26 | comment:
27 |   layout: "reach,diff,flags,files,footer"
28 |   behavior: default
29 |   require_changes: no
30 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==6
2 | gsMap[doc]
3 | 


--------------------------------------------------------------------------------
/docs/source/10x.md:
--------------------------------------------------------------------------------
  1 | # Cases on 10x Visium Data
  2 | 
  3 | Here we provide case applications based on 10x Visium data (which are not at single-cell resolution). For convenience, we used the `Quick Mode` here, but you can also follow the {doc}`Step by Step <step_by_step>` Guide to analyze 10x Visium data—the steps are the same.
  4 | 
  5 | A frequently asked question is how to provide annotations for 10x Visium data. Note that gsMap can run without annotations. The most convenient approaches are to either leave the `annotation` parameter unset (in {doc}`Step by Step <step_by_step>`) or provide annotations from spatial clustering methods, such as [SpaGCN](https://github.com/jianhuupenn/SpaGCN).
  6 | 
  7 | ## Preparation
  8 | 
  9 | Make sure you have {doc}`installed <install>` the `gsMap` package before proceeding.
 10 | 
 11 | ### 1. Download Dependencies
 12 | 
 13 | The `gsMap` package in quick mode requires the following resources:
 14 | 
 15 | - **Gene transfer format (GTF) file**, for gene coordinates on the genome.
 16 | - **LD reference panel**, in quick mode, we provide a pre-built LD score snp-by-gene matrix based on 1000G_EUR_Phase3.
 17 | - **SNP weight file**, to adjust correlations between SNP-trait association statistics.
 18 | - **Homologous gene transformations file** (optional), to map genes between species.
 19 | 
 20 | To download all the required files:
 21 | 
 22 | ```bash
 23 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_resource.tar.gz
 24 | tar -xvzf gsMap_resource.tar.gz
 25 | ```
 26 | 
 27 | Directory structure:
 28 | 
 29 | ```bash
 30 | tree -L 2
 31 | 
 32 | gsMap_resource
 33 |     ├── genome_annotation
 34 |     │   ├── enhancer
 35 |     │   └── gtf
 36 |     ├── homologs
 37 |     │   ├── macaque_human_homologs.txt
 38 |     │   └── mouse_human_homologs.txt
 39 |     ├── LD_Reference_Panel
 40 |     │   └── 1000G_EUR_Phase3_plink
 41 |     ├── LDSC_resource
 42 |     │   ├── hapmap3_snps
 43 |     │   └── weights_hm3_no_hla
 44 |     └── quick_mode
 45 |         ├── baseline
 46 |         ├── SNP_gene_pair
 47 |         └── snp_gene_weight_matrix.h5ad
 48 | ```
 49 | 
 50 | ### 2. Download Example Data
 51 | 
 52 | You can download the example 10x Visium data as follows:
 53 | 
 54 | ```bash
 55 | wget https://yanglab.westlake.edu.cn/data/gsMap/Visium_example_data.tar.gz
 56 | tar -xvzf Visium_example_data.tar.gz
 57 | ```
 58 | 
 59 | Directory structure:
 60 | 
 61 | ```bash
 62 | tree -L 2
 63 | 
 64 | Visium_example_data/
 65 | ├── GWAS
 66 | │   ├── IQ_NG_2018.sumstats.gz
 67 | │   └── Serum_creatinine.sumstats.gz
 68 | └── ST
 69 |     ├── V1_Adult_Mouse_Brain_Coronal_Section.h5ad
 70 |     ├── V1_Mouse_Brain_Sagittal_Posterior_Section.h5ad
 71 |     └── V1_Mouse_Kidney.h5ad
 72 | ```
 73 | 
 74 | ## Case1
 75 | 
 76 | Data: Visium data of adult mouse coronal section
 77 | Trait: IQ
 78 | <span style="color:#31a354"> Required memory: 11G (2902 cells) </span>
 79 | 
 80 | ```bash
 81 | gsmap quick_mode \
 82 |     --workdir './example_quick_mode/Visium' \
 83 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
 84 |     --sample_name 'V1_Adult_Mouse_Brain_Coronal_Section' \
 85 |     --gsMap_resource_dir 'gsMap_resource' \
 86 |     --hdf5_path 'Visium_example_data/ST/V1_Adult_Mouse_Brain_Coronal_Section.h5ad' \
 87 |     --annotation 'domain' \
 88 |     --data_layer 'count' \
 89 |     --sumstats_file 'Visium_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
 90 |     --trait_name 'IQ'
 91 | ```
 92 | 
 93 | [gsMap report](https://yanglab.westlake.edu.cn/data/gsMap/Visium_report/coronal/V1_Adult_Mouse_Brain_Coronal_Section_IQ_gsMap_Report.html) for the `IQ` on the adult mouse coronal section Visium data.
 94 | 
 95 | ## Case2
 96 | 
 97 | Data: Visium data of adult mouse sigital section
 98 | Trait: IQ
 99 | 
100 | <span style="color:#31a354"> Required memory: 12G (3289 cells) </span>
101 | 
102 | ```bash
103 | gsmap quick_mode \
104 |     --workdir './example_quick_mode/Visium' \
105 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
106 |     --sample_name 'V1_Mouse_Brain_Sagittal_Posterior_Section' \
107 |     --gsMap_resource_dir 'gsMap_resource' \
108 |     --hdf5_path 'Visium_example_data/ST/V1_Mouse_Brain_Sagittal_Posterior_Section.h5ad' \
109 |     --annotation 'domain' \
110 |     --data_layer 'count' \
111 |     --sumstats_file 'Visium_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
112 |     --trait_name 'IQ'
113 | ```
114 | 
115 | [gsMap report](https://yanglab.westlake.edu.cn/data/gsMap/Visium_report/saggital/V1_Mouse_Brain_Sagittal_Posterior_Section_IQ_gsMap_Report.html) for the `IQ` on the adult mouse sigital section Visium data.
116 | 
117 | ## Case3
118 | 
119 | Data: Visium data of adult mouse kindey
120 | Trait: Serum creatinine
121 | 
122 | <span style="color:#31a354"> Required memory: 8G (1437 cells) </span>
123 | 
124 | ```bash
125 | gsmap quick_mode \
126 |     --workdir './example_quick_mode/Visium' \
127 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
128 |     --sample_name 'V1_Mouse_Kidney' \
129 |     --gsMap_resource_dir 'gsMap_resource' \
130 |     --hdf5_path 'Visium_example_data/ST/V1_Mouse_Kidney.h5ad' \
131 |     --annotation 'domain' \
132 |     --data_layer 'count' \
133 |     --sumstats_file 'Visium_example_data/GWAS/Serum_creatinine.sumstats.gz' \
134 |     --trait_name 'Serum_creatinine'
135 | ```
136 | 
137 | [gsMap report](https://yanglab.westlake.edu.cn/data/gsMap/Visium_report/Serum_creatinine/V1_Mouse_Kidney_Serum_creatinine_gsMap_Report.html) for the `Serum creatinine` on the adult mouse kindey Visium data.
138 | 


--------------------------------------------------------------------------------
/docs/source/advanced_usage.md:
--------------------------------------------------------------------------------
  1 | # gsMap Advanced Usage
  2 | 
  3 | ## Using Customized Latent Representations
  4 | 
  5 | ```bash
  6 | 
  7 | # This could be any key in the obsm field of the AnnData object.
  8 | latent_customized = 'latent_customized'
  9 | gsmap run_latent_to_gene \
 10 |     --input_hdf5_path 'sample1.h5ad' \
 11 |     --workdir './workdir' \
 12 |     --sample_name 'sample1' \
 13 |     --annotation 'annotation' \
 14 |     --latent_representation $latent_customized
 15 | ```
 16 | 
 17 | ## Conditional Analysis
 18 | 
 19 | **Objective**: Perform conditional analysis by adjusting for other functional annotations or cell-type-level annotations.
 20 | 
 21 | This step extends `step 3: generate ldscore`, by adding additional functional annotations to the baseline with the aim of conducting a conditional analysis. The directory of additional annotations can be specified using the parameter `--additional_baseline_annotation`. The other steps are same to the tutorials above.
 22 | 
 23 | Download the additional annotations:
 24 | 
 25 | ```bash
 26 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_additional_annotation.tar.gz
 27 | tar -xvzf gsMap_additional_annotation.tar.gz
 28 | ```
 29 | 
 30 | The format of the additional annotation files is such that each line represents a SNP, with columns indicating the annotation values for that SNP. These values can be either binary or continuous.
 31 | 
 32 | ```bash
 33 | zless -S gsMap_additional_annotation/baseline.1.annot.gz
 34 | ```
 35 | 
 36 | **Execution**: <span style="color:#31a354"> required memory: ~50G </span>
 37 | 
 38 | ```bash
 39 | for CHROM in {1..22}
 40 | do
 41 |     gsmap run_generate_ldscore \
 42 |         --workdir './example/Mouse_Embryo' \
 43 |         --sample_name 'E16.5_E1S1.MOSTA' \
 44 |         --chrom $CHROM \
 45 |         --bfile_root 'gsMap_resource/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC' \
 46 |         --keep_snp_root 'gsMap_resource/LDSC_resource/hapmap3_snps/hm' \
 47 |         --gtf_annotation_file 'gsMap_resource/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf' \
 48 |         --gene_window_size 50000 \
 49 |         --additional_baseline_annotation 'gsMap_additional_annotation'
 50 | done
 51 | ```
 52 | 
 53 | ## gsMap on Biological Replicates
 54 | 
 55 | **Objective**: When multiple biological replicates are available, a uniform slice mean can be calculated for the gene ranks across the samples. This slice mean rank can then be used to compute the GSS. This approach ensures more consistent and comparable results across different samples.
 56 | 
 57 | ### Calculate the Slice Mean
 58 | 
 59 | To generate the slice mean, use the `create_slice_mean` command. This command outputs a parquet file containing the slice mean ranks for each gene. The `--sample_name_list` parameter specifies the names of the samples, and the `--h5ad_list` parameter provides the paths to the AnnData objects for each sample.
 60 | 
 61 | ```bash
 62 | gsmap create_slice_mean \
 63 |     --sample_name_list 'E16.5_E1S1.MOSTA' 'E16.5_E2S11.MOSTA' \
 64 |     --h5ad_list 'gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad' 'gsMap_example_data/ST/E16.5_E2S11.MOSTA.h5ad' \
 65 |     --slice_mean_output_file './workdir/sample_slice_mean.parquet' \
 66 |     --data_layer 'count' \
 67 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt'
 68 | ```
 69 | 
 70 | ### Use the Slice Mean in Quick Mode
 71 | 
 72 | The `quick_mode` command allows you to run the entire pipeline in a single step using the slice mean. The `--gM_slices` parameter specifies the path to the slice mean file generated in the previous step.
 73 | 
 74 | ```bash
 75 | # For the 'E16.5_E1S1.MOSTA' sample (the same applies to 'E16.5_E2S11.MOSTA')
 76 | gsmap quick_mode \
 77 |     --workdir './workdir' \
 78 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
 79 |     --sample_name 'E16.5_E1S1.MOSTA' \
 80 |     --gsMap_resource_dir 'gsMap_resource' \
 81 |     --hdf5_path 'gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad' \
 82 |     --annotation 'annotation' \
 83 |     --data_layer 'count' \
 84 |     --sumstats_file 'gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
 85 |     --trait_name 'IQ' \
 86 |     --gM_slices './workdir/sample_slice_mean.parquet'
 87 | ```
 88 | 
 89 | ### Use the Slice Mean in Step-by-Step Mode
 90 | 
 91 | To incorporate the slice mean into the step-by-step pipeline, provide the slice mean file using the `--gM_slices` parameter in the `run_latent_to_gene` command. This enables the computation of gene specificity scores based on the slice mean.
 92 | 
 93 | ```bash
 94 | # For the 'E16.5_E1S1.MOSTA' sample (the same applies to 'E16.5_E2S11.MOSTA')
 95 | gsmap run_latent_to_gene \
 96 |     --workdir './workdir' \
 97 |     --sample_name 'E16.5_E1S1.MOSTA' \
 98 |     --annotation 'annotation' \
 99 |     --latent_representation 'latent_GVAE' \
100 |     --num_neighbour 51 \
101 |     --num_neighbour_spatial 201 \
102 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
103 |     --gM_slices './workdir/sample_slice_mean.parquet'
104 | ```
105 | 
106 | ### Cauchy combination for multiple samples
107 | 
108 | Use the `run_cauchy_combination` command to aggregate the spot p-values for the same annotation across multiple samples.
109 | 
110 | ```bash
111 | gsmap run_cauchy_combination \
112 |     --workdir './workdir' \
113 |     --sample_name_list 'E16.5_E1S1.MOSTA' 'E16.5_E2S11.MOSTA' \
114 |     --trait_name 'IQ' \
115 |     --annotation 'annotation' \
116 |     --output_file './workdir/combined_IQ_cauchy_combination.csv.gz'
117 | ```
118 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | .. _api-documentation:
 2 | 
 3 | gsMap Command Line Alphabets
 4 | ==============================
 5 | 
 6 | 
 7 | .. program:: gsmap
 8 | 
 9 | Synopsis
10 | --------
11 | 
12 | .. code-block:: shell
13 | 
14 |     usage: gsmap [-h] [--version] {run_find_latent_representations,run_latent_to_gene,run_generate_ldscore,run_spatial_ldsc,run_cauchy_combination,run_report,format_sumstats,quick_mode} ...
15 | 
16 | Description
17 | -----------
18 | 
19 | gsmap: genetically informed spatial mapping of cells for complex traits.
20 | 
21 | Options
22 | -------
23 | 
24 | .. option:: -h, --help
25 | 
26 |     Show this help message and exit.
27 | 
28 | .. option:: --version, -v
29 | 
30 |     Show program's version number and exit.
31 | 
32 | Subcommands
33 | -----------
34 | 
35 | .. option:: format_sumstats
36 | 
37 |     Convert GWAS summary statistics into the format that gsMap can recognize.
38 | 
39 | .. option:: run_find_latent_representations
40 | 
41 |     Find the latent representations of each spot by running GNN-VAE.
42 | 
43 | .. option:: run_latent_to_gene
44 | 
45 |     Generate gene specificity scores (GSS) for each spot.
46 | 
47 | .. option:: run_generate_ldscore
48 | 
49 |     Generate LD scores for each spot.
50 | 
51 | .. option:: run_spatial_ldsc
52 | 
53 |     Perform LDSC for each spot.
54 | 
55 | .. option:: run_cauchy_combination
56 | 
57 |     Perform cauchy combination test for each annotation.
58 | 
59 | .. option:: run_report
60 | 
61 |     Generate gsMap report.
62 | 
63 | .. option:: quick_mode
64 | 
65 |     Run entire gsMap Pipeline.
66 | 
67 | .. option:: create_slice_mean
68 | 
69 |     Create Slice Mean using multiple slices.
70 | 
71 | -----
72 | 
73 | .. toctree::
74 |     :maxdepth: 1
75 |     :caption: Subcommands Documentation
76 | 
77 |     api/format_sumstats
78 |     api/quick_mode
79 |     api/find_latent_representations
80 |     api/latent_to_gene
81 |     api/generate_ldscore
82 |     api/spatial_ldsc
83 |     api/cauchy_combination
84 |     api/report
85 |     api/create_slice_mean
86 | 


--------------------------------------------------------------------------------
/docs/source/api/cauchy_combination.rst:
--------------------------------------------------------------------------------
1 | Step 5: cauchy_combination (optional)
2 | =====================================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_cauchy_combination
9 | 


--------------------------------------------------------------------------------
/docs/source/api/create_slice_mean.rst:
--------------------------------------------------------------------------------
1 | Create Slice Mean
2 | ====================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: create_slice_mean
9 | 


--------------------------------------------------------------------------------
/docs/source/api/find_latent_representations.rst:
--------------------------------------------------------------------------------
1 | Step 1: find_latent_representation
2 | ==================================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_find_latent_representations
9 | 


--------------------------------------------------------------------------------
/docs/source/api/format_sumstats.rst:
--------------------------------------------------------------------------------
1 | format_sumstats (tips)
2 | ==================================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: format_sumstats
9 | 


--------------------------------------------------------------------------------
/docs/source/api/generate_ldscore.rst:
--------------------------------------------------------------------------------
1 | Step 3: generate_ldscore
2 | ========================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_generate_ldscore
9 | 


--------------------------------------------------------------------------------
/docs/source/api/latent_to_gene.rst:
--------------------------------------------------------------------------------
1 | Step 2: latent_to_gene
2 | ======================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_latent_to_gene
9 | 


--------------------------------------------------------------------------------
/docs/source/api/quick_mode.rst:
--------------------------------------------------------------------------------
1 | quick_mode (run entire pipeline)
2 | ==================================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: quick_mode
9 | 


--------------------------------------------------------------------------------
/docs/source/api/report.rst:
--------------------------------------------------------------------------------
1 | Step 6: gsMap report (optional)
2 | ===============================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_report
9 | 


--------------------------------------------------------------------------------
/docs/source/api/spatial_ldsc.rst:
--------------------------------------------------------------------------------
1 | Step 4: spatial_ldsc
2 | ====================
3 | 
4 | .. argparse::
5 |    :module: gsMap.main
6 |    :func: create_parser
7 |    :prog: gsmap
8 |    :path: run_spatial_ldsc
9 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | project = "gsMap"
 2 | copyright = "2024, Liyang, Wenhao"
 3 | author = "Liyang, Wenhao"
 4 | # release = gsMap.__version__
 5 | 
 6 | # -- General configuration ---------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 8 | 
 9 | 
10 | extensions = [
11 |     "sphinx.ext.autodoc",
12 |     "sphinx.ext.autosummary",
13 |     "sphinx.ext.intersphinx",
14 |     "sphinx.ext.napoleon",
15 |     "sphinx.ext.viewcode",
16 |     "sphinx.ext.mathjax",
17 |     "sphinx_autodoc_typehints",
18 |     "sphinx_copybutton",
19 |     "sphinx.ext.viewcode",
20 |     "sphinxarg.ext",
21 |     "nbsphinx",
22 |     "myst_parser",
23 |     # "sphinx_charts.charts",
24 |     "sphinxcontrib.jquery",
25 |     "sphinx_inline_tabs",
26 | ]
27 | 
28 | exclude_patterns = []
29 | 
30 | 
31 | # -- Options for HTML output -------------------------------------------------
32 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
33 | 
34 | # html_theme = 'alabaster'
35 | # html_theme = 'classic'
36 | # html_theme = 'sphinx_rtd_theme'
37 | # html_theme = "pydata_sphinx_theme"
38 | html_theme = "furo"
39 | html_static_path = ["_static"]
40 | templates_path = ["_templates"]
41 | 
42 | html_theme_options = {
43 |     # "light_css_variables": {
44 |     #     "color-brand-primary": "#7C4DFF",
45 |     #     "color-brand-content": "#7C4DFF",
46 |     #     "color-code-background": "#f5f5f5",
47 |     # },
48 | }
49 | 
50 | # add plotly.js to the build
51 | # html_js_files = [
52 | #     "https://cdn.plot.ly/plotly-latest.min.js",
53 | # ]
54 | 


--------------------------------------------------------------------------------
/docs/source/data.rst:
--------------------------------------------------------------------------------
 1 | gsMap Resources Download
 2 | =========================
 3 | 
 4 | Download gsMap Running Resources
 5 | ----------------------------------
 6 | 
 7 | You can download the necessary resources for running gsMap from the following link:
 8 | 
 9 | .. _gsMap running dependencies: https://yanglab.westlake.edu.cn/data/gsMap/gsMap_resource.tar.gz
10 | 
11 | `gsMap running dependencies <https://yanglab.westlake.edu.cn/data/gsMap/gsMap_resource.tar.gz>`_
12 | 
13 | 
14 | - **LD Reference Panel**:
15 | 
16 |   - The 1000 Genomes Project Phase 3 panel.
17 | 
18 | - **Gene Annotation**:
19 | 
20 |   - **GTF File**: Contains the gene transcript coordinates.
21 | 
22 |   - **Enhancer Data**: Enhancer annotation by each tissue.
23 | 
24 | - **LDSC Resources**
25 | 
26 | - **Homologous Gene Transformations Data**:
27 | 
28 |   - Derived from the biomaRt (V3.18) R package, this data is used to transform gene names across species.
29 | 
30 | .. note::
31 |    These data are curated to ensure compatibility and ease of use. You can use your own reference panel and gene annotation data, but it is important to ensure that the data is compatible with the gsMap pipeline.
32 | 
33 | .. _download-example-data:
34 | 
35 | Download Example Data
36 | -----------------------------------
37 | 
38 | To illustrate how to use gsMap, we provide some example GWAS and ST data. You can download the example data from the following link:
39 | 
40 | `gsMap Example Data <https://yanglab.westlake.edu.cn/data/gsMap/gsMap_example_data.tar.gz>`_
41 | 
42 | .. _data-availability:
43 | 
44 | Data Availability
45 | -------------------
46 | .. list-table::
47 |    :header-rows: 1
48 | 
49 |    * - Category
50 |      - Description
51 |    * - GWAS Data
52 |      - - The 110 GWAS summary statistics used in the paper in in Supplementary Table.1
53 |        - You could download the GWAS summary statistics from the GWAS catalog: `https://www.ebi.ac.uk/gwas/ <https://www.ebi.ac.uk/gwas/>`_
54 |    * - ST Data
55 |      - - Mouse embryo and brain ST datasets: `https://db.cngb.org/search/project/CNP0001543/ <https://db.cngb.org/search/project/CNP0001543/>`_
56 |        - Human embryo ST datasets: `https://ngdc.cncb.ac.cn/gsa-human/browse/HRA005567 <https://ngdc.cncb.ac.cn/gsa-human/browse/HRA005567>`_
57 |        - Macaque cortex ST datasets: `https://db.cngb.org/search/project/CNP0002035/ <https://db.cngb.org/search/project/CNP0002035/>`_
58 |        - Human DLPFC ST datasets: `https://research.libd.org/globus/ <https://research.libd.org/globus/>`_
59 |    * - LDSC Data
60 |      - - LD reference panel from 1000 Genomes Project Phase 3: `ftp://ftp.1000genomes.ebi.ac.uk <ftp://ftp.1000genomes.ebi.ac.uk>`_
61 |        - LDSC baseline annotations: `https://data.broadinstitute.org/alkesgroup/LDSCORE <https://data.broadinstitute.org/alkesgroup/LDSCORE>`_
62 |    * - Enhancer Data
63 |      - - Enhancer data from ABC and Roadmap are curated and available at: `https://github.com/kkdey/GSSG/ <https://github.com/kkdey/GSSG/>`_
64 | 


--------------------------------------------------------------------------------
/docs/source/data_format.md:
--------------------------------------------------------------------------------
 1 | (data-format)=
 2 | 
 3 | # Data Format
 4 | 
 5 | ## ST Data
 6 | 
 7 | The input ST data must be an h5ad file containing at least the gene expression matrix and spatial coordinates. The gene expression matrix should be stored in the `layers` attribute, and the spatial coordinates should be in the obsm `attribute` with the key `spatial`. Optionally, the h5ad file may include spot (cell) annotations in the obs attribute.
 8 | 
 9 | ```python
10 | import scanpy as sc
11 | 
12 | adata = sc.read_h5ad("gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad")
13 | 
14 | print(adata.layers["count"].shape)
15 | print(adata.obsm["spatial"].shape)
16 | print(adata.obs["annotation"].value_counts().head())
17 | ```
18 | 
19 | ## GWAS Data
20 | 
21 | The input GWAS data is a text file containing at least the columns for SNP (rs number), Z (Z-statistics), and N (sample size). Column headers are keywords used by gsMap.
22 | 
23 | ```shell
24 | zcat gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz | head -n 5
25 | 
26 | SNP  A1 A2 Z N
27 | rs12184267 T C 0.916 225955
28 | rs12184277 G A 0.656 226215
29 | rs12184279 A C 1.050 226224
30 | rs116801199 T G 0.300 226626
31 | ```
32 | 
33 | ### How to format the GWAS data
34 | 
35 | You can convert GWAS summary data into the required format using custom code. For convenience, gsMap provides a command to do this. Below is an example of how to use the command.
36 | 
37 | Download the human height GWAS data and decompress it.
38 | 
39 | ```bash
40 | wget https://portals.broadinstitute.org/collaboration/giant/images/4/4e/GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
41 | 
42 | gzip -d GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL.gz
43 | ```
44 | 
45 | Convert the summary statistics to the required format.
46 | 
47 | ```bash
48 | gsmap format_sumstats \
49 | --sumstats 'GIANT_HEIGHT_YENGO_2022_GWAS_SUMMARY_STATS_ALL' \
50 | --out 'HEIGHT'
51 | ```
52 | 
53 | You will obtain a file named HEIGHT.sumstats.gz
54 | 
55 | ```bash
56 | zcat HEIGHT.sumstats.gz | head -n 5
57 | 
58 | SNP  A1 A2 Z N
59 | rs3131969 G A 0.328 1494218.000
60 | rs3131967 C T 0.386 1488150.000
61 | rs12562034 A G 1.714 1554976.000
62 | rs4040617 G A -0.463 1602016.000
63 | ```
64 | 
65 | For more usage options, please refer to:
66 | 
67 | ```bash
68 | gsMap format_sumstats -h
69 | ```
70 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to gsMap's documentation!
 2 | ===================================
 3 | 
 4 | 
 5 | Introduction
 6 | ------------
 7 | 
 8 | ``gsMap`` (genetically informed spatial mapping of cells for complex traits) integrates spatial transcriptomics (ST) data with genome-wide association study (GWAS) summary statistics to map cells to human complex traits, including diseases, in a spatially resolved manner.
 9 | 
10 | 
11 | How to Cite
12 | ------------
13 | If you use ``gsMap`` in your studies, please cite:
14 | 
15 |    Song, L., Chen, W., Hou, J., Guo, M. & Yang, J. `Spatially resolved mapping of cells associated with human complex traits <https://doi.org/10.1038/s41586-025-08757-x>`_. Nature (2025).
16 | 
17 | Key Features
18 | ------------
19 | 
20 | - **Spatially-aware High-Resolution Trait Mapping**: Maps trait-associated cells at single-cell resolution, offering insights into their spatial distributions.
21 | - **Spatial Region Identification**: Aggregates trait-cell association p-values into trait-tissue region association p-values, prioritizing tissue regions relevant to traits of interest.
22 | - **Putative Causal Genes Identification**: Prioritizes putative causal genes by associating gene expression levels with cell-trait relevance.
23 | 
24 | 
25 | Overview of ``gsMap`` Method
26 | -----------------------------
27 | 
28 | ``gsMap`` operates on a four-step process:
29 | 
30 | 1. **Gene Specificity Assessment in Spatial Contexts**: To address technical noise and capture spatial correlations of gene expression profiles in ST data, ``gsMap`` leverages GNNs to identify homogeneous spots for each spot and estimates gene specificity scores by aggregating information from those homogeneous spots.
31 | 2. **Linking Gene Specificity to SNPs**: ``gsMap`` assigns gene specificity scores to single nucleotide polymorphisms (SNPs) based on their proximity to gene transcription start sites (TSS) and SNP-to-gene epigenetic linking maps.
32 | 3. **Spatial S-LDSC**: To estimate the relevance of spots to traits, ``gsMap`` associates stratified LD scores of individual spots with GWAS summary statistics using the S-LDSC framework.
33 | 4. **Spatial Region Identification**: To evaluate the association of a specific spatial region with traits, ``gsMap`` employs the Cauchy combination test to aggregate p-values from individual spots within that spatial region.
34 | 
35 | .. image:: _static/schematic.svg
36 |    :width: 600
37 |    :alt: Model architecture
38 | 
39 | Schmatics of ``gsMap`` method. For more details about the ``gsMap``, please check out our `publication <URL>`__.
40 | 
41 | Installation
42 | ------------
43 | 
44 | ``gsMap`` is available on `gsMap GitHub <https://github.com/JianYang-Lab/gsMap>`__.
45 | 
46 | 
47 | How to install ``gsMap``, check out the `installation guide <install.rst>`__
48 | 
49 | Tutorials
50 | ---------
51 | How to use ``gsMap``, check out the `tutorials <tutorials.rst>`__
52 | 
53 | 
54 | Online Analysis Service (coming soon)
55 | --------------------------------------
56 | Users could upload their own GWAS summary statistics data to perform the analysis.
57 | 
58 | 
59 | .. toctree::
60 |     :maxdepth: 2
61 |     :caption: Contents:
62 | 
63 |     install
64 |     tutorials
65 |     data
66 |     api
67 |     release
68 | 


--------------------------------------------------------------------------------
/docs/source/install.rst:
--------------------------------------------------------------------------------
 1 | .. _installation:
 2 | 
 3 | Installation guide
 4 | ==================
 5 | 
 6 | 
 7 | The ``gsMap`` package can be installed via pip:
 8 | 
 9 | .. code-block:: bash
10 | 
11 |     conda create -n gsMap python>=3.10
12 |     conda activate gsMap
13 |     pip install gsMap
14 | 
15 | or via the source code:
16 | 
17 | .. code-block:: bash
18 | 
19 |     git clone https://github.com/JianYang-Lab/gsMap
20 |     cd gsMap
21 |     pip install -e .
22 | 
23 | Verify the installation by running the following command:
24 | 
25 | .. code-block:: bash
26 | 
27 |     gsmap -v
28 | 


--------------------------------------------------------------------------------
/docs/source/quick_mode.md:
--------------------------------------------------------------------------------
  1 | # Mouse Embryo (Quick Mode)
  2 | 
  3 | The `Quick Mode` option provides a simplified and efficient way to execute the entire `gsMap` pipeline. It minimizes running time and configuration complexity by utilizing pre-calculated weights based on the 1000G EUR reference panel and protein-coding genes from gtf file of Gencode v46. This mode is ideal for users who prefer a streamlined approach. For a more customizable experience, such as using custom GTF files, reference panels, and adjustable parameters, please refer to the {doc}`Step by Step <step_by_step>` guide.
  4 | 
  5 | ## Preparation
  6 | 
  7 | Make sure you have {doc}`installed <install>` the `gsMap` package before proceeding.
  8 | 
  9 | ### 1. Download Dependencies
 10 | 
 11 | The `gsMap` package in quick mode requires the following resources:
 12 | 
 13 | - **Gene transfer format (GTF) file**, for gene coordinates on the genome.
 14 | - **LD reference panel**, in quick mode, we provide a pre-built LD score snp-by-gene matrix based on 1000G_EUR_Phase3.
 15 | - **SNP weight file**, to adjust correlations between SNP-trait association statistics.
 16 | - **Homologous gene transformations file** (optional), to map genes between species.
 17 | 
 18 | To download all the required files:
 19 | 
 20 | ```bash
 21 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_resource.tar.gz
 22 | tar -xvzf gsMap_resource.tar.gz
 23 | ```
 24 | 
 25 | Directory structure:
 26 | 
 27 | ```bash
 28 | tree -L 2
 29 | 
 30 | gsMap_resource
 31 |     ├── genome_annotation
 32 |     │   ├── enhancer
 33 |     │   └── gtf
 34 |     ├── homologs
 35 |     │   ├── macaque_human_homologs.txt
 36 |     │   └── mouse_human_homologs.txt
 37 |     ├── LD_Reference_Panel
 38 |     │   └── 1000G_EUR_Phase3_plink
 39 |     ├── LDSC_resource
 40 |     │   ├── hapmap3_snps
 41 |     │   └── weights_hm3_no_hla
 42 |     └── quick_mode
 43 |         ├── baseline
 44 |         ├── SNP_gene_pair
 45 |         └── snp_gene_weight_matrix.h5ad
 46 | ```
 47 | 
 48 | ### 2. Download Example Data
 49 | 
 50 | To run the quick mode example, you can download the example data as follows:
 51 | 
 52 | ```bash
 53 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_example_data.tar.gz
 54 | tar -xvzf gsMap_example_data.tar.gz
 55 | ```
 56 | 
 57 | Directory structure:
 58 | 
 59 | ```bash
 60 | tree -L 2
 61 | 
 62 | gsMap_example_data/
 63 | ├── GWAS
 64 | │   ├── BCX2_MCHC_EA_GWAMA.sumstats.gz
 65 | │   ├── GIANT_EUR_Height_2022_Nature.sumstats.gz
 66 | │   ├── gwas_config.yaml
 67 | │   └── IQ_NG_2018.sumstats.gz
 68 | └── ST
 69 |     └── E16.5_E1S1.MOSTA.h5ad
 70 | ```
 71 | 
 72 | ## Running `gsMap` in Quick Mode
 73 | 
 74 | <span style="color:#31a354"> Required memory: 80G (120K cells) </span>
 75 | 
 76 | ```bash
 77 | gsmap quick_mode \
 78 |     --workdir './example_quick_mode/Mouse_Embryo' \
 79 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
 80 |     --sample_name 'E16.5_E1S1.MOSTA' \
 81 |     --gsMap_resource_dir 'gsMap_resource' \
 82 |     --hdf5_path 'gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad' \
 83 |     --annotation 'annotation' \
 84 |     --data_layer 'count' \
 85 |     --sumstats_file 'gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
 86 |     --trait_name 'IQ'
 87 | ```
 88 | 
 89 | ### Parameters
 90 | 
 91 | - `--workdir`: The working directory where output files will be saved.
 92 | - `--homolog_file`: The homologous gene file for converting gene names from different species to human.
 93 | - `--sample_name`: The name of the sample (e.g., `E16.5_E1S1.MOSTA`).
 94 | - `--gsMap_resource_dir`: Path to the directory containing the `gsMap` resources.
 95 | - `--hdf5_path`: Path to the input HDF5 file with spatial transcriptomics (ST) data.
 96 | - `--annotation`: The name of the annotation column in the `adata.obs` of the input HDF5 file.
 97 | - `--data_layer`: The layer of the gene expression matrix (e.g., `count`).
 98 | - `--sumstats_file`: Path to the GWAS summary statistics file.
 99 | - `--trait_name`: Name of the trait (e.g., `IQ`).
100 | 
101 | ### Additional Options
102 | 
103 | - If you want to analyze multiple traits at once, provide a configuration file (`--sumstats_config_file`) instead of a single summary statistics file:
104 | 
105 | ```bash
106 | gsmap quick_mode \
107 |     --workdir './example_quick_mode/Mouse_Embryo' \
108 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt' \
109 |     --sample_name 'E16.5_E1S1.MOSTA' \
110 |     --gsMap_resource_dir 'gsMap_resource' \
111 |     --hdf5_path 'gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad' \
112 |     --annotation 'annotation' \
113 |     --data_layer 'count' \
114 |     --sumstats_config_file 'gsMap_example_data/GWAS/gwas_config.yaml'
115 | ```
116 | 
117 | The `gwas_config.yaml` file includes the following:
118 | 
119 | ```yaml
120 | Height: gsMap_example_data/GWAS/GIANT_EUR_Height_2022_Nature.sumstats.gz
121 | IQ: gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz
122 | SCZ: gsMap_example_data/GWAS/PGC3_SCZ_wave3_public_INFO80.sumstats.gz
123 | ```
124 | 
125 | ### Output Description
126 | 
127 | - The output files will be saved in the specified `--workdir` directory and will include intermediate files such as latent representations, gene marker scores, and LD scores. These intermediate files will be reused if you analyze another GWAS trait of the same sample within this `--workdir`.
128 | - A web report will be generated in the `report` folder, which includes visualizations of spatial cell-trait associations and diagnostic plots. To view the report, copy this folder to your local machine and open the HTML file in a web browser. You can refer to this [example report](https://yanglab.westlake.edu.cn/data/gsMap/IQ/E16.5_E1S1.MOSTA_IQ_gsMap_Report.html) for the `IQ` trait.
129 | 
130 | ### Example Output Structure
131 | 
132 | After running the analysis in quick mode, the following directory structure will be created:
133 | 
134 | ```bash
135 | tree -L 3
136 | 
137 | example_quick_mode/Mouse_Embryo
138 | ├── E16.5_E1S1.MOSTA
139 | │   ├── find_latent_representations  # Contains latent representations in h5ad format
140 | │   ├── latent_to_gene               # Gene marker scores
141 | │   ├── generate_ldscore             # LD scores
142 | │   ├── spatial_ldsc                 # Spatial cell-trait association results
143 | │   ├── cauchy_combination           # Region-level or cell type-level association results
144 | │   └── report                       # Web report with visualizations and diagnostics
145 | ```
146 | 


--------------------------------------------------------------------------------
/docs/source/release.rst:
--------------------------------------------------------------------------------
1 | Release notes
2 | =============
3 | 
4 | v1.70
5 | ------
6 | 
7 | First public release
8 | 


--------------------------------------------------------------------------------
/docs/source/step_by_step.md:
--------------------------------------------------------------------------------
  1 | # Mouse Embryo (Step by Step)
  2 | 
  3 | This tutorial guides you through running `gsMap` step by step, with user-defined parameters and resources, granting greater flexibility and control over the analysis. This mode is suited for users who require detailed customization of their pipeline. For a faster, one-command execution, please see the {doc}`Quick Mode <quick_mode>` tutorial.
  4 | 
  5 | ## Preparation
  6 | 
  7 | Please ensure you have {doc}`installed <install>` the `gsMap`. This tutorial guides you through using gsMap in a step-by-step manner.
  8 | 
  9 | ### 1. Download dependencies
 10 | 
 11 | `gsMap` requires specific reference files:
 12 | 
 13 | - **Gene transfer format (GTF) file**, for gene coordinates on the genome.
 14 | - **LD reference panel (PLINK bfile)**, for computing LD scores.
 15 | - **SNP weight file**, to adjust correlations between SNP-trait association statistics.
 16 | - **Homologous gene transformations file** (optional), to map genes between species.
 17 | - **Enhancer-gene mapping file** (optional), for linking SNPs to genes based on enhancer annotations.
 18 | 
 19 | To download the resources:
 20 | 
 21 | ```bash
 22 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_resource.tar.gz
 23 | tar -xvzf gsMap_resource.tar.gz
 24 | ```
 25 | 
 26 | Directory structure:
 27 | 
 28 | ```bash
 29 | tree -L 2
 30 | 
 31 | gsMap_resource
 32 |     ├── genome_annotation
 33 |     │   ├── enhancer
 34 |     │   └── gtf
 35 |     ├── homologs
 36 |     │   ├── macaque_human_homologs.txt
 37 |     │   └── mouse_human_homologs.txt
 38 |     ├── LD_Reference_Panel
 39 |     │   └── 1000G_EUR_Phase3_plink
 40 |     ├── LDSC_resource
 41 |     │   ├── hapmap3_snps
 42 |     │   └── weights_hm3_no_hla
 43 |     └── quick_mode
 44 |         ├── baseline
 45 |         ├── SNP_gene_pair
 46 |         └── snp_gene_weight_matrix.h5ad
 47 | ```
 48 | 
 49 | If you want to use your own reference files, please ensure that the genome build versions (e.g., Hg37 or Hg38) are consistent between the GTF file and the LD reference panel.
 50 | 
 51 | ### 2. Download example data
 52 | 
 53 | ```bash
 54 | wget https://yanglab.westlake.edu.cn/data/gsMap/gsMap_example_data.tar.gz
 55 | tar -xvzf gsMap_example_data.tar.gz
 56 | ```
 57 | 
 58 | Directory structure:
 59 | 
 60 | ```bash
 61 | tree -L 2
 62 | 
 63 | gsMap_example_data/
 64 | ├── GWAS
 65 | │   ├── BCX2_MCHC_EA_GWAMA.sumstats.gz
 66 | │   ├── GIANT_EUR_Height_2022_Nature.sumstats.gz
 67 | │   ├── gwas_config.yaml
 68 | │   └── IQ_NG_2018.sumstats.gz
 69 | └── ST
 70 |     └── E16.5_E1S1.MOSTA.h5ad
 71 | ```
 72 | 
 73 | ## Running `gsMap`
 74 | 
 75 | ### 1. find latent representations (optional)
 76 | 
 77 | **Objective**: Learn latent representations for spots. The latent embedding learned from this step will be store in the AnnData object `obsm` field under the key `latent_GVAE`.
 78 | 
 79 | ```{note}
 80 | The `--workdir` parameter specifies the working directory for gsMap, where all outputs will be saved.
 81 | ```
 82 | 
 83 | **Execution**: <span style="color:#31a354"> required memory: ~60G (120K cells) </span>
 84 | 
 85 | ```bash
 86 | gsmap run_find_latent_representations \
 87 |     --workdir './example/Mouse_Embryo' \
 88 |     --sample_name 'E16.5_E1S1.MOSTA' \
 89 |     --input_hdf5_path 'gsMap_example_data/ST/E16.5_E1S1.MOSTA.h5ad' \
 90 |     --annotation 'annotation' \
 91 |     --data_layer 'count'
 92 | ```
 93 | 
 94 | ### 2. generate gene specificity scores
 95 | 
 96 | **Objective**: Identify homogeneous spots for each spot based on their latent representations specified by `--latent_representation`, and then generate gene specificity scores (GSS) for each spot by aggregating information from its homogeneous spots.
 97 | 
 98 | ```{note}
 99 | If your ST data is not from a human species but you want to map human GWAS data to it, please provide a homologous transformation file to convert gene names. The first column should list gene names from the ST data species, and the second column from the GWAS data species.
100 | ```
101 | 
102 | **Execution**: <span style="color:#31a354"> required memory: ~45G (120K cells) </span>
103 | 
104 | ```bash
105 | gsmap run_latent_to_gene \
106 |     --workdir './example/Mouse_Embryo' \
107 |     --sample_name 'E16.5_E1S1.MOSTA' \
108 |     --annotation 'annotation' \
109 |     --latent_representation 'latent_GVAE' \
110 |     --num_neighbour 51 \
111 |     --num_neighbour_spatial 201 \
112 |     --homolog_file 'gsMap_resource/homologs/mouse_human_homologs.txt'
113 | ```
114 | 
115 | ### 3. generate ldscore
116 | 
117 | **Objective**: Assign gene specificity scores (GSS) to SNPs and compute the stratified LD score.
118 | 
119 | **Execution**: <span style="color:#31a354"> required memory: ~40G </span>
120 | 
121 | **Three SNP to gene linking strategies are available:**
122 | 
123 | ````{tab} 1. Use TSS
124 | This strategy uses TSS to assign GSS to SNPs. The --`gene_window_size parameter` defines the window size around the gene body for this assignment. If a SNP falls within the window of multiple genes, the GSS from the nearest gene will be used.
125 | 
126 | ```bash
127 | for CHROM in {1..22}
128 | do
129 |     gsmap run_generate_ldscore \
130 |         --workdir './example/Mouse_Embryo' \
131 |         --sample_name 'E16.5_E1S1.MOSTA' \
132 |         --chrom $CHROM \
133 |         --bfile_root 'gsMap_resource/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC' \
134 |         --keep_snp_root 'gsMap_resource/LDSC_resource/hapmap3_snps/hm' \
135 |         --gtf_annotation_file 'gsMap_resource/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf' \
136 |         --gene_window_size 50000
137 | done
138 | ```
139 | ````
140 | 
141 | ````{tab} 2. Use Enhancer-Gene Linking
142 | This strategy uses enhancer-gene linking to assign GSS to SNPs. When a SNP maps to multiple enhancers, the GSS for the SNP is determined by the `--snp_multiple_enhancer_strategy` parameter. By default, this is set to `max_mkscore`, which assigns the SNP the maximum GSS among the enhancers it maps to. Another option is `nearest_TSS`.
143 | 
144 | ```bash
145 | for CHROM in {1..22}
146 | do
147 |     gsmap run_generate_ldscore \
148 |         --workdir './example/Mouse_Embryo' \
149 |         --sample_name 'E16.5_E1S1.MOSTA' \
150 |         --chrom $CHROM \
151 |         --bfile_root 'gsMap_resource/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC' \
152 |         --keep_snp_root 'gsMap_resource/LDSC_resource/hapmap3_snps/hm' \
153 |         --gtf_annotation_file 'gsMap_resource/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf' \
154 |         --enhancer_annotation_file 'gsMap_resource/genome_annotation/enhancer/by_tissue/ALL/ABC_roadmap_merged.bed' \
155 |         --snp_multiple_enhancer_strategy 'max_mkscore' \
156 |         --gene_window_enhancer_priority 'enhancer_only'
157 | done
158 | ```
159 | ````
160 | 
161 | ````{tab} 3. Use TSS and Enhancer-Gene Linking
162 | This strategy uses both TSS and enhancer-gene linking to assign GSS to SNPs. If a SNP maps to both a gene TSS window and an enhancer linked to a different gene, the `--gene_window_enhancer_priority` parameter decides which gene the SNP is assigned to. The options are `gene_window_first` or `enhancer_first`.
163 | 
164 | ```bash
165 | for CHROM in {1..22}
166 | do
167 |     gsmap run_generate_ldscore \
168 |         --workdir './example/Mouse_Embryo' \
169 |         --sample_name 'E16.5_E1S1.MOSTA' \
170 |         --chrom $CHROM \
171 |         --bfile_root 'gsMap_resource/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC' \
172 |         --keep_snp_root 'gsMap_resource/LDSC_resource/hapmap3_snps/hm' \
173 |         --gtf_annotation_file 'gsMap_resource/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf' \
174 |         --gene_window_size 50000 \
175 |         --enhancer_annotation_file 'gsMap_resource/genome_annotation/enhancer/by_tissue/ALL/ABC_roadmap_merged.bed' \
176 |         --snp_multiple_enhancer_strategy 'max_mkscore' \
177 |         --gene_window_enhancer_priority 'gene_window_first'
178 | done
179 | ```
180 | ````
181 | 
182 | ```{caution}
183 | If you run out of memory during this step or the next, you can reduce the `--spots_per_chunk` parameter to a smaller value. Generally, 40GB of memory is required when `--spots_per_chunk` is set to 1000.
184 | ```
185 | 
186 | ### 4. spatial ldsc
187 | 
188 | **Objective**: Run spatial LDSC to associate spots with traits.
189 | 
190 | **Execution**: <span style="color:#31a354"> required memory: ~40G </span>
191 | 
192 | ```bash
193 | gsmap run_spatial_ldsc \
194 |     --workdir './example/Mouse_Embryo' \
195 |     --sample_name 'E16.5_E1S1.MOSTA' \
196 |     --trait_name 'IQ' \
197 |     --sumstats_file 'gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
198 |     --w_file 'gsMap_resource/LDSC_resource/weights_hm3_no_hla/weights.' \
199 |     --num_processes 4
200 | ```
201 | 
202 | ### 5. cauchy combination (optional)
203 | 
204 | **Objective**: Aggregate P values of individual spots within specific spatial regions (cell types) to evaluate the association of these regions (cell types) with the trait.
205 | 
206 | **Execution**: <span style="color:#31a354"> required memory: ~12G </span>
207 | 
208 | ```bash
209 | gsmap run_cauchy_combination \
210 |     --workdir './example/Mouse_Embryo' \
211 |     --sample_name 'E16.5_E1S1.MOSTA' \
212 |     --trait_name 'IQ' \
213 |     --annotation 'annotation'
214 | ```
215 | 
216 | ### 6. report generation (optional)
217 | 
218 | **Objective**: Generate gsMap reports, including visualizations of mapping results and diagnostic plots.
219 | 
220 | ```{note}
221 | The default genes for visualization are the top 50 genes whose GSS shows the highest correlation with the -log10 p-values of the trait-cell associations. To select specific genes for visualization, use the `--selected_genes` parameter.
222 | ```
223 | 
224 | **Execution**: <span style="color:#31a354"> required memory: ~60G </span>
225 | 
226 | ```bash
227 | gsmap run_report \
228 |     --workdir './example/Mouse_Embryo' \
229 |     --sample_name 'E16.5_E1S1.MOSTA' \
230 |     --trait_name 'IQ' \
231 |     --annotation 'annotation' \
232 |     --sumstats_file 'gsMap_example_data/GWAS/IQ_NG_2018.sumstats.gz' \
233 |     --top_corr_genes 50
234 | ```
235 | 


--------------------------------------------------------------------------------
/docs/source/tutorials.rst:
--------------------------------------------------------------------------------
 1 | gsMap Tutorials
 2 | =================
 3 | 
 4 | 
 5 | Welcome to the gsMap Tutorials. In this section, we provide detailed examples and guides to help you understand and utilize gsMap effectively. Our tutorials include mouse embryo ST data and three sets of cleaned GWAS summary statistics to demonstrate the application of gsMap in spatially mapping trait-associated cells (spots).
 6 | 
 7 | Datasets used in the tutorials
 8 | -------------------------------
 9 | 
10 | 1. **ST data**
11 |      1. Mouse E16.5 embryo sagittal section - (E16.5_E1S1.MOSTA.h5ad)
12 | 
13 | 2. **GWAS data**
14 |      1. Height - (GIANT_EUR_Height_2022_Nature.sumstats.gz)
15 |      2. Intelligence - (IQ_NG_2018.sumstats.gz)
16 |      3. Mean corpusular hemoglobin concerntration - (BCX2_MCHC_EA_GWAMA.sumstats.gz)
17 | 
18 | For how to use your own data, please check the :ref:`data-format` section.
19 | 
20 | Tutorials
21 | ---------------
22 | The tutorials are organized as follows:
23 | 
24 | .. toctree::
25 |     :maxdepth: 1
26 | 
27 |     quick_mode.md
28 |     step_by_step.md
29 |     advanced_usage.md
30 |     10x.md
31 |     data_format.md
32 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["flit_core >=3.2,<4"]
  3 | build-backend = "flit_core.buildapi"
  4 | 
  5 | [project]
  6 | name = "gsMap"
  7 | authors = [{ name = "liyang", email = "songliyang@westlake.edu.cn" },
  8 |     { name = "wenhao", email = "chenwenhao@westlake.edu.cn" }]
  9 | readme = "README.md"
 10 | license = { file = "LICENSE" }
 11 | requires-python = ">=3.10"
 12 | classifiers = [
 13 |     "Development Status :: 3 - Alpha",
 14 |     "Intended Audience :: Developers",
 15 |     "License :: OSI Approved :: MIT License",
 16 |     "Programming Language :: Python :: 3.10",
 17 |     "Programming Language :: Python :: 3.11",
 18 |     "Programming Language :: Python :: 3.12",
 19 |     "Operating System :: POSIX :: Linux",
 20 | ]
 21 | dynamic = ["version", "description"]
 22 | dependencies = [
 23 |     "numpy < 2.0.0",
 24 |     "pandas",
 25 |     "scipy",
 26 |     "scikit-learn",
 27 |     "matplotlib",
 28 |     "seaborn",
 29 |     "tqdm",
 30 |     "pyyaml",
 31 |     "torch",
 32 |     "torch-geometric",
 33 |     "pyranges",
 34 |     "pyfiglet",
 35 |     'plotly',
 36 |     'kaleido',
 37 |     'jinja2',
 38 |     'scanpy >=1.8.0',
 39 |     'zarr>=2,<3',
 40 |     'bitarray >=2.9.2, <3.0.0',
 41 |     'pyarrow',
 42 |     'scikit-misc'
 43 | ]
 44 | 
 45 | [project.optional-dependencies]
 46 | doc = [
 47 |     "sphinx",
 48 |     'sphinx-argparse',
 49 |     'sphinx-autobuild',
 50 |     'sphinx-autodoc-typehints',
 51 |     'sphinx-basic-ng',
 52 |     'sphinx-charts',
 53 |     'sphinx-copybutton',
 54 |     'sphinx_inline_tabs',
 55 |     'sphinx-markdown-tables',
 56 |     'sphinx-rtd-theme',
 57 |     'sphinxcontrib-applehelp',
 58 |     'sphinxcontrib-devhelp',
 59 |     'sphinxcontrib-htmlhelp',
 60 |     'sphinxcontrib-jquery',
 61 |     'sphinxcontrib-jsmath',
 62 |     'sphinxcontrib-qthelp',
 63 |     'sphinxcontrib-serializinghtml',
 64 |     'furo',
 65 |     'myst-parser',
 66 |     'nbsphinx',
 67 | ]
 68 | 
 69 | tests = [
 70 |     "pytest>=7.0.0",
 71 |     "pytest-cov>=4.0.0",
 72 |     "coverage",
 73 | ]
 74 | 
 75 | [project.urls]
 76 | Home = "https://github.com/JianYang-Lab/gsMap"
 77 | Documentation = "https://yanglab.westlake.edu.cn/gsmap/document/software"
 78 | Website = "https://yanglab.westlake.edu.cn/gsmap/home"
 79 | 
 80 | [project.scripts]
 81 | gsmap = "gsMap.main:main"
 82 | 
 83 | [tool.flit.module]
 84 | name = "gsMap"
 85 | 
 86 | [tool.flit.sdist]
 87 | # Include the HTML template in the source distribution
 88 | include = [
 89 |     "src/gsMap/templates/*.html"
 90 | ]
 91 | 
 92 | 
 93 | [tool.ruff]
 94 | src = ["src"]
 95 | line-length = 99
 96 | indent-width = 4
 97 | target-version = "py312"
 98 | 
 99 | # Exclude a variety of commonly ignored directories.
100 | exclude = [
101 |     ".bzr",
102 |     ".direnv",
103 |     ".eggs",
104 |     ".git",
105 |     ".git-rewrite",
106 |     ".hg",
107 |     ".mypy_cache",
108 |     ".nox",
109 |     ".pants.d",
110 |     ".pytype",
111 |     ".ruff_cache",
112 |     ".svn",
113 |     ".tox",
114 |     ".venv",
115 |     "__pypackages__",
116 |     "_build",
117 |     "buck-out",
118 |     "build",
119 |     "dist",
120 |     "node_modules",
121 |     "venv",
122 | ]
123 | 
124 | [tool.ruff.lint]
125 | select = [
126 |     "F",  # Errors detected by Pyflakes
127 |     "E",  # Error detected by Pycodestyle
128 |     "W",  # Warning detected by Pycodestyle
129 |     "I",  # isort
130 |     "B",  # flake8-bugbear
131 |     "TID",  # flake8-tidy-imports
132 |     "C4",  # flake8-comprehensions
133 |     "BLE",  # flake8-blind-except
134 |     "UP",  # pyupgrade
135 |     "RUF100",  # Report unused noqa directives
136 |     "PT", # pytest style
137 |     "NPY", # numpy formatting
138 |     "TCH", # flake8-type-checking
139 |     "FA", # flake8-future-annotations
140 | ]
141 | ignore = [
142 |     # allow I, O, l as variable names -> I is the identity matrix
143 |     "E741",
144 |     # Errors from function calls in argument defaults. These are fine when the result is immutable.
145 |     "B008",
146 |     # Raising ValueError is sufficient in tests.
147 |     "PT011",
148 |     # We support np.random functions.
149 |     "NPY002",
150 |     # Line too long
151 |     "E501",
152 |     # Loop variable is not used
153 |     "B007",
154 |     # Allow string in percent format
155 |     "UP031",
156 |     # Allow dict call within literal
157 |     "C408"
158 | ]
159 | 
160 | [tool.ruff.lint.pydocstyle]
161 | convention = "numpy"
162 | 
163 | [tool.ruff.lint.per-file-ignores]
164 | "docs/*" = ["I", "BLE001"]
165 | "tests/*" = ["D"]
166 | "*/__init__.py" = ["F401"]
167 | 
168 | [tool.ruff.format]
169 | docstring-code-format = true
170 | # Like Black, use double quotes for strings.
171 | quote-style = "double"
172 | 
173 | # Like Black, indent with spaces, rather than tabs.
174 | indent-style = "space"
175 | 
176 | # Like Black, respect magic trailing commas.
177 | skip-magic-trailing-comma = false
178 | 
179 | # Like Black, automatically detect the appropriate line ending.
180 | line-ending = "auto"
181 | 
182 | [tool.jupytext]
183 | formats = "ipynb,md"
184 | 
185 | [tool.ruff.lint.flake8-type-checking]
186 | exempt-modules = []
187 | strict = true
188 | 


--------------------------------------------------------------------------------
/schematic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JianYang-Lab/gsMap/13b72534e0ad3d32c648025b32c50305239c517a/schematic.png


--------------------------------------------------------------------------------
/src/gsMap/GNN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JianYang-Lab/gsMap/13b72534e0ad3d32c648025b32c50305239c517a/src/gsMap/GNN/__init__.py


--------------------------------------------------------------------------------
/src/gsMap/GNN/adjacency_matrix.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scipy.sparse as sp
 4 | import torch
 5 | from sklearn.neighbors import NearestNeighbors
 6 | 
 7 | 
 8 | def cal_spatial_net(adata, n_neighbors=5, verbose=True):
 9 |     """Construct the spatial neighbor network."""
10 |     if verbose:
11 |         print("------Calculating spatial graph...")
12 |     coor = pd.DataFrame(adata.obsm["spatial"], index=adata.obs.index)
13 |     nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(coor)
14 |     distances, indices = nbrs.kneighbors(coor)
15 |     n_cells, n_neighbors = indices.shape
16 |     cell_indices = np.arange(n_cells)
17 |     cell1 = np.repeat(cell_indices, n_neighbors)
18 |     cell2 = indices.flatten()
19 |     distance = distances.flatten()
20 |     knn_df = pd.DataFrame({"Cell1": cell1, "Cell2": cell2, "Distance": distance})
21 |     knn_df = knn_df[knn_df["Distance"] > 0].copy()
22 |     cell_id_map = dict(zip(cell_indices, coor.index, strict=False))
23 |     knn_df["Cell1"] = knn_df["Cell1"].map(cell_id_map)
24 |     knn_df["Cell2"] = knn_df["Cell2"].map(cell_id_map)
25 |     return knn_df
26 | 
27 | 
28 | def sparse_mx_to_torch_sparse_tensor(sparse_mx):
29 |     """Convert a scipy sparse matrix to a torch sparse tensor."""
30 |     sparse_mx = sparse_mx.tocoo().astype(np.float32)
31 |     indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
32 |     values = torch.from_numpy(sparse_mx.data)
33 |     shape = torch.Size(sparse_mx.shape)
34 |     return torch.sparse_coo_tensor(indices, values, shape)
35 | 
36 | 
37 | def preprocess_graph(adj):
38 |     """Symmetrically normalize the adjacency matrix."""
39 |     adj = sp.coo_matrix(adj)
40 |     adj_ = adj + sp.eye(adj.shape[0])
41 |     rowsum = np.array(adj_.sum(1)).flatten()
42 |     degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5))
43 |     adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
44 |     return sparse_mx_to_torch_sparse_tensor(adj_normalized)
45 | 
46 | 
47 | def construct_adjacency_matrix(adata, params, verbose=True):
48 |     """Construct the adjacency matrix from spatial data."""
49 |     spatial_net = cal_spatial_net(adata, n_neighbors=params.n_neighbors, verbose=verbose)
50 |     if verbose:
51 |         num_edges = spatial_net.shape[0]
52 |         num_cells = adata.n_obs
53 |         print(f"The graph contains {num_edges} edges, {num_cells} cells.")
54 |         print(f"{num_edges / num_cells:.2f} neighbors per cell on average.")
55 |     cell_ids = {cell: idx for idx, cell in enumerate(adata.obs.index)}
56 |     spatial_net["Cell1"] = spatial_net["Cell1"].map(cell_ids)
57 |     spatial_net["Cell2"] = spatial_net["Cell2"].map(cell_ids)
58 |     if params.weighted_adj:
59 |         distance_normalized = spatial_net["Distance"] / (spatial_net["Distance"].max() + 1)
60 |         weights = np.exp(-0.5 * distance_normalized**2)
61 |         adj_org = sp.coo_matrix(
62 |             (weights, (spatial_net["Cell1"], spatial_net["Cell2"])),
63 |             shape=(adata.n_obs, adata.n_obs),
64 |         )
65 |     else:
66 |         adj_org = sp.coo_matrix(
67 |             (np.ones(spatial_net.shape[0]), (spatial_net["Cell1"], spatial_net["Cell2"])),
68 |             shape=(adata.n_obs, adata.n_obs),
69 |         )
70 |     adj_norm = preprocess_graph(adj_org)
71 |     norm_value = adj_org.shape[0] ** 2 / ((adj_org.shape[0] ** 2 - adj_org.sum()) * 2)
72 |     graph_dict = {"adj_org": adj_org, "adj_norm": adj_norm, "norm_value": norm_value}
73 |     return graph_dict
74 | 


--------------------------------------------------------------------------------
/src/gsMap/GNN/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | from torch_geometric.nn import GATConv
 5 | 
 6 | 
 7 | def full_block(in_features, out_features, p_drop):
 8 |     return nn.Sequential(
 9 |         nn.Linear(in_features, out_features),
10 |         nn.BatchNorm1d(out_features),
11 |         nn.ELU(),
12 |         nn.Dropout(p=p_drop),
13 |     )
14 | 
15 | 
16 | class GATModel(nn.Module):
17 |     def __init__(self, input_dim, params, num_classes=1):
18 |         super().__init__()
19 |         self.var = params.var
20 |         self.num_classes = num_classes
21 |         self.params = params
22 | 
23 |         # Encoder
24 |         self.encoder = nn.Sequential(
25 |             full_block(input_dim, params.feat_hidden1, params.p_drop),
26 |             full_block(params.feat_hidden1, params.feat_hidden2, params.p_drop),
27 |         )
28 | 
29 |         # GAT Layers
30 |         self.gat1 = GATConv(
31 |             in_channels=params.feat_hidden2,
32 |             out_channels=params.gat_hidden1,
33 |             heads=params.nheads,
34 |             dropout=params.p_drop,
35 |         )
36 |         self.gat2 = GATConv(
37 |             in_channels=params.gat_hidden1 * params.nheads,
38 |             out_channels=params.gat_hidden2,
39 |             heads=1,
40 |             concat=False,
41 |             dropout=params.p_drop,
42 |         )
43 |         if self.var:
44 |             self.gat3 = GATConv(
45 |                 in_channels=params.gat_hidden1 * params.nheads,
46 |                 out_channels=params.gat_hidden2,
47 |                 heads=1,
48 |                 concat=False,
49 |                 dropout=params.p_drop,
50 |             )
51 | 
52 |         # Decoder
53 |         self.decoder = nn.Sequential(
54 |             full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
55 |             full_block(params.feat_hidden2, params.feat_hidden1, params.p_drop),
56 |             nn.Linear(params.feat_hidden1, input_dim),
57 |         )
58 | 
59 |         # Clustering Layer
60 |         self.cluster = nn.Sequential(
61 |             full_block(params.gat_hidden2, params.feat_hidden2, params.p_drop),
62 |             nn.Linear(params.feat_hidden2, self.num_classes),
63 |         )
64 | 
65 |     def encode(self, x, edge_index):
66 |         x = self.encoder(x)
67 |         x = self.gat1(x, edge_index)
68 |         x = F.relu(x)
69 |         x = F.dropout(x, p=self.params.p_drop, training=self.training)
70 | 
71 |         mu = self.gat2(x, edge_index)
72 |         if self.var:
73 |             logvar = self.gat3(x, edge_index)
74 |             return mu, logvar
75 |         else:
76 |             return mu, None
77 | 
78 |     def reparameterize(self, mu, logvar):
79 |         if self.training and logvar is not None:
80 |             std = torch.exp(0.5 * logvar)
81 |             eps = torch.randn_like(std)
82 |             return eps * std + mu
83 |         else:
84 |             return mu
85 | 
86 |     def forward(self, x, edge_index):
87 |         mu, logvar = self.encode(x, edge_index)
88 |         z = self.reparameterize(mu, logvar)
89 |         x_reconstructed = self.decoder(z)
90 |         # pred_label = F.softmax(self.cluster(z), dim=1)
91 |         pred_label = self.cluster(z)
92 |         return pred_label, x_reconstructed, z, mu, logvar
93 | 


--------------------------------------------------------------------------------
/src/gsMap/GNN/train.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import time
 3 | 
 4 | import torch
 5 | import torch.nn.functional as F
 6 | from tqdm import tqdm
 7 | 
 8 | from gsMap.GNN.model import GATModel
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def reconstruction_loss(decoded, x):
14 |     """Compute the mean squared error loss."""
15 |     return F.mse_loss(decoded, x)
16 | 
17 | 
18 | def label_loss(pred_label, true_label):
19 |     """Compute the cross-entropy loss."""
20 |     return F.cross_entropy(pred_label, true_label.long())
21 | 
22 | 
23 | class ModelTrainer:
24 |     def __init__(self, node_x, graph_dict, params, label=None):
25 |         """Initialize the ModelTrainer with data and hyperparameters."""
26 |         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27 |         self.params = params
28 |         self.epochs = params.epochs
29 |         self.node_x = torch.FloatTensor(node_x).to(self.device)
30 |         self.adj_norm = graph_dict["adj_norm"].to(self.device).coalesce()
31 |         self.label = label
32 |         self.num_classes = 1
33 | 
34 |         if self.label is not None:
35 |             self.label = torch.tensor(self.label).to(self.device)
36 |             self.num_classes = len(torch.unique(self.label))
37 | 
38 |         # Set up the model
39 |         self.model = GATModel(self.params.feat_cell, self.params, self.num_classes).to(self.device)
40 |         self.optimizer = torch.optim.Adam(
41 |             self.model.parameters(), lr=self.params.gat_lr, weight_decay=self.params.gcn_decay
42 |         )
43 | 
44 |     def run_train(self):
45 |         """Train the model."""
46 |         self.model.train()
47 |         prev_loss = float("inf")
48 |         logger.info("Start training...")
49 |         pbar = tqdm(range(self.epochs), desc="GAT-AE model train:", total=self.epochs)
50 |         for epoch in range(self.epochs):
51 |             start_time = time.time()
52 |             self.optimizer.zero_grad()
53 |             pred_label, de_feat, latent_z, mu, logvar = self.model(self.node_x, self.adj_norm)
54 |             loss_rec = reconstruction_loss(de_feat, self.node_x)
55 | 
56 |             if self.label is not None:
57 |                 loss_pre = label_loss(pred_label, self.label)
58 |                 loss = self.params.rec_w * loss_rec + self.params.label_w * loss_pre
59 |             else:
60 |                 loss = loss_rec
61 | 
62 |             loss.backward()
63 |             self.optimizer.step()
64 | 
65 |             batch_time = time.time() - start_time
66 |             left_time = batch_time * (self.epochs - epoch - 1) / 60  # in minutes
67 | 
68 |             pbar.set_postfix({"Left time": f"{left_time:.2f} mins", "Loss": f"{loss.item():.4f}"})
69 |             pbar.update(1)
70 | 
71 |             if abs(loss.item() - prev_loss) <= self.params.convergence_threshold and epoch >= 200:
72 |                 pbar.close()
73 |                 logger.info("Convergence reached. Training stopped.")
74 |                 break
75 |             prev_loss = loss.item()
76 |         else:
77 |             pbar.close()
78 |             logger.info("Max epochs reached. Training stopped.")
79 | 
80 |     def get_latent(self):
81 |         """Retrieve the latent representation from the model."""
82 |         self.model.eval()
83 |         with torch.no_grad():
84 |             _, _, latent_z, _, _ = self.model(self.node_x, self.adj_norm)
85 |         return latent_z.cpu().numpy()
86 | 


--------------------------------------------------------------------------------
/src/gsMap/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Genetically informed spatial mapping of cells for complex traits
3 | """
4 | 
5 | __version__ = "1.73.5"
6 | 


--------------------------------------------------------------------------------
/src/gsMap/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 | 
3 | if __name__ == "__main__":
4 |     main()
5 | 


--------------------------------------------------------------------------------
/src/gsMap/cauchy_combination_test.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import scanpy as sc
  7 | import scipy as sp
  8 | 
  9 | from gsMap.config import CauchyCombinationConfig
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | # The fun of cauchy combination
 15 | def acat_test(pvalues, weights=None):
 16 |     """acat_test()
 17 |     Aggregated Cauchy Assocaition Test
 18 |     A p-value combination method using the Cauchy distribution.
 19 | 
 20 |     Inspired by: https://github.com/yaowuliu/ACAT/blob/master/R/ACAT.R
 21 |     Inputs:
 22 |         pvalues: <list or numpy array>
 23 |             The p-values you want to combine.
 24 |         weights: <list or numpy array>, default=None
 25 |             The weights for each of the p-values. If None, equal weights are used.
 26 | 
 27 |     Returns
 28 |     -------
 29 |         pval: <float>
 30 |             The ACAT combined p-value.
 31 |     """
 32 |     if any(np.isnan(pvalues)):
 33 |         raise Exception("Cannot have NAs in the p-values.")
 34 |     if any((i > 1) | (i < 0) for i in pvalues):
 35 |         raise Exception("P-values must be between 0 and 1.")
 36 |     if any(i == 1 for i in pvalues) & any(i == 0 for i in pvalues):
 37 |         raise Exception("Cannot have both 0 and 1 p-values.")
 38 |     if any(i == 0 for i in pvalues):
 39 |         logger.info("Warn: p-values are exactly 0.")
 40 |         return 0
 41 |     if any(i == 1 for i in pvalues):
 42 |         logger.info("Warn: p-values are exactly 1.")
 43 |         return 1
 44 |     if weights is None:
 45 |         weights = [1 / len(pvalues) for i in pvalues]
 46 |     elif len(weights) != len(pvalues):
 47 |         raise Exception("Length of weights and p-values differs.")
 48 |     elif any(i < 0 for i in weights):
 49 |         raise Exception("All weights must be positive.")
 50 |     else:
 51 |         weights = [i / np.sum(weights) for i in weights]
 52 | 
 53 |     pvalues = np.array(pvalues)
 54 |     weights = np.array(weights)
 55 | 
 56 |     if not any(i < 1e-15 for i in pvalues):
 57 |         cct_stat = sum(weights * np.tan((0.5 - pvalues) * np.pi))
 58 |     else:
 59 |         is_small = [i < (1e-15) for i in pvalues]
 60 |         is_large = [i >= (1e-15) for i in pvalues]
 61 |         cct_stat = sum((weights[is_small] / pvalues[is_small]) / np.pi)
 62 |         cct_stat += sum(weights[is_large] * np.tan((0.5 - pvalues[is_large]) * np.pi))
 63 | 
 64 |     if cct_stat > 1e15:
 65 |         pval = (1 / cct_stat) / np.pi
 66 |     else:
 67 |         pval = 1 - sp.stats.cauchy.cdf(cct_stat)
 68 | 
 69 |     return pval
 70 | 
 71 | 
 72 | def run_Cauchy_combination(config: CauchyCombinationConfig):
 73 |     ldsc_list = []
 74 | 
 75 |     for sample_name in config.sample_name_list:
 76 |         config.sample_name = sample_name
 77 | 
 78 |         # Load the LDSC results for the current sample
 79 |         logger.info(f"------Loading LDSC results for sample {sample_name}...")
 80 |         ldsc_input_file = config.get_ldsc_result_file(
 81 |             trait_name=config.trait_name,
 82 |         )
 83 |         ldsc = pd.read_csv(ldsc_input_file, compression="gzip")
 84 |         ldsc["spot"] = ldsc["spot"].astype(str)
 85 |         ldsc.index = ldsc["spot"]
 86 | 
 87 |         # Load the spatial transcriptomics (ST) data for the current sample
 88 |         logger.info(f"------Loading ST data for sample {sample_name}...")
 89 |         h5ad_file = config.hdf5_with_latent_path
 90 |         adata = sc.read_h5ad(h5ad_file)
 91 | 
 92 |         # Identify common cells between LDSC results and ST data
 93 |         common_cells = np.intersect1d(ldsc.index, adata.obs_names)
 94 |         adata = adata[common_cells]
 95 |         ldsc = ldsc.loc[common_cells]
 96 | 
 97 |         # Add annotations to the LDSC dataframe
 98 |         ldsc["annotation"] = adata.obs.loc[ldsc.spot, config.annotation].to_list()
 99 |         ldsc_list.append(ldsc)
100 | 
101 |     # Concatenate all LDSC dataframes from different samples
102 |     ldsc_all = pd.concat(ldsc_list)
103 | 
104 |     # Run the Cauchy combination
105 |     p_cauchy = []
106 |     p_median = []
107 |     annotations = ldsc_all["annotation"].unique()
108 | 
109 |     for ct in annotations:
110 |         p_values = ldsc_all.loc[ldsc_all["annotation"] == ct, "p"]
111 | 
112 |         # Handle extreme outliers to enhance robustness
113 |         p_values_log = -np.log10(p_values)
114 |         median_log = np.median(p_values_log)
115 |         iqr_log = np.percentile(p_values_log, 75) - np.percentile(p_values_log, 25)
116 | 
117 |         p_values_filtered = p_values[p_values_log < median_log + 3 * iqr_log]
118 |         n_removed = len(p_values) - len(p_values_filtered)
119 | 
120 |         # Remove outliers if the number is reasonable
121 |         if 0 < n_removed < max(len(p_values) * 0.01, 20):
122 |             logger.info(f"Removed {n_removed}/{len(p_values)} outliers (median + 3IQR) for {ct}.")
123 |             p_cauchy_temp = acat_test(p_values_filtered)
124 |         else:
125 |             p_cauchy_temp = acat_test(p_values)
126 | 
127 |         p_median_temp = np.median(p_values)
128 |         p_cauchy.append(p_cauchy_temp)
129 |         p_median.append(p_median_temp)
130 | 
131 |     # Prepare the results dataframe
132 |     results = pd.DataFrame({"annotation": annotations, "p_cauchy": p_cauchy, "p_median": p_median})
133 |     results.sort_values(by="p_cauchy", inplace=True)
134 | 
135 |     # Save the results
136 |     Path(config.output_file).parent.mkdir(parents=True, exist_ok=True, mode=0o755)
137 |     output_file = Path(config.output_file)
138 |     results.to_csv(
139 |         output_file,
140 |         compression="gzip",
141 |         index=False,
142 |     )
143 |     logger.info(f"Cauchy combination results saved at {output_file}.")
144 |     return results
145 | 


--------------------------------------------------------------------------------
/src/gsMap/create_slice_mean.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | import anndata
  5 | import numpy as np
  6 | import pandas as pd
  7 | import scanpy as sc
  8 | import scipy
  9 | import zarr
 10 | from scipy.stats import gmean, rankdata
 11 | from tqdm import tqdm
 12 | 
 13 | from gsMap.config import CreateSliceMeanConfig
 14 | 
 15 | # %% Helper functions
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def get_common_genes(h5ad_files, config: CreateSliceMeanConfig):
 20 |     """
 21 |     Get common genes from a list of h5ad files.
 22 |     """
 23 |     common_genes = None
 24 |     for file in tqdm(h5ad_files, desc="Finding common genes"):
 25 |         adata = sc.read_h5ad(file)
 26 |         sc.pp.filter_genes(adata, min_cells=1)
 27 |         adata.var_names_make_unique()
 28 |         if common_genes is None:
 29 |             common_genes = adata.var_names
 30 |         else:
 31 |             common_genes = common_genes.intersection(adata.var_names)
 32 |     # sort
 33 | 
 34 |     if config.species is not None:
 35 |         homologs = pd.read_csv(config.homolog_file, sep="\t")
 36 |         if homologs.shape[1] < 2:
 37 |             raise ValueError(
 38 |                 "Homologs file must have at least two columns: one for the species and one for the human gene symbol."
 39 |             )
 40 |         homologs.columns = [config.species, "HUMAN_GENE_SYM"]
 41 |         homologs.set_index(config.species, inplace=True)
 42 |         common_genes = np.intersect1d(common_genes, homologs.index)
 43 | 
 44 |     common_genes = sorted(common_genes)
 45 |     return common_genes
 46 | 
 47 | 
 48 | def calculate_one_slice_mean(
 49 |     sample_name, file_path: Path, common_genes, zarr_group_path, data_layer
 50 | ):
 51 |     """
 52 |     Calculate the geometric mean (using log trick) of gene expressions for a single slice and store it in a Zarr group.
 53 |     """
 54 |     # file_name = file_path.name
 55 |     gmean_zarr_group = zarr.open(zarr_group_path, mode="a")
 56 |     adata = anndata.read_h5ad(file_path)
 57 | 
 58 |     if data_layer in adata.layers.keys():
 59 |         adata.X = adata.layers[data_layer]
 60 |     elif data_layer == "X":
 61 |         pass
 62 |     else:
 63 |         raise ValueError(f"Data layer {data_layer} not found in {file_path}")
 64 | 
 65 |     adata = adata[:, common_genes].copy()
 66 |     n_cells = adata.shape[0]
 67 | 
 68 |     if not scipy.sparse.issparse(adata.X):
 69 |         adata_X = scipy.sparse.csr_matrix(adata.X)
 70 |     elif isinstance(adata.X, scipy.sparse.csr_matrix):
 71 |         adata_X = adata.X  # Avoid copying if already CSR
 72 |     else:
 73 |         adata_X = adata.X.tocsr()
 74 | 
 75 |     ranks = np.zeros((n_cells, adata.n_vars), dtype=np.float16)
 76 |     for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
 77 |         data = adata_X[i, :].toarray().flatten()
 78 |         ranks[i, :] = rankdata(data, method="average")
 79 | 
 80 |     gM = gmean(ranks, axis=0).reshape(-1, 1)
 81 | 
 82 |     # Calculate the expression fractio
 83 |     adata_X_bool = adata.X.astype(bool)
 84 |     frac = (np.asarray(adata_X_bool.sum(axis=0)).flatten()).reshape(-1, 1)
 85 | 
 86 |     # Save to zarr group
 87 |     gmean_frac = np.concatenate([gM, frac], axis=1)
 88 |     s1_zarr = gmean_zarr_group.array(sample_name, data=gmean_frac, chunks=None, dtype="f4")
 89 |     s1_zarr.attrs["spot_number"] = adata.shape[0]
 90 | 
 91 | 
 92 | def merge_zarr_means(zarr_group_path, output_file, common_genes):
 93 |     """
 94 |     Merge all Zarr arrays into a weighted geometric mean and save to a Parquet file.
 95 |     """
 96 |     gmean_zarr_group = zarr.open(zarr_group_path, mode="a")
 97 | 
 98 |     sample_gmeans = []
 99 |     sample_weights = []
100 |     frac_sum = None
101 |     total_spot_number = 0
102 | 
103 |     # Collect all geometric means and their weights (spot numbers)
104 |     for key in tqdm(gmean_zarr_group.array_keys(), desc="Merging Zarr arrays"):
105 |         s1 = gmean_zarr_group[key]
106 |         s1_array_gmean = s1[:][:, 0]
107 |         s1_array_frac = s1[:][:, 1]
108 |         n = s1.attrs["spot_number"]
109 | 
110 |         sample_gmeans.append(s1_array_gmean)
111 |         sample_weights.append(n)
112 | 
113 |         if frac_sum is None:
114 |             frac_sum = s1_array_frac
115 |         else:
116 |             frac_sum += s1_array_frac
117 | 
118 |         total_spot_number += n
119 | 
120 |     # Convert to arrays
121 |     sample_gmeans = np.array(sample_gmeans)
122 |     sample_weights = np.array(sample_weights)
123 | 
124 |     final_gmean = gmean(sample_gmeans, axis=0, weights=sample_weights[:, np.newaxis])
125 | 
126 |     final_frac = frac_sum / total_spot_number
127 | 
128 |     # Save the final mean to a Parquet file
129 |     gene_names = common_genes
130 |     final_df = pd.DataFrame({"gene": gene_names, "G_Mean": final_gmean, "frac": final_frac})
131 |     final_df.set_index("gene", inplace=True)
132 |     final_df.to_parquet(output_file)
133 |     return final_df
134 | 
135 | 
136 | def run_create_slice_mean(config: CreateSliceMeanConfig):
137 |     """
138 |     Main entrypoint to create slice means.
139 |     Now works with a config that can accept either:
140 |     1. An h5ad_yaml file.
141 |     2. Direct lists of sample names and h5ad files.
142 |     """
143 |     h5ad_files = list(config.h5ad_dict.values())
144 | 
145 |     # Step 2: Get common genes from the h5ad files
146 |     common_genes = get_common_genes(h5ad_files, config)
147 |     logger.info(f"Found {len(common_genes)} common genes across all files.")
148 | 
149 |     # Step 3: Initialize the Zarr group
150 |     zarr_group_path = config.slice_mean_output_file.with_suffix(".zarr")
151 | 
152 |     for sample_name, h5ad_file in config.h5ad_dict.items():
153 |         # Step 4: Process each file to calculate the slice means
154 |         if zarr_group_path.exists():
155 |             zarr_group = zarr.open(zarr_group_path.as_posix(), mode="r")
156 |             # Check if the slice mean for this file already exists
157 |             if sample_name in zarr_group.array_keys():
158 |                 logger.info(f"Skipping {sample_name}, already processed.")
159 |                 continue
160 | 
161 |         calculate_one_slice_mean(
162 |             sample_name, h5ad_file, common_genes, zarr_group_path, config.data_layer
163 |         )
164 | 
165 |     output_file = config.slice_mean_output_file
166 |     final_df = merge_zarr_means(zarr_group_path, output_file, common_genes)
167 | 
168 |     logger.info(f"Final slice mean and expression fraction saved to {output_file}")
169 |     return final_df
170 | 


--------------------------------------------------------------------------------
/src/gsMap/diagnosis.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import warnings
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import scanpy as sc
  8 | from scipy.stats import norm
  9 | 
 10 | from gsMap.config import DiagnosisConfig
 11 | from gsMap.utils.manhattan_plot import ManhattanPlot
 12 | from gsMap.utils.regression_read import _read_chr_files
 13 | from gsMap.visualize import draw_scatter, estimate_point_size_for_plot, load_ldsc, load_st_coord
 14 | 
 15 | warnings.filterwarnings("ignore", category=FutureWarning)
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def convert_z_to_p(gwas_data):
 20 |     """Convert Z-scores to P-values."""
 21 |     gwas_data["P"] = norm.sf(abs(gwas_data["Z"])) * 2
 22 |     min_p_value = 1e-300
 23 |     gwas_data["P"] = gwas_data["P"].clip(lower=min_p_value)
 24 |     return gwas_data
 25 | 
 26 | 
 27 | def load_gene_diagnostic_info(config: DiagnosisConfig):
 28 |     """Load or compute gene diagnostic info."""
 29 |     gene_diagnostic_info_save_path = config.get_gene_diagnostic_info_save_path(config.trait_name)
 30 |     if gene_diagnostic_info_save_path.exists():
 31 |         logger.info(
 32 |             f"Loading gene diagnostic information from {gene_diagnostic_info_save_path}..."
 33 |         )
 34 |         return pd.read_csv(gene_diagnostic_info_save_path)
 35 |     else:
 36 |         logger.info(
 37 |             "Gene diagnostic information not found. Calculating gene diagnostic information..."
 38 |         )
 39 |         return compute_gene_diagnostic_info(config)
 40 | 
 41 | 
 42 | def compute_gene_diagnostic_info(config: DiagnosisConfig):
 43 |     """Calculate gene diagnostic info and save it to adata."""
 44 |     logger.info("Loading ST data and LDSC results...")
 45 |     # adata = sc.read_h5ad(config.hdf5_with_latent_path, backed='r')
 46 |     mk_score = pd.read_feather(config.mkscore_feather_path)
 47 |     mk_score.set_index("HUMAN_GENE_SYM", inplace=True)
 48 |     mk_score = mk_score.T
 49 |     trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))
 50 | 
 51 |     # Align marker scores with trait LDSC results
 52 |     mk_score = mk_score.loc[trait_ldsc_result.index]
 53 | 
 54 |     # Filter out genes with no variation
 55 |     has_variation = (~mk_score.eq(mk_score.iloc[0], axis=1)).any()
 56 |     mk_score = mk_score.loc[:, has_variation]
 57 | 
 58 |     logger.info("Calculating correlation between gene marker scores and trait logp-values...")
 59 |     corr = mk_score.corrwith(trait_ldsc_result["logp"])
 60 |     corr.name = "PCC"
 61 | 
 62 |     grouped_mk_score = mk_score.groupby(adata.obs[config.annotation]).median()
 63 |     max_annotations = grouped_mk_score.idxmax()
 64 | 
 65 |     high_GSS_Gene_annotation_pair = pd.DataFrame(
 66 |         {
 67 |             "Gene": max_annotations.index,
 68 |             "Annotation": max_annotations.values,
 69 |             "Median_GSS": grouped_mk_score.max().values,
 70 |         }
 71 |     )
 72 | 
 73 |     high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair.merge(
 74 |         corr, left_on="Gene", right_index=True
 75 |     )
 76 | 
 77 |     # Prepare the final gene diagnostic info dataframe
 78 |     gene_diagnostic_info_cols = ["Gene", "Annotation", "Median_GSS", "PCC"]
 79 |     gene_diagnostic_info = (
 80 |         high_GSS_Gene_annotation_pair[gene_diagnostic_info_cols]
 81 |         .drop_duplicates()
 82 |         .dropna(subset=["Gene"])
 83 |     )
 84 |     gene_diagnostic_info.sort_values("PCC", ascending=False, inplace=True)
 85 | 
 86 |     # Save gene diagnostic info to a file
 87 |     gene_diagnostic_info_save_path = config.get_gene_diagnostic_info_save_path(config.trait_name)
 88 |     gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
 89 |     logger.info(f"Gene diagnostic information saved to {gene_diagnostic_info_save_path}.")
 90 | 
 91 |     return gene_diagnostic_info.reset_index()
 92 | 
 93 | 
 94 | def load_gwas_data(config: DiagnosisConfig):
 95 |     """Load and process GWAS data."""
 96 |     logger.info("Loading and processing GWAS data...")
 97 |     gwas_data = pd.read_csv(config.sumstats_file, compression="gzip", sep="\t")
 98 |     return convert_z_to_p(gwas_data)
 99 | 
100 | 
101 | def load_snp_gene_pairs(config: DiagnosisConfig):
102 |     """Load SNP-gene pairs from multiple chromosomes."""
103 |     ldscore_save_dir = Path(config.ldscore_save_dir)
104 |     snp_gene_pair_file_prefix = ldscore_save_dir / "SNP_gene_pair/SNP_gene_pair_chr"
105 |     return pd.concat(
106 |         [
107 |             pd.read_feather(file)
108 |             for file in _read_chr_files(snp_gene_pair_file_prefix.as_posix(), suffix=".feather")
109 |         ]
110 |     )
111 | 
112 | 
113 | def filter_snps(gwas_data_with_gene_annotation_sort, SUBSAMPLE_SNP_NUMBER):
114 |     """Filter the SNPs based on significance levels."""
115 |     pass_suggestive_line_mask = gwas_data_with_gene_annotation_sort["P"] < 1e-5
116 |     pass_suggestive_line_number = pass_suggestive_line_mask.sum()
117 | 
118 |     if pass_suggestive_line_number > SUBSAMPLE_SNP_NUMBER:
119 |         snps2plot = gwas_data_with_gene_annotation_sort[pass_suggestive_line_mask].SNP
120 |         logger.info(
121 |             f"To reduce the number of SNPs to plot, only {snps2plot.shape[0]} SNPs with P < 1e-5 are plotted."
122 |         )
123 |     else:
124 |         snps2plot = gwas_data_with_gene_annotation_sort.head(SUBSAMPLE_SNP_NUMBER).SNP
125 |         logger.info(
126 |             f"To reduce the number of SNPs to plot, only {SUBSAMPLE_SNP_NUMBER} SNPs with the smallest P-values are plotted."
127 |         )
128 | 
129 |     return snps2plot
130 | 
131 | 
132 | def generate_manhattan_plot(config: DiagnosisConfig):
133 |     """Generate Manhattan plot."""
134 |     # report_save_dir = config.get_report_dir(config.trait_name)
135 |     gwas_data = load_gwas_data(config)
136 |     snp_gene_pair = load_snp_gene_pairs(config)
137 |     gwas_data_with_gene = snp_gene_pair.merge(gwas_data, on="SNP", how="inner").rename(
138 |         columns={"gene_name": "GENE"}
139 |     )
140 |     gene_diagnostic_info = load_gene_diagnostic_info(config)
141 |     gwas_data_with_gene_annotation = gwas_data_with_gene.merge(
142 |         gene_diagnostic_info, left_on="GENE", right_on="Gene", how="left"
143 |     )
144 | 
145 |     gwas_data_with_gene_annotation = gwas_data_with_gene_annotation[
146 |         ~gwas_data_with_gene_annotation["Annotation"].isna()
147 |     ]
148 |     gwas_data_with_gene_annotation_sort = gwas_data_with_gene_annotation.sort_values("P")
149 | 
150 |     snps2plot = filter_snps(gwas_data_with_gene_annotation_sort, SUBSAMPLE_SNP_NUMBER=100_000)
151 |     gwas_data_to_plot = gwas_data_with_gene_annotation[
152 |         gwas_data_with_gene_annotation["SNP"].isin(snps2plot)
153 |     ].reset_index(drop=True)
154 |     gwas_data_to_plot["Annotation_text"] = (
155 |         "PCC: "
156 |         + gwas_data_to_plot["PCC"].round(2).astype(str)
157 |         + "<br>"
158 |         + "Annotation: "
159 |         + gwas_data_to_plot["Annotation"].astype(str)
160 |     )
161 | 
162 |     # Verify data integrity
163 |     if gwas_data_with_gene_annotation_sort.empty:
164 |         logger.error("Filtered GWAS data is empty, cannot create Manhattan plot")
165 |         return
166 | 
167 |     if len(gwas_data_to_plot) == 0:
168 |         logger.error("No SNPs passed filtering criteria for Manhattan plot")
169 |         return
170 | 
171 |     # Log some diagnostic information
172 |     logger.info(f"Creating Manhattan plot with {len(gwas_data_to_plot)} SNPs")
173 |     logger.info(f"Chromosome column values: {gwas_data_to_plot['CHR'].unique()}")
174 | 
175 |     fig = ManhattanPlot(
176 |         dataframe=gwas_data_to_plot,
177 |         title="gsMap Diagnosis Manhattan Plot",
178 |         point_size=3,
179 |         highlight_gene_list=config.selected_genes
180 |         or gene_diagnostic_info.Gene.iloc[: config.top_corr_genes].tolist(),
181 |         suggestiveline_value=-np.log10(1e-5),
182 |         annotation="Annotation_text",
183 |     )
184 | 
185 |     save_manhattan_plot_path = config.get_manhattan_html_plot_path(config.trait_name)
186 |     fig.write_html(save_manhattan_plot_path)
187 |     logger.info(f"Diagnostic Manhattan Plot saved to {save_manhattan_plot_path}.")
188 | 
189 | 
190 | def generate_GSS_distribution(config: DiagnosisConfig):
191 |     """Generate GSS distribution plots."""
192 |     # logger.info('Loading ST data...')
193 |     # adata = sc.read_h5ad(config.hdf5_with_latent_path)
194 |     mk_score = pd.read_feather(config.mkscore_feather_path).set_index("HUMAN_GENE_SYM").T
195 | 
196 |     plot_genes = (
197 |         config.selected_genes
198 |         or load_gene_diagnostic_info(config).Gene.iloc[: config.top_corr_genes].tolist()
199 |     )
200 |     if config.selected_genes is not None:
201 |         logger.info(
202 |             f"Generating GSS & Expression distribution plot for selected genes: {plot_genes}..."
203 |         )
204 |     else:
205 |         logger.info(
206 |             f"Generating GSS & Expression distribution plot for top {config.top_corr_genes} correlated genes..."
207 |         )
208 | 
209 |     if config.customize_fig:
210 |         pixel_width, pixel_height, point_size = (
211 |             config.fig_width,
212 |             config.fig_height,
213 |             config.point_size,
214 |         )
215 |     else:
216 |         (pixel_width, pixel_height), point_size = estimate_point_size_for_plot(
217 |             adata.obsm["spatial"]
218 |         )
219 |     sub_fig_save_dir = config.get_GSS_plot_dir(config.trait_name)
220 | 
221 |     # save plot gene list
222 |     config.get_GSS_plot_select_gene_file(config.trait_name).write_text("\n".join(plot_genes))
223 | 
224 |     for selected_gene in plot_genes:
225 |         expression_series = pd.Series(
226 |             adata[:, selected_gene].X.toarray().flatten(), index=adata.obs.index, name="Expression"
227 |         )
228 |         threshold = np.quantile(expression_series, 0.9999)
229 |         expression_series[expression_series > threshold] = threshold
230 |         generate_and_save_plots(
231 |             adata,
232 |             mk_score,
233 |             expression_series,
234 |             selected_gene,
235 |             point_size,
236 |             pixel_width,
237 |             pixel_height,
238 |             sub_fig_save_dir,
239 |             config.sample_name,
240 |             config.annotation,
241 |         )
242 | 
243 | 
244 | def generate_and_save_plots(
245 |     adata,
246 |     mk_score,
247 |     expression_series,
248 |     selected_gene,
249 |     point_size,
250 |     pixel_width,
251 |     pixel_height,
252 |     sub_fig_save_dir,
253 |     sample_name,
254 |     annotation,
255 | ):
256 |     """Generate and save the plots."""
257 |     select_gene_expression_with_space_coord = load_st_coord(adata, expression_series, annotation)
258 |     sub_fig_1 = draw_scatter(
259 |         select_gene_expression_with_space_coord,
260 |         title=f"{selected_gene} (Expression)",
261 |         annotation="annotation",
262 |         color_by="Expression",
263 |         point_size=point_size,
264 |         width=pixel_width,
265 |         height=pixel_height,
266 |     )
267 |     save_plot(sub_fig_1, sub_fig_save_dir, sample_name, selected_gene, "Expression")
268 | 
269 |     select_gene_GSS_with_space_coord = load_st_coord(
270 |         adata, mk_score[selected_gene].rename("GSS"), annotation
271 |     )
272 |     sub_fig_2 = draw_scatter(
273 |         select_gene_GSS_with_space_coord,
274 |         title=f"{selected_gene} (GSS)",
275 |         annotation="annotation",
276 |         color_by="GSS",
277 |         point_size=point_size,
278 |         width=pixel_width,
279 |         height=pixel_height,
280 |     )
281 |     save_plot(sub_fig_2, sub_fig_save_dir, sample_name, selected_gene, "GSS")
282 | 
283 |     # combined_fig = make_subplots(rows=1, cols=2,
284 |     #                              subplot_titles=(f'{selected_gene} (Expression)', f'{selected_gene} (GSS)'))
285 |     # for trace in sub_fig_1.data:
286 |     #     combined_fig.add_trace(trace, row=1, col=1)
287 |     # for trace in sub_fig_2.data:
288 |     #     combined_fig.add_trace(trace, row=1, col=2)
289 |     #
290 | 
291 | 
292 | def save_plot(sub_fig, sub_fig_save_dir, sample_name, selected_gene, plot_type):
293 |     """Save the plot to HTML and PNG."""
294 |     save_sub_fig_path = (
295 |         sub_fig_save_dir / f"{sample_name}_{selected_gene}_{plot_type}_Distribution.png"
296 |     )
297 |     # sub_fig.write_html(str(save_sub_fig_path))
298 |     sub_fig.update_layout(showlegend=False)
299 |     sub_fig.write_image(save_sub_fig_path)
300 |     assert save_sub_fig_path.exists(), f"Failed to save {plot_type} plot for {selected_gene}."
301 | 
302 | 
303 | def generate_gsMap_plot(config: DiagnosisConfig):
304 |     """Generate gsMap plot."""
305 |     logger.info("Creating gsMap plot...")
306 | 
307 |     trait_ldsc_result = load_ldsc(config.get_ldsc_result_file(config.trait_name))
308 |     space_coord_concat = load_st_coord(adata, trait_ldsc_result, annotation=config.annotation)
309 | 
310 |     if config.customize_fig:
311 |         pixel_width, pixel_height, point_size = (
312 |             config.fig_width,
313 |             config.fig_height,
314 |             config.point_size,
315 |         )
316 |     else:
317 |         (pixel_width, pixel_height), point_size = estimate_point_size_for_plot(
318 |             adata.obsm["spatial"]
319 |         )
320 |     fig = draw_scatter(
321 |         space_coord_concat,
322 |         title=f"{config.trait_name} (gsMap)",
323 |         point_size=point_size,
324 |         width=pixel_width,
325 |         height=pixel_height,
326 |         annotation=config.annotation,
327 |     )
328 | 
329 |     output_dir = config.get_gsMap_plot_save_dir(config.trait_name)
330 |     output_file_html = config.get_gsMap_html_plot_save_path(config.trait_name)
331 |     output_file_png = output_file_html.with_suffix(".png")
332 |     output_file_csv = output_file_html.with_suffix(".csv")
333 | 
334 |     fig.write_html(output_file_html)
335 |     fig.write_image(output_file_png)
336 |     space_coord_concat.to_csv(output_file_csv)
337 | 
338 |     logger.info(f"gsMap plot created and saved in {output_dir}.")
339 | 
340 | 
341 | def run_Diagnosis(config: DiagnosisConfig):
342 |     """Main function to run the diagnostic plot generation."""
343 |     global adata
344 |     adata = sc.read_h5ad(config.hdf5_with_latent_path)
345 |     if "log1p" not in adata.uns.keys() and adata.X.max() > 14:
346 |         sc.pp.normalize_total(adata, target_sum=1e4)
347 |         sc.pp.log1p(adata)
348 | 
349 |     if config.plot_type in ["gsMap", "all"]:
350 |         generate_gsMap_plot(config)
351 |     if config.plot_type in ["manhattan", "all"]:
352 |         generate_manhattan_plot(config)
353 |     if config.plot_type in ["GSS", "all"]:
354 |         generate_GSS_distribution(config)
355 | 


--------------------------------------------------------------------------------
/src/gsMap/find_latent_representation.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import random
  3 | 
  4 | import numpy as np
  5 | import scanpy as sc
  6 | import torch
  7 | from sklearn.decomposition import PCA
  8 | from sklearn.preprocessing import LabelEncoder
  9 | 
 10 | from gsMap.config import FindLatentRepresentationsConfig
 11 | from gsMap.GNN.adjacency_matrix import construct_adjacency_matrix
 12 | from gsMap.GNN.train import ModelTrainer
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | 
 17 | def set_seed(seed_value):
 18 |     """
 19 |     Set seed for reproducibility in PyTorch and other libraries.
 20 |     """
 21 |     torch.manual_seed(seed_value)
 22 |     np.random.seed(seed_value)
 23 |     random.seed(seed_value)
 24 |     if torch.cuda.is_available():
 25 |         logger.info("Using GPU for computations.")
 26 |         torch.cuda.manual_seed(seed_value)
 27 |         torch.cuda.manual_seed_all(seed_value)
 28 |     else:
 29 |         logger.info("Using CPU for computations.")
 30 | 
 31 | 
 32 | def preprocess_data(adata, params):
 33 |     """
 34 |     Preprocess the AnnData
 35 |     """
 36 |     logger.info("Preprocessing data...")
 37 |     adata.var_names_make_unique()
 38 | 
 39 |     if params.data_layer in adata.layers.keys():
 40 |         logger.info(f"Using data layer: {params.data_layer}...")
 41 |         adata.X = adata.layers[params.data_layer].copy()
 42 |     elif params.data_layer == "X":
 43 |         logger.info(f"Using data layer: {params.data_layer}...")
 44 |         if adata.X.dtype == "float32" or adata.X.dtype == "float64":
 45 |             logger.warning("The data layer should be raw count data")
 46 |     else:
 47 |         raise ValueError(f"Invalid data layer: {params.data_layer}, please check the input data.")
 48 | 
 49 |     if params.data_layer in ["count", "counts", "X"]:
 50 |         # HVGs based on count
 51 |         logger.info("Dealing with count data...")
 52 |         sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=params.feat_cell)
 53 | 
 54 |         # Get the pearson residuals
 55 |         if params.pearson_residuals:
 56 |             sc.experimental.pp.normalize_pearson_residuals(adata, inplace=False)
 57 |             pearson_residuals = sc.experimental.pp.normalize_pearson_residuals(
 58 |                 adata, inplace=False, clip=10
 59 |             )
 60 |             adata.layers["pearson_residuals"] = pearson_residuals["X"]
 61 | 
 62 |         # Normalize the data
 63 |         sc.pp.normalize_total(adata, target_sum=1e4)
 64 |         sc.pp.log1p(adata)
 65 | 
 66 |     elif params.data_layer in adata.layers.keys():
 67 |         sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=params.feat_cell)
 68 | 
 69 |     return adata
 70 | 
 71 | 
 72 | class LatentRepresentationFinder:
 73 |     def __init__(self, adata, args: FindLatentRepresentationsConfig):
 74 |         self.params = args
 75 | 
 76 |         if "pearson_residuals" in adata.layers:
 77 |             self.expression_array = (
 78 |                 adata[:, adata.var.highly_variable].layers["pearson_residuals"].copy()
 79 |             )
 80 |         else:
 81 |             self.expression_array = adata[:, adata.var.highly_variable].X.copy()
 82 |             self.expression_array = sc.pp.scale(self.expression_array, max_value=10)
 83 | 
 84 |         # Construct the neighboring graph
 85 |         self.graph_dict = construct_adjacency_matrix(adata, self.params)
 86 | 
 87 |     def compute_pca(self):
 88 |         self.latent_pca = PCA(n_components=self.params.n_comps).fit_transform(
 89 |             self.expression_array
 90 |         )
 91 |         return self.latent_pca
 92 | 
 93 |     def run_gnn_vae(self, label, verbose="whole ST data"):
 94 |         # Use PCA if specified
 95 |         if self.params.input_pca:
 96 |             node_X = self.compute_pca()
 97 |         else:
 98 |             node_X = self.expression_array
 99 | 
100 |         # Update the input shape
101 |         self.params.n_nodes = node_X.shape[0]
102 |         self.params.feat_cell = node_X.shape[1]
103 | 
104 |         # Run GNN
105 |         logger.info(f"Finding latent representations for {verbose}...")
106 |         gvae = ModelTrainer(node_X, self.graph_dict, self.params, label)
107 |         gvae.run_train()
108 | 
109 |         del self.graph_dict
110 | 
111 |         return gvae.get_latent()
112 | 
113 | 
114 | def run_find_latent_representation(args: FindLatentRepresentationsConfig):
115 |     set_seed(2024)
116 | 
117 |     # Load the ST data
118 |     logger.info(f"Loading ST data of {args.sample_name}...")
119 |     adata = sc.read_h5ad(args.input_hdf5_path)
120 |     sc.pp.filter_genes(adata, min_cells=1)
121 | 
122 |     logger.info(f"The ST data contains {adata.shape[0]} cells, {adata.shape[1]} genes.")
123 | 
124 |     # Load the cell type annotation
125 |     if args.annotation is not None:
126 |         # Remove cells without enough annotations
127 |         adata = adata[~adata.obs[args.annotation].isnull()]
128 |         num = adata.obs[args.annotation].value_counts()
129 |         valid_annotations = num[num >= 30].index.to_list()
130 |         adata = adata[adata.obs[args.annotation].isin(valid_annotations)]
131 | 
132 |         le = LabelEncoder()
133 |         label = le.fit_transform(adata.obs[args.annotation])
134 |     else:
135 |         label = None
136 | 
137 |     # Preprocess data
138 |     adata = preprocess_data(adata, args)
139 | 
140 |     latent_rep = LatentRepresentationFinder(adata, args)
141 |     latent_gvae = latent_rep.run_gnn_vae(label)
142 |     latent_pca = latent_rep.latent_pca
143 | 
144 |     # Add latent representations to the AnnData object
145 |     logger.info("Adding latent representations...")
146 |     adata.obsm["latent_GVAE"] = latent_gvae
147 |     adata.obsm["latent_PCA"] = latent_pca
148 | 
149 |     # Run UMAP based on latent representations
150 |     # for name in ['latent_GVAE', 'latent_PCA']:
151 |     #    sc.pp.neighbors(adata, n_neighbors=10, use_rep=name)
152 |     #    sc.tl.umap(adata)
153 |     #    adata.obsm['X_umap_' + name] = adata.obsm['X_umap']
154 | 
155 |     # Save the AnnData object
156 |     logger.info("Saving ST data...")
157 |     adata.write(args.hdf5_with_latent_path)
158 | 


--------------------------------------------------------------------------------
/src/gsMap/format_sumstats.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import math
  3 | import re
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from scipy.stats import chi2
  8 | 
  9 | from gsMap.config import FormatSumstatsConfig
 10 | 
 11 | VALID_SNPS = {"AC", "AG", "CA", "CT", "GA", "GT", "TC", "TG"}
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | default_cnames = {
 15 |     # RS NUMBER
 16 |     "SNP": "SNP",
 17 |     "RS": "SNP",
 18 |     "RSID": "SNP",
 19 |     "RS_NUMBER": "SNP",
 20 |     "RS_NUMBERS": "SNP",
 21 |     # P-VALUE
 22 |     "P": "P",
 23 |     "PVALUE": "P",
 24 |     "P_VALUE": "P",
 25 |     "PVAL": "P",
 26 |     "P_VAL": "P",
 27 |     "GC_PVALUE": "P",
 28 |     "p": "P",
 29 |     # EFFECT_ALLELE (A1)
 30 |     "A1": "A1",
 31 |     "ALLELE1": "A1",
 32 |     "ALLELE_1": "A1",
 33 |     "EFFECT_ALLELE": "A1",
 34 |     "REFERENCE_ALLELE": "A1",
 35 |     "INC_ALLELE": "A1",
 36 |     "EA": "A1",
 37 |     # NON_EFFECT_ALLELE (A2)
 38 |     "A2": "A2",
 39 |     "ALLELE2": "A2",
 40 |     "ALLELE_2": "A2",
 41 |     "OTHER_ALLELE": "A2",
 42 |     "NON_EFFECT_ALLELE": "A2",
 43 |     "DEC_ALLELE": "A2",
 44 |     "NEA": "A2",
 45 |     # N
 46 |     "N": "N",
 47 |     "NCASE": "N_CAS",
 48 |     "CASES_N": "N_CAS",
 49 |     "N_CASE": "N_CAS",
 50 |     "N_CASES": "N_CAS",
 51 |     "N_CONTROLS": "N_CON",
 52 |     "N_CAS": "N_CAS",
 53 |     "N_CON": "N_CON",
 54 |     "NCONTROL": "N_CON",
 55 |     "CONTROLS_N": "N_CON",
 56 |     "N_CONTROL": "N_CON",
 57 |     "WEIGHT": "N",
 58 |     # SIGNED STATISTICS
 59 |     "ZSCORE": "Z",
 60 |     "Z-SCORE": "Z",
 61 |     "GC_ZSCORE": "Z",
 62 |     "Z": "Z",
 63 |     "OR": "OR",
 64 |     "B": "BETA",
 65 |     "BETA": "BETA",
 66 |     "LOG_ODDS": "LOG_ODDS",
 67 |     "EFFECTS": "BETA",
 68 |     "EFFECT": "BETA",
 69 |     "b": "BETA",
 70 |     "beta": "BETA",
 71 |     # SE
 72 |     "se": "SE",
 73 |     # INFO
 74 |     "INFO": "INFO",
 75 |     "Info": "INFO",
 76 |     # MAF
 77 |     "EAF": "FRQ",
 78 |     "FRQ": "FRQ",
 79 |     "MAF": "FRQ",
 80 |     "FRQ_U": "FRQ",
 81 |     "F_U": "FRQ",
 82 |     "frq_A1": "FRQ",
 83 |     "frq": "FRQ",
 84 |     "freq": "FRQ",
 85 | }
 86 | 
 87 | 
 88 | def get_compression(fh):
 89 |     """
 90 |     Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed
 91 |     """
 92 |     if fh.endswith("gz"):
 93 |         compression = "gzip"
 94 |     elif fh.endswith("bz2"):
 95 |         compression = "bz2"
 96 |     else:
 97 |         compression = None
 98 | 
 99 |     return compression
100 | 
101 | 
102 | def gwas_checkname(gwas, config):
103 |     """
104 |     Iterpret column names of gwas
105 |     """
106 |     old_name = gwas.columns
107 |     mapped_cnames = {}
108 |     for col in gwas.columns:
109 |         mapped_cnames[col] = default_cnames.get(col, col)
110 |     gwas.columns = list(mapped_cnames.values())
111 | 
112 |     # When column names are provided by users
113 |     name_updates = {
114 |         "SNP": config.snp,
115 |         "A1": config.a1,
116 |         "A2": config.a2,
117 |         "INFO": config.info,
118 |         "BETA": config.beta,
119 |         "SE": config.se,
120 |         "P": config.p,
121 |         "FRQ": config.frq,
122 |         "N": config.n,
123 |         "Z": config.z,
124 |         "Chr": config.chr,
125 |         "Pos": config.pos,
126 |         "OR": config.OR,
127 |         "SE_OR": config.se_OR,
128 |     }
129 | 
130 |     for key, value in name_updates.items():
131 |         if value is not None and value in gwas.columns:
132 |             gwas.rename(columns={value: key}, inplace=True)
133 |     new_name = gwas.columns
134 |     # check the name duplication
135 |     for head in new_name:
136 |         numc = list(new_name).count(head)
137 |         if numc > 1:
138 |             raise ValueError(
139 |                 f"Found {numc} different {head} columns, please check your {head} column."
140 |             )
141 | 
142 |     name_dict = {new_name[i]: old_name[i] for i in range(len(new_name))}
143 | 
144 |     # When at OR scale
145 |     if "OR" in new_name and "SE_OR" in new_name:
146 |         gwas["BETA"] = gwas.OR.apply(lambda x: math.log(x) if x > 0 else None)
147 |         gwas["SE"] = gwas.SE_OR.apply(lambda x: math.log(x) if x > 0 else None)
148 | 
149 |     interpreting = {
150 |         "SNP": "Variant ID (e.g., rs number).",
151 |         "A1": "Allele 1, interpreted as the effect allele for signed sumstat.",
152 |         "A2": "Allele 2, interpreted as the non-effect allele for signed sumstat.",
153 |         "BETA": "[linear/logistic] regression coefficient (0 → no effect; above 0 → A1 is trait/risk increasing).",
154 |         "SE": "Standard error of the regression coefficient.",
155 |         "OR": "Odds ratio, will be transferred to linear scale.",
156 |         "SE_OR": "Standard error of the odds ratio, will be transferred to linear scale.",
157 |         "P": "P-Value.",
158 |         "Z": "Z-Value.",
159 |         "N": "Sample size.",
160 |         "INFO": "INFO score (imputation quality; higher → better imputation).",
161 |         "FRQ": "Allele frequency of A1.",
162 |         "Chr": "Chromsome.",
163 |         "Pos": "SNP positions.",
164 |     }
165 | 
166 |     logger.info("\nIterpreting column names as follows:")
167 |     for key, _value in interpreting.items():
168 |         if key in new_name:
169 |             logger.info(f"{name_dict[key]}: {interpreting[key]}")
170 | 
171 |     return gwas
172 | 
173 | 
174 | def gwas_checkformat(gwas, config):
175 |     """
176 |     Check column names required for different format
177 |     """
178 |     if config.format == "gsMap":
179 |         condition1 = np.any(np.isin(["P", "Z"], gwas.columns))
180 |         condition2 = np.all(np.isin(["BETA", "SE"], gwas.columns))
181 |         if not (condition1 or condition2):
182 |             raise ValueError(
183 |                 "To munge GWAS data into gsMap format, either P or Z values, or both BETA and SE values, are required."
184 |             )
185 |         else:
186 |             if "Z" in gwas.columns:
187 |                 pass
188 |             elif "P" in gwas.columns:
189 |                 gwas["Z"] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas["BETA"] < 0, -1, 1)
190 |             else:
191 |                 gwas["Z"] = gwas.BETA / gwas.SE
192 | 
193 |     elif config.format == "COJO":
194 |         condition = np.all(np.isin(["A1", "A2", "FRQ", "BETA", "SE", "P", "N"], gwas.columns))
195 |         if not condition:
196 |             raise ValueError(
197 |                 "To munge GWAS data into COJO format, either A1|A2|FRQ|BETA|SE|P|N, are required."
198 |             )
199 |         else:
200 |             gwas["Z"] = np.sqrt(chi2.isf(gwas.P, 1)) * np.where(gwas["BETA"] < 0, -1, 1)
201 | 
202 |     return gwas
203 | 
204 | 
205 | def filter_info(info, config):
206 |     """Remove INFO < args.info_min (default 0.9) and complain about out-of-bounds INFO."""
207 |     if type(info) is pd.Series:  # one INFO column
208 |         jj = ((info > 2.0) | (info < 0)) & info.notnull()
209 |         ii = info >= config.info_min
210 |     elif type(info) is pd.DataFrame:  # several INFO columns
211 |         jj = ((info > 2.0) & info.notnull()).any(axis=1) | ((info < 0) & info.notnull()).any(
212 |             axis=1
213 |         )
214 |         ii = info.sum(axis=1) >= config.info_min * (len(info.columns))
215 |     else:
216 |         raise ValueError("Expected pd.DataFrame or pd.Series.")
217 | 
218 |     bad_info = jj.sum()
219 |     if bad_info > 0:
220 |         msg = "WARNING: {N} SNPs had INFO outside of [0,1.5]. The INFO column may be mislabeled."
221 |         logger.warning(msg.format(N=bad_info))
222 | 
223 |     return ii
224 | 
225 | 
226 | def filter_frq(frq, config):
227 |     """
228 |     Filter on MAF. Remove MAF < args.maf_min and out-of-bounds MAF.
229 |     """
230 |     jj = (frq < 0) | (frq > 1)
231 |     bad_frq = jj.sum()
232 |     if bad_frq > 0:
233 |         msg = "WARNING: {N} SNPs had FRQ outside of [0,1]. The FRQ column may be mislabeled."
234 |         logger.warning(msg.format(N=bad_frq))
235 | 
236 |     frq = np.minimum(frq, 1 - frq)
237 |     ii = frq > config.maf_min
238 |     return ii & ~jj
239 | 
240 | 
241 | def filter_pvals(P, config):
242 |     """Remove out-of-bounds P-values"""
243 |     ii = (P > 0) & (P <= 1)
244 |     bad_p = (~ii).sum()
245 |     if bad_p > 0:
246 |         msg = "WARNING: {N} SNPs had P outside of (0,1]. The P column may be mislabeled."
247 |         logger.warning(msg.format(N=bad_p))
248 | 
249 |     return ii
250 | 
251 | 
252 | def filter_alleles(a):
253 |     """Remove alleles that do not describe strand-unambiguous SNPs"""
254 |     return a.isin(VALID_SNPS)
255 | 
256 | 
257 | def gwas_qc(gwas, config):
258 |     """
259 |     Filter out SNPs based on INFO, FRQ, MAF, N, and Genotypes.
260 |     """
261 |     old = len(gwas)
262 |     logger.info("\nFiltering SNPs as follows:")
263 |     # filter: SNPs with missing values
264 |     drops = {"NA": 0, "P": 0, "INFO": 0, "FRQ": 0, "A": 0, "SNP": 0, "Dup": 0, "N": 0}
265 | 
266 |     gwas = gwas.dropna(
267 |         axis=0, how="any", subset=filter(lambda x: x != "INFO", gwas.columns)
268 |     ).reset_index(drop=True)
269 | 
270 |     drops["NA"] = old - len(gwas)
271 |     logger.info(f"Removed {drops['NA']} SNPs with missing values.")
272 | 
273 |     # filter: SNPs with Info < 0.9
274 |     if "INFO" in gwas.columns:
275 |         old = len(gwas)
276 |         gwas = gwas.loc[filter_info(gwas["INFO"], config)]
277 |         drops["INFO"] = old - len(gwas)
278 |         logger.info(f"Removed {drops['INFO']} SNPs with INFO <= 0.9.")
279 | 
280 |     # filter: SNPs with MAF <= 0.01
281 |     if "FRQ" in gwas.columns:
282 |         old = len(gwas)
283 |         gwas = gwas.loc[filter_frq(gwas["FRQ"], config)]
284 |         drops["FRQ"] += old - len(gwas)
285 |         logger.info(f"Removed {drops['FRQ']} SNPs with MAF <= 0.01.")
286 | 
287 |     # filter: P-value that out-of-bounds [0,1]
288 |     if "P" in gwas.columns:
289 |         old = len(gwas)
290 |         gwas = gwas.loc[filter_pvals(gwas["P"], config)]
291 |         drops["P"] += old - len(gwas)
292 |         logger.info(f"Removed {drops['P']} SNPs with out-of-bounds p-values.")
293 | 
294 |     # filter: Variants that are strand-ambiguous
295 |     if "A1" in gwas.columns and "A2" in gwas.columns:
296 |         gwas.A1 = gwas.A1.str.upper()
297 |         gwas.A2 = gwas.A2.str.upper()
298 |         gwas = gwas.loc[filter_alleles(gwas.A1 + gwas.A2)]
299 |         drops["A"] += old - len(gwas)
300 |         logger.info(f"Removed {drops['A']} variants that were not SNPs or were strand-ambiguous.")
301 | 
302 |     # filter: Duplicated rs numbers
303 |     if "SNP" in gwas.columns:
304 |         old = len(gwas)
305 |         gwas = gwas.drop_duplicates(subset="SNP").reset_index(drop=True)
306 |         drops["Dup"] += old - len(gwas)
307 |         logger.info(f"Removed {drops['Dup']} SNPs with duplicated rs numbers.")
308 | 
309 |     # filter:Sample size
310 |     n_min = gwas.N.quantile(0.9) / 1.5
311 |     old = len(gwas)
312 |     gwas = gwas[gwas.N >= n_min].reset_index(drop=True)
313 |     drops["N"] += old - len(gwas)
314 |     logger.info(f"Removed {drops['N']} SNPs with N < {n_min}.")
315 | 
316 |     return gwas
317 | 
318 | 
319 | def variant_to_rsid(gwas, config):
320 |     """
321 |     Convert variant id (Chr, Pos) to rsid
322 |     """
323 |     logger.info("\nConverting the SNP position to rsid. This process may take some time.")
324 |     unique_ids = set(gwas["id"])
325 |     chr_format = gwas["Chr"].unique().astype(str)
326 |     chr_format = [re.sub(r"\d+", "", value) for value in chr_format][1]
327 | 
328 |     dtype = {"chr": str, "pos": str, "ref": str, "alt": str, "dbsnp": str}
329 |     chunk_iter = pd.read_csv(
330 |         config.dbsnp,
331 |         chunksize=config.chunksize,
332 |         sep="\t",
333 |         skiprows=1,
334 |         dtype=dtype,
335 |         names=["chr", "pos", "ref", "alt", "dbsnp"],
336 |     )
337 | 
338 |     # Iterate over chunks
339 |     matching_id = pd.DataFrame()
340 |     for chunk in chunk_iter:
341 |         chunk["id"] = chr_format + chunk["chr"] + "_" + chunk["pos"]
342 |         matching_id = pd.concat(
343 |             [matching_id, chunk[chunk["id"].isin(unique_ids)][["dbsnp", "id"]]]
344 |         )
345 | 
346 |     matching_id = matching_id.drop_duplicates(subset="dbsnp").reset_index(drop=True)
347 |     matching_id = matching_id.drop_duplicates(subset="id").reset_index(drop=True)
348 |     matching_id.index = matching_id.id
349 |     return matching_id
350 | 
351 | 
352 | def clean_SNP_id(gwas, config):
353 |     """
354 |     Clean SNP id
355 |     """
356 |     old = len(gwas)
357 |     condition1 = "SNP" in gwas.columns
358 |     condition2 = np.all(np.isin(["Chr", "Pos"], gwas.columns))
359 | 
360 |     if not (condition1 or condition2):
361 |         raise ValueError("Either SNP rsid, or both SNP chromosome and position, are required.")
362 |     elif condition1:
363 |         pass
364 |     elif condition2:
365 |         if config.dbsnp is None:
366 |             raise ValueError("To Convert SNP positions to rsid, dbsnp reference is required.")
367 |         else:
368 |             gwas["id"] = gwas["Chr"].astype(str) + "_" + gwas["Pos"].astype(str)
369 |             gwas = gwas.drop_duplicates(subset="id").reset_index(drop=True)
370 |             gwas.index = gwas.id
371 | 
372 |             matching_id = variant_to_rsid(gwas, config)
373 |             gwas = gwas.loc[matching_id.id]
374 |             gwas["SNP"] = matching_id.dbsnp
375 |             num_fail = old - len(gwas)
376 |             logger.info(f"Removed {num_fail} SNPs that did not convert to rsid.")
377 | 
378 |     return gwas
379 | 
380 | 
381 | def gwas_metadata(gwas, config):
382 |     """
383 |     Report key features of GWAS data
384 |     """
385 |     logger.info("\nSummary of GWAS data:")
386 |     CHISQ = gwas.Z**2
387 |     mean_chisq = CHISQ.mean()
388 |     logger.info("Mean chi^2 = " + str(round(mean_chisq, 3)))
389 |     if mean_chisq < 1.02:
390 |         logger.warning("Mean chi^2 may be too small.")
391 | 
392 |     logger.info("Lambda GC = " + str(round(CHISQ.median() / 0.4549, 3)))
393 |     logger.info("Max chi^2 = " + str(round(CHISQ.max(), 3)))
394 |     logger.info(
395 |         f"{(CHISQ > 29).sum()} Genome-wide significant SNPs (some may have been removed by filtering)."
396 |     )
397 | 
398 | 
399 | def gwas_format(config: FormatSumstatsConfig):
400 |     """
401 |     Format GWAS data
402 |     """
403 |     logger.info(f"------Formating gwas data for {config.sumstats}...")
404 |     compression_type = get_compression(config.sumstats)
405 |     gwas = pd.read_csv(
406 |         config.sumstats,
407 |         delim_whitespace=True,
408 |         header=0,
409 |         compression=compression_type,
410 |         na_values=[".", "NA"],
411 |     )
412 | 
413 |     if isinstance(config.n, int | float):
414 |         logger.info(f"Set the sample size of gwas data as {config.n}.")
415 |         gwas["N"] = config.n
416 |         config.n = "N"
417 | 
418 |     logger.info(f"Read {len(gwas)} SNPs from {config.sumstats}.")
419 | 
420 |     # Check name and format
421 |     gwas = gwas_checkname(gwas, config)
422 |     gwas = gwas_checkformat(gwas, config)
423 |     # Clean the snp id
424 |     gwas = clean_SNP_id(gwas, config)
425 |     # QC
426 |     gwas = gwas_qc(gwas, config)
427 |     # Meta
428 |     gwas_metadata(gwas, config)
429 | 
430 |     # Saving the data
431 |     if config.format == "COJO":
432 |         keep = ["SNP", "A1", "A2", "FRQ", "BETA", "SE", "P", "N"]
433 |         appendix = ".cojo"
434 |     elif config.format == "gsMap":
435 |         keep = ["SNP", "A1", "A2", "Z", "N"]
436 |         appendix = ".sumstats"
437 | 
438 |     if "Chr" in gwas.columns and "Pos" in gwas.columns and config.keep_chr_pos is True:
439 |         keep = keep + ["Chr", "Pos"]
440 | 
441 |     gwas = gwas[keep]
442 |     out_name = config.out + appendix + ".gz"
443 | 
444 |     logger.info(f"\nWriting summary statistics for {len(gwas)} SNPs to {out_name}.")
445 |     gwas.to_csv(out_name, sep="\t", index=False, float_format="%.3f", compression="gzip")
446 | 


--------------------------------------------------------------------------------
/src/gsMap/latent_to_gene.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import scanpy as sc
  7 | import scipy
  8 | from scipy.stats import gmean, rankdata
  9 | from sklearn.metrics.pairwise import cosine_similarity
 10 | from sklearn.neighbors import NearestNeighbors
 11 | from tqdm import tqdm, trange
 12 | 
 13 | from gsMap.config import LatentToGeneConfig
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | 
 17 | 
 18 | def find_neighbors(coor, num_neighbour):
 19 |     """
 20 |     Find Neighbors of each cell (based on spatial coordinates).
 21 |     """
 22 |     nbrs = NearestNeighbors(n_neighbors=num_neighbour).fit(coor)
 23 |     distances, indices = nbrs.kneighbors(coor, return_distance=True)
 24 |     cell_indices = np.arange(coor.shape[0])
 25 |     cell1 = np.repeat(cell_indices, indices.shape[1])
 26 |     cell2 = indices.flatten()
 27 |     distance = distances.flatten()
 28 |     spatial_net = pd.DataFrame({"Cell1": cell1, "Cell2": cell2, "Distance": distance})
 29 |     return spatial_net
 30 | 
 31 | 
 32 | def build_spatial_net(adata, annotation, num_neighbour):
 33 |     """
 34 |     Build spatial neighbourhood matrix for each spot (cell) based on the spatial coordinates.
 35 |     """
 36 |     logger.info("------Building spatial graph based on spatial coordinates...")
 37 | 
 38 |     coor = adata.obsm["spatial"]
 39 |     if annotation is not None:
 40 |         logger.info("Cell annotations are provided...")
 41 |         spatial_net_list = []
 42 |         # Cells with annotations
 43 |         for ct in adata.obs[annotation].dropna().unique():
 44 |             idx = np.where(adata.obs[annotation] == ct)[0]
 45 |             coor_temp = coor[idx, :]
 46 |             spatial_net_temp = find_neighbors(coor_temp, min(num_neighbour, coor_temp.shape[0]))
 47 |             # Map back to original indices
 48 |             spatial_net_temp["Cell1"] = idx[spatial_net_temp["Cell1"].values]
 49 |             spatial_net_temp["Cell2"] = idx[spatial_net_temp["Cell2"].values]
 50 |             spatial_net_list.append(spatial_net_temp)
 51 |             logger.info(f"{ct}: {coor_temp.shape[0]} cells")
 52 | 
 53 |         # Cells labeled as nan
 54 |         if pd.isnull(adata.obs[annotation]).any():
 55 |             idx_nan = np.where(pd.isnull(adata.obs[annotation]))[0]
 56 |             logger.info(f"Nan: {len(idx_nan)} cells")
 57 |             spatial_net_temp = find_neighbors(coor, num_neighbour)
 58 |             spatial_net_temp = spatial_net_temp[spatial_net_temp["Cell1"].isin(idx_nan)]
 59 |             spatial_net_list.append(spatial_net_temp)
 60 |         spatial_net = pd.concat(spatial_net_list, axis=0)
 61 |     else:
 62 |         logger.info("Cell annotations are not provided...")
 63 |         spatial_net = find_neighbors(coor, num_neighbour)
 64 | 
 65 |     return spatial_net.groupby("Cell1")["Cell2"].apply(np.array).to_dict()
 66 | 
 67 | 
 68 | def find_neighbors_regional(cell_pos, spatial_net_dict, coor_latent, config, cell_annotations):
 69 |     num_neighbour = config.num_neighbour
 70 |     annotations = config.annotation
 71 | 
 72 |     cell_use_pos = spatial_net_dict.get(cell_pos, [])
 73 |     if len(cell_use_pos) == 0:
 74 |         return []
 75 | 
 76 |     cell_latent = coor_latent[cell_pos, :].reshape(1, -1)
 77 |     neighbors_latent = coor_latent[cell_use_pos, :]
 78 |     similarity = cosine_similarity(cell_latent, neighbors_latent).reshape(-1)
 79 | 
 80 |     if annotations is not None:
 81 |         cell_annotation = cell_annotations[cell_pos]
 82 |         neighbor_annotations = cell_annotations[cell_use_pos]
 83 |         mask = neighbor_annotations == cell_annotation
 84 |         if not np.any(mask):
 85 |             return []
 86 |         similarity = similarity[mask]
 87 |         cell_use_pos = cell_use_pos[mask]
 88 | 
 89 |     if len(similarity) == 0:
 90 |         return []
 91 | 
 92 |     indices = np.argsort(-similarity)  # descending order
 93 |     top_indices = indices[:num_neighbour]
 94 |     cell_select_pos = cell_use_pos[top_indices]
 95 |     return cell_select_pos
 96 | 
 97 | 
 98 | def compute_regional_mkscore(
 99 |     cell_pos,
100 |     spatial_net_dict,
101 |     coor_latent,
102 |     config,
103 |     cell_annotations,
104 |     ranks,
105 |     frac_whole,
106 |     adata_X_bool,
107 |     pearson_residuals,
108 | ):
109 |     """
110 |     Compute gmean ranks of a region.
111 |     """
112 |     cell_select_pos = find_neighbors_regional(
113 |         cell_pos, spatial_net_dict, coor_latent, config, cell_annotations
114 |     )
115 |     if len(cell_select_pos) == 0:
116 |         return np.zeros(ranks.shape[1], dtype=np.float16)
117 | 
118 |     # Ratio of expression ranks
119 |     ranks_tg = ranks[cell_select_pos, :]
120 |     gene_ranks_region = gmean(ranks_tg, axis=0)
121 |     gene_ranks_region[gene_ranks_region <= 1] = 0
122 | 
123 |     if not config.no_expression_fraction:
124 |         # Ratio of expression fractions
125 |         frac_focal = adata_X_bool[cell_select_pos, :].sum(axis=0).A1 / len(cell_select_pos)
126 |         frac_region = frac_focal / frac_whole
127 |         frac_region[frac_region <= 1] = 0
128 |         frac_region[frac_region > 1] = 1
129 | 
130 |         # Simultaneously consider the ratio of expression fractions and ranks
131 |         gene_ranks_region = gene_ranks_region * frac_region
132 | 
133 |     mkscore = np.exp(gene_ranks_region) - 1 if not pearson_residuals else gene_ranks_region
134 | 
135 |     return mkscore.astype(np.float16, copy=False)
136 | 
137 | 
138 | def run_latent_to_gene(config: LatentToGeneConfig):
139 |     logger.info("------Loading the spatial data...")
140 |     adata = sc.read_h5ad(config.hdf5_with_latent_path)
141 |     logger.info(f"Loaded spatial data with {adata.n_obs} cells and {adata.n_vars} genes.")
142 | 
143 |     if config.annotation is not None:
144 |         logger.info(f"------Cell annotations are provided as {config.annotation}...")
145 |         initial_cell_count = adata.n_obs
146 |         adata = adata[~pd.isnull(adata.obs[config.annotation]), :]
147 |         logger.info(
148 |             f"Removed null annotations. Cells retained: {adata.n_obs} (initial: {initial_cell_count})."
149 |         )
150 | 
151 |     # Homologs transformation
152 |     if config.homolog_file is not None and config.species is not None:
153 |         species_col_name = f"{config.species}_homolog"
154 | 
155 |         # Check if homolog conversion has already been performed
156 |         if species_col_name in adata.var.columns:
157 |             logger.warning(
158 |                 f"Column '{species_col_name}' already exists in adata.var. "
159 |                 f"It appears gene names have already been converted to human gene symbols. "
160 |                 f"Skipping homolog transformation."
161 |             )
162 |         else:
163 |             logger.info(f"------Transforming the {config.species} to HUMAN_GENE_SYM...")
164 |             homologs = pd.read_csv(config.homolog_file, sep="\t")
165 |             if homologs.shape[1] != 2:
166 |                 raise ValueError(
167 |                     "Homologs file must have two columns: one for the species and one for the human gene symbol."
168 |                 )
169 | 
170 |             homologs.columns = [config.species, "HUMAN_GENE_SYM"]
171 |             homologs.set_index(config.species, inplace=True)
172 | 
173 |             # original_gene_names = adata.var_names.copy()
174 | 
175 |             # Filter genes present in homolog file
176 |             adata = adata[:, adata.var_names.isin(homologs.index)]
177 |             logger.info(f"{adata.shape[1]} genes retained after homolog transformation.")
178 |             if adata.shape[1] < 100:
179 |                 raise ValueError("Too few genes retained in ST data (<100).")
180 | 
181 |             # Create mapping table of original to human gene names
182 |             gene_mapping = pd.Series(
183 |                 homologs.loc[adata.var_names, "HUMAN_GENE_SYM"].values, index=adata.var_names
184 |             )
185 | 
186 |             # Store original species gene names in var dataframe with the suffixed column name
187 |             adata.var[species_col_name] = adata.var_names.values
188 | 
189 |             # Convert var_names to human gene symbols
190 |             adata.var_names = gene_mapping.values
191 |             adata.var.index.name = "HUMAN_GENE_SYM"
192 | 
193 |             # Remove duplicated genes after conversion
194 |             adata = adata[:, ~adata.var_names.duplicated()]
195 |             logger.info(f"{adata.shape[1]} genes retained after removing duplicates.")
196 | 
197 |     if config.annotation is not None:
198 |         cell_annotations = adata.obs[config.annotation].values
199 |         logger.info(f"Using cell annotations for {len(cell_annotations)} cells.")
200 |     else:
201 |         cell_annotations = None
202 | 
203 |     # Build the spatial graph
204 |     logger.info("------Building the spatial graph...")
205 |     spatial_net_dict = build_spatial_net(adata, config.annotation, config.num_neighbour_spatial)
206 |     logger.info("Spatial graph built successfully.")
207 | 
208 |     # Extract the latent representation
209 |     logger.info("------Extracting the latent representation...")
210 |     coor_latent = adata.obsm[config.latent_representation]
211 |     coor_latent = coor_latent.astype(np.float32)
212 |     logger.info("Latent representation extracted.")
213 | 
214 |     # Geometric mean across slices
215 |     gM = None
216 |     frac_whole = None
217 |     if config.gM_slices is not None:
218 |         logger.info("Geometrical mean across multiple slices is provided.")
219 |         gM_df = pd.read_parquet(config.gM_slices)
220 |         if config.species is not None:
221 |             homologs = pd.read_csv(config.homolog_file, sep="\t")
222 |             if homologs.shape[1] < 2:
223 |                 raise ValueError(
224 |                     "Homologs file must have at least two columns: one for the species and one for the human gene symbol."
225 |                 )
226 |             homologs.columns = [config.species, "HUMAN_GENE_SYM"]
227 |             homologs.set_index(config.species, inplace=True)
228 |             gM_df = gM_df.loc[gM_df.index.isin(homologs.index)]
229 |             gM_df.index = homologs.loc[gM_df.index, "HUMAN_GENE_SYM"].values
230 |         common_genes = np.intersect1d(adata.var_names, gM_df.index)
231 |         gM_df = gM_df.loc[common_genes]
232 |         gM = gM_df["G_Mean"].values
233 |         frac_whole = gM_df["frac"].values
234 |         adata = adata[:, common_genes]
235 |         logger.info(
236 |             f"{len(common_genes)} common genes retained after loading the cross slice geometric mean."
237 |         )
238 | 
239 |     # Compute ranks after taking common genes with gM_slices
240 |     logger.info("------Ranking the spatial data...")
241 |     if not scipy.sparse.issparse(adata.X):
242 |         adata_X = scipy.sparse.csr_matrix(adata.X)
243 |     elif isinstance(adata.X, scipy.sparse.csr_matrix):
244 |         adata_X = adata.X  # Avoid copying if already CSR
245 |     else:
246 |         adata_X = adata.X.tocsr()
247 | 
248 |     # Create mappings
249 |     n_cells = adata.n_obs
250 |     n_genes = adata.n_vars
251 |     pearson_residuals = True if "pearson_residuals" in adata.layers else False
252 |     ranks = np.zeros((n_cells, adata.n_vars), dtype=np.float16)
253 | 
254 |     if pearson_residuals:
255 |         logger.info("Using pearson residuals for ranking.")
256 |         data = adata.layers["pearson_residuals"]
257 |         for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
258 |             ranks[i, :] = rankdata(data[i, :], method="average")
259 |     else:
260 |         for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
261 |             data = adata_X[i, :].toarray().flatten()
262 |             ranks[i, :] = rankdata(data, method="average")
263 | 
264 |     if gM is None:
265 |         gM = gmean(ranks, axis=0)
266 |         gM = gM.astype(np.float16)
267 | 
268 |     adata_X_bool = adata_X.astype(bool)
269 |     if frac_whole is None:
270 |         # Compute the fraction of each gene across cells
271 |         frac_whole = np.asarray(adata_X_bool.sum(axis=0)).flatten() / n_cells
272 |         logger.info("Gene expression proportion of each gene across cells computed.")
273 |     else:
274 |         logger.info(
275 |             "Gene expression proportion of each gene across cells in all sections has been provided."
276 |         )
277 | 
278 |     frac_whole += 1e-12  # Avoid division by zero
279 |     # Normalize the ranks
280 |     ranks /= gM
281 | 
282 |     def compute_mk_score_wrapper(cell_pos):
283 |         return compute_regional_mkscore(
284 |             cell_pos,
285 |             spatial_net_dict,
286 |             coor_latent,
287 |             config,
288 |             cell_annotations,
289 |             ranks,
290 |             frac_whole,
291 |             adata_X_bool,
292 |             pearson_residuals,
293 |         )
294 | 
295 |     logger.info("------Computing marker scores...")
296 |     mk_score = np.zeros((n_cells, n_genes), dtype=np.float16)
297 |     for cell_pos in trange(n_cells, desc="Calculating marker scores"):
298 |         mk_score[cell_pos, :] = compute_mk_score_wrapper(cell_pos)
299 | 
300 |     mk_score = mk_score.T
301 |     logger.info("Marker scores computed.")
302 | 
303 |     # Remove mitochondrial genes
304 |     gene_names = adata.var_names.values.astype(str)
305 |     mt_gene_mask = ~(np.char.startswith(gene_names, "MT-") | np.char.startswith(gene_names, "mt-"))
306 |     mk_score = mk_score[mt_gene_mask, :]
307 |     gene_names = gene_names[mt_gene_mask]
308 |     logger.info(f"Removed mitochondrial genes. Remaining genes: {len(gene_names)}.")
309 | 
310 |     # Save the marker scores
311 |     logger.info("------Saving marker scores ...")
312 |     output_file_path = Path(config.mkscore_feather_path)
313 |     output_file_path.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
314 |     mk_score_df = pd.DataFrame(mk_score, index=gene_names, columns=adata.obs_names)
315 |     mk_score_df.reset_index(inplace=True)
316 |     mk_score_df.rename(columns={"index": "HUMAN_GENE_SYM"}, inplace=True)
317 |     mk_score_df.to_feather(output_file_path)
318 |     logger.info(f"Marker scores saved to {output_file_path}.")
319 | 
320 |     # Save the modified adata object to disk
321 |     adata.write(config.hdf5_with_latent_path)
322 |     logger.info(f"Modified adata object saved to {config.hdf5_with_latent_path}.")
323 | 


--------------------------------------------------------------------------------
/src/gsMap/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | from gsMap import __version__
 4 | from gsMap.config import cli_function_registry
 5 | 
 6 | 
 7 | def main():
 8 |     parser = create_parser()
 9 |     args = parser.parse_args()
10 |     if args.subcommand is None:
11 |         parser.print_help()
12 |         exit(1)
13 |     args.func(args)
14 | 
15 | 
16 | def create_parser():
17 |     parser = argparse.ArgumentParser(
18 |         description=" gsMap: genetically informed spatial mapping of cells for complex traits",
19 |         formatter_class=argparse.RawTextHelpFormatter,
20 |         prog="gsMap",
21 |     )
22 |     parser.add_argument(
23 |         "--version", "-v", action="version", version=f"gsMap version {__version__}"
24 |     )
25 |     subparsers = parser.add_subparsers(
26 |         dest="subcommand", help="Subcommands", title="Available subcommands"
27 |     )
28 |     for subcommand in cli_function_registry.values():
29 |         subcommand_parser = subparsers.add_parser(
30 |             subcommand.name,
31 |             help=subcommand.description,
32 |             formatter_class=argparse.ArgumentDefaultsHelpFormatter,
33 |         )
34 |         subcommand.add_args_function(subcommand_parser)
35 |         subcommand_parser.set_defaults(func=subcommand.func)
36 |     return parser
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 


--------------------------------------------------------------------------------
/src/gsMap/report.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import shutil
  4 | 
  5 | import pandas as pd
  6 | from jinja2 import Environment, FileSystemLoader
  7 | 
  8 | import gsMap
  9 | from gsMap.cauchy_combination_test import run_Cauchy_combination
 10 | from gsMap.config import CauchyCombinationConfig, ReportConfig
 11 | from gsMap.diagnosis import run_Diagnosis
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | # Load the Jinja2 environment
 16 | try:
 17 |     from importlib.resources import files
 18 | 
 19 |     template_dir = files("gsMap").joinpath("templates")
 20 | except (ImportError, FileNotFoundError):
 21 |     # Fallback to a relative path if running in development mode
 22 |     template_dir = os.path.join(os.path.dirname(__file__), "templates")
 23 | 
 24 | # Set up Jinja2 environment
 25 | env = Environment(loader=FileSystemLoader(template_dir))
 26 | 
 27 | # Load the template
 28 | template = env.get_template("report_template.html")
 29 | 
 30 | 
 31 | def copy_files_to_report_dir(result_dir, report_dir, files_to_copy):
 32 |     """Copy specified files (HTML or PNG) to the report directory."""
 33 |     os.makedirs(report_dir, exist_ok=True)
 34 |     for file in files_to_copy:
 35 |         shutil.copy2(file, os.path.join(report_dir, os.path.basename(file)))
 36 | 
 37 | 
 38 | def load_cauchy_table(csv_file):
 39 |     """Load the Cauchy combination table from a compressed CSV file using Pandas."""
 40 |     df = pd.read_csv(csv_file, compression="gzip")
 41 |     table_data = df[["annotation", "p_cauchy", "p_median"]].to_dict(orient="records")
 42 |     return table_data
 43 | 
 44 | 
 45 | def load_gene_diagnostic_info(csv_file):
 46 |     """Load the Gene Diagnostic Info CSV file and return the top 50 rows."""
 47 |     df = pd.read_csv(csv_file)
 48 |     top_50 = df.head(50).to_dict(orient="records")
 49 |     return top_50
 50 | 
 51 | 
 52 | def embed_html_content(file_path):
 53 |     """Read the content of an HTML file and return it as a string."""
 54 |     with open(file_path) as f:
 55 |         return f.read()
 56 | 
 57 | 
 58 | def check_and_run_cauchy_combination(config):
 59 |     cauchy_result_file = config.get_cauchy_result_file(config.trait_name)
 60 |     if cauchy_result_file.exists():
 61 |         logger.info(
 62 |             f"Cauchy combination already done for trait {config.trait_name}. Results saved at {cauchy_result_file}. Skipping..."
 63 |         )
 64 |     else:
 65 |         logger.info(f"Running Cauchy combination for trait {config.trait_name}...")
 66 |         cauchy_config = CauchyCombinationConfig(
 67 |             workdir=config.workdir,
 68 |             sample_name=config.sample_name,
 69 |             annotation=config.annotation,
 70 |             trait_name=config.trait_name,
 71 |         )
 72 |         run_Cauchy_combination(cauchy_config)
 73 | 
 74 |     df = pd.read_csv(cauchy_result_file, compression="gzip")
 75 |     table_data = df[["annotation", "p_cauchy", "p_median"]].to_dict(orient="records")
 76 | 
 77 |     return table_data
 78 | 
 79 | 
 80 | def run_report(config: ReportConfig, run_parameters=None):
 81 |     logger.info("Running gsMap Diagnosis Module")
 82 |     run_Diagnosis(config)
 83 |     logger.info("gsMap Diagnosis running successfully")
 84 | 
 85 |     report_dir = config.get_report_dir(config.trait_name)
 86 |     gene_diagnostic_info_file = config.get_gene_diagnostic_info_save_path(config.trait_name)
 87 |     gene_diagnostic_info = load_gene_diagnostic_info(gene_diagnostic_info_file)
 88 | 
 89 |     # Load data (Cauchy table and gene diagnostic info)
 90 |     cauchy_table = check_and_run_cauchy_combination(config)
 91 | 
 92 |     # Paths to PNGs for gene expression and GSS distribution
 93 |     gss_distribution_dir = config.get_GSS_plot_dir(config.trait_name)
 94 | 
 95 |     gene_plots = []
 96 |     plot_select_gene_list = (
 97 |         config.get_GSS_plot_select_gene_file(config.trait_name).read_text().splitlines()
 98 |     )
 99 |     for gene_name in plot_select_gene_list:
100 |         expression_png = (
101 |             gss_distribution_dir / f"{config.sample_name}_{gene_name}_Expression_Distribution.png"
102 |         )
103 |         gss_png = gss_distribution_dir / f"{config.sample_name}_{gene_name}_GSS_Distribution.png"
104 |         # check if expression and GSS plots exist
105 |         if not os.path.exists(expression_png) or not os.path.exists(gss_png):
106 |             print(f"Skipping gene {gene_name} as expression or GSS plot is missing.")
107 |             continue
108 |         gene_plots.append(
109 |             {
110 |                 "name": gene_name,
111 |                 "expression_plot": expression_png.relative_to(
112 |                     report_dir
113 |                 ),  # Path for gene expression plot
114 |                 "gss_plot": gss_png.relative_to(report_dir),  # Path for GSS distribution plot
115 |             }
116 |         )
117 | 
118 |     # # Copy PNG files to the report directory
119 |     # copy_files_to_report_dir(result_dir, report_dir, [gene['expression_plot'] for gene in gene_plots] + [gene['gss_plot'] for gene in gene_plots])
120 | 
121 |     # Update paths to point to copied images inside the report folder
122 |     # for gene in gene_plots:
123 |     #     gene['expression_plot'] = os.path.join(os.path.basename(gene['expression_plot']))
124 |     #     gene['gss_plot'] = os.path.join(os.path.basename(gene['gss_plot']))
125 | 
126 |     # Sample data for other report components
127 |     title = f"{config.sample_name} Genetic Spatial Mapping Report"
128 | 
129 |     genetic_mapping_plot = embed_html_content(
130 |         config.get_gsMap_html_plot_save_path(config.trait_name)
131 |     )
132 |     manhattan_plot = embed_html_content(config.get_manhattan_html_plot_path(config.trait_name))
133 | 
134 |     gsmap_version = gsMap.__version__
135 |     # Render the template with dynamic content, including the run parameters
136 | 
137 |     trait_name = config.trait_name
138 |     default_run_parameters = {
139 |         "Sample Name": config.sample_name,
140 |         "Trait Name": trait_name,
141 |         "Summary Statistics File": config.sumstats_file,
142 |         "HDF5 Path": config.hdf5_with_latent_path,
143 |         "Annotation": config.annotation,
144 |         "Spatial LDSC Save Directory": config.ldsc_save_dir,
145 |         "Cauchy Directory": config.cauchy_save_dir,
146 |         "Report Directory": config.get_report_dir(trait_name),
147 |         "gsMap Report File": config.get_gsMap_report_file(trait_name),
148 |         "Gene Diagnostic Info File": config.get_gene_diagnostic_info_save_path(trait_name),
149 |         "Report Generation Date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
150 |     }
151 | 
152 |     if run_parameters is not None:
153 |         default_run_parameters.update(run_parameters)
154 | 
155 |     output_html = template.render(
156 |         title=title,
157 |         genetic_mapping_plot=genetic_mapping_plot,  # Inlined genetic mapping plot
158 |         manhattan_plot=manhattan_plot,  # Inlined Manhattan plot
159 |         cauchy_table=cauchy_table,
160 |         gene_plots=gene_plots,  # List of PNG paths for gene plots
161 |         gsmap_version=gsmap_version,
162 |         parameters=default_run_parameters,  # Pass the run parameters to the template
163 |         gene_diagnostic_info=gene_diagnostic_info,  # Include top 50 gene diagnostic info rows
164 |     )
165 | 
166 |     # Save the generated HTML report in the 'report' directory
167 |     report_file = config.get_gsMap_report_file(config.trait_name)
168 |     with open(report_file, "w") as f:
169 |         f.write(output_html)
170 | 
171 |     logger.info(f"Report generated successfully! Saved at {report_file}.")
172 |     logger.info(
173 |         "Copy the report directory to your local PC and open the HTML report file in a web browser to view the report."
174 |     )
175 | 


--------------------------------------------------------------------------------
/src/gsMap/run_all_mode.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import time
  3 | from pathlib import Path
  4 | 
  5 | from gsMap.cauchy_combination_test import run_Cauchy_combination
  6 | from gsMap.config import (
  7 |     CauchyCombinationConfig,
  8 |     FindLatentRepresentationsConfig,
  9 |     GenerateLDScoreConfig,
 10 |     LatentToGeneConfig,
 11 |     ReportConfig,
 12 |     RunAllModeConfig,
 13 |     SpatialLDSCConfig,
 14 | )
 15 | from gsMap.find_latent_representation import run_find_latent_representation
 16 | from gsMap.generate_ldscore import run_generate_ldscore
 17 | from gsMap.latent_to_gene import run_latent_to_gene
 18 | from gsMap.report import run_report
 19 | from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc
 20 | 
 21 | 
 22 | def format_duration(seconds):
 23 |     hours = int(seconds // 3600)
 24 |     minutes = int((seconds % 3600) // 60)
 25 |     return f"{hours}h {minutes}m"
 26 | 
 27 | 
 28 | def run_pipeline(config: RunAllModeConfig):
 29 |     # # Set up logging
 30 |     _current_datatime = time.strftime("%Y%m%d_%H%M%S")
 31 |     log_file = (
 32 |         Path(config.workdir)
 33 |         / config.sample_name
 34 |         / f"gsMap_pipeline_{config.sample_name}_{_current_datatime}.log"
 35 |     )
 36 |     log_file.parent.mkdir(parents=True, exist_ok=True)
 37 |     logging.basicConfig(
 38 |         level=logging.INFO,
 39 |         format="[{asctime}] {levelname:.5s} | {name} - {message}",
 40 |         handlers=[
 41 |             logging.FileHandler(log_file),
 42 |         ],
 43 |         style="{",
 44 |     )
 45 | 
 46 |     logger = logging.getLogger("gsMap.pipeline")
 47 |     logger.info("Starting pipeline with configuration: %s", config)
 48 |     pipeline_start_time = time.time()
 49 | 
 50 |     # Step 1: Find latent representations
 51 |     if config.latent_representation is not None:
 52 |         logger.warning(
 53 |             f"Using the provided latent representation: {config.latent_representation} in {config.hdf5_path}. This would skip the Find_latent_representations step."
 54 |         )
 55 |         logger.info(
 56 |             "Skipping step 1: Find latent representations, as latent representation is provided."
 57 |         )
 58 |         latent_to_gene_input_hdf5_path = config.hdf5_path
 59 |     else:
 60 |         latent_to_gene_input_hdf5_path = None
 61 |         logger.info(
 62 |             "No latent representation provided. Will run the Find_latent_representations step."
 63 |         )
 64 |         find_latent_config = FindLatentRepresentationsConfig(
 65 |             workdir=config.workdir,
 66 |             input_hdf5_path=config.hdf5_path,
 67 |             sample_name=config.sample_name,
 68 |             annotation=config.annotation,
 69 |             data_layer=config.data_layer,
 70 |             n_comps=config.n_comps,
 71 |             pearson_residuals=config.pearson_residuals,
 72 |         )
 73 | 
 74 |         # Step 1: Find latent representations
 75 |         start_time = time.time()
 76 | 
 77 |         logger.info("Step 1: Finding latent representations")
 78 |         if Path(find_latent_config.hdf5_with_latent_path).exists():
 79 |             logger.info(
 80 |                 f"Find latent representations already done. Results saved at {find_latent_config.hdf5_with_latent_path}. Skipping..."
 81 |             )
 82 |         else:
 83 |             run_find_latent_representation(find_latent_config)
 84 |         end_time = time.time()
 85 |         logger.info(f"Step 1 completed in {format_duration(end_time - start_time)}.")
 86 | 
 87 |     latent_to_gene_config = LatentToGeneConfig(
 88 |         input_hdf5_path=latent_to_gene_input_hdf5_path,
 89 |         workdir=config.workdir,
 90 |         sample_name=config.sample_name,
 91 |         annotation=config.annotation,
 92 |         latent_representation=config.latent_representation,
 93 |         num_neighbour=config.num_neighbour,
 94 |         num_neighbour_spatial=config.num_neighbour_spatial,
 95 |         homolog_file=config.homolog_file,
 96 |         gM_slices=config.gM_slices,
 97 |     )
 98 | 
 99 |     ldscore_config = GenerateLDScoreConfig(
100 |         workdir=config.workdir,
101 |         sample_name=config.sample_name,
102 |         chrom="all",
103 |         bfile_root=config.bfile_root,
104 |         keep_snp_root=config.keep_snp_root,
105 |         gtf_annotation_file=config.gtffile,
106 |         spots_per_chunk=5_000,
107 |         baseline_annotation_dir=config.baseline_annotation_dir,
108 |         SNP_gene_pair_dir=config.SNP_gene_pair_dir,
109 |         ldscore_save_format="quick_mode",
110 |     )
111 | 
112 |     # Step 2: Latent to gene
113 |     start_time = time.time()
114 |     logger.info("Step 2: Mapping latent representations to genes")
115 |     if Path(latent_to_gene_config.mkscore_feather_path).exists():
116 |         logger.info(
117 |             f"Latent to gene mapping already done. Results saved at {latent_to_gene_config.mkscore_feather_path}. Skipping..."
118 |         )
119 |     else:
120 |         run_latent_to_gene(latent_to_gene_config)
121 |     end_time = time.time()
122 |     logger.info(f"Step 2 completed in {format_duration(end_time - start_time)}.")
123 | 
124 |     # Step 3: Generate LDScores
125 |     start_time = time.time()
126 |     logger.info("Step 3: Generating LDScores")
127 | 
128 |     # check if LDscore has been generated by the done file
129 |     ldsc_done_file = (
130 |         Path(ldscore_config.ldscore_save_dir) / f"{config.sample_name}_generate_ldscore.done"
131 |     )
132 |     if ldsc_done_file.exists():
133 |         logger.info(
134 |             f"Basic LDScore generation already done. Results saved at {ldscore_config.ldscore_save_dir}. Skipping..."
135 |         )
136 |     else:
137 |         run_generate_ldscore(ldscore_config)
138 |         end_time = time.time()
139 |         logger.info(f"Step 3 completed in {format_duration(end_time - start_time)}.")
140 |         # create a done file
141 |         ldsc_done_file.touch()
142 | 
143 |     # Step 4: Spatial LDSC
144 |     start_time = time.time()
145 |     logger.info("Step 4: Running spatial LDSC")
146 | 
147 |     sumstats_config = config.sumstats_config_dict
148 |     for trait_name in sumstats_config:
149 |         logger.info("Running spatial LDSC for trait: %s", trait_name)
150 |         # detect if the spatial LDSC has been done:
151 |         spatial_ldsc_result_file = (
152 |             Path(config.ldsc_save_dir) / f"{config.sample_name}_{trait_name}.csv.gz"
153 |         )
154 | 
155 |         if spatial_ldsc_result_file.exists():
156 |             logger.info(
157 |                 f"Spatial LDSC already done for trait {trait_name}. Results saved at {spatial_ldsc_result_file}. Skipping..."
158 |             )
159 |             continue
160 | 
161 |         spatial_ldsc_config_trait = SpatialLDSCConfig(
162 |             workdir=config.workdir,
163 |             sumstats_file=sumstats_config[trait_name],
164 |             trait_name=trait_name,
165 |             w_file=config.w_file,
166 |             sample_name=config.sample_name,
167 |             # ldscore_save_dir=spatial_ldsc_config.ldscore_save_dir,
168 |             # ldsc_save_dir=spatial_ldsc_config.ldsc_save_dir,
169 |             num_processes=config.max_processes,
170 |             ldscore_save_format="quick_mode",
171 |             snp_gene_weight_adata_path=config.snp_gene_weight_adata_path,
172 |         )
173 |         run_spatial_ldsc(spatial_ldsc_config_trait)
174 |     end_time = time.time()
175 |     logger.info(f"Step 4 completed in {format_duration(end_time - start_time)}.")
176 | 
177 |     # Step 5: Cauchy combination test
178 |     start_time = time.time()
179 |     logger.info("Step 5: Running Cauchy combination test")
180 |     for trait_name in sumstats_config:
181 |         # check if the cauchy combination has been done
182 |         cauchy_result_file = config.get_cauchy_result_file(trait_name)
183 |         if cauchy_result_file.exists():
184 |             logger.info(
185 |                 f"Cauchy combination already done for trait {trait_name}. Results saved at {cauchy_result_file}. Skipping..."
186 |             )
187 |             continue
188 |         cauchy_config = CauchyCombinationConfig(
189 |             workdir=config.workdir,
190 |             sample_name=config.sample_name,
191 |             annotation=config.annotation,
192 |             trait_name=trait_name,
193 |         )
194 |         run_Cauchy_combination(cauchy_config)
195 |     end_time = time.time()
196 |     logger.info(f"Step 5 completed in {format_duration(end_time - start_time)}.")
197 | 
198 |     # Step 6: Generate final report
199 |     start_time = time.time()
200 |     for trait_name in sumstats_config:
201 |         logger.info("Step 6: Running final report generation for trait: %s", trait_name)
202 |         report_config = ReportConfig(
203 |             workdir=config.workdir,
204 |             sample_name=config.sample_name,
205 |             annotation=config.annotation,
206 |             trait_name=trait_name,
207 |             plot_type="all",
208 |             top_corr_genes=50,
209 |             selected_genes=None,
210 |             sumstats_file=sumstats_config[trait_name],
211 |         )
212 |         gsMap_report_file = report_config.get_gsMap_report_file(trait_name)
213 |         if Path(gsMap_report_file).exists():
214 |             logger.info(
215 |                 f"Final report already generated for trait {trait_name}. Results saved at {gsMap_report_file}. Skipping..."
216 |             )
217 |             continue
218 | 
219 |         # Create the run parameters dictionary for each trait
220 |         run_parameter_dict = {
221 |             "Sample Name": config.sample_name,
222 |             "Trait Name": trait_name,
223 |             "Summary Statistics File": sumstats_config[trait_name],
224 |             "HDF5 Path": config.hdf5_path,
225 |             "Annotation": config.annotation,
226 |             "Number of Processes": config.max_processes,
227 |             "Spatial LDSC Save Directory": config.ldsc_save_dir,
228 |             "Cauchy Directory": config.cauchy_save_dir,
229 |             "Report Directory": config.get_report_dir(trait_name),
230 |             "gsMap Report File": config.get_gsMap_report_file(trait_name),
231 |             "Gene Diagnostic Info File": config.get_gene_diagnostic_info_save_path(trait_name),
232 |             "Spending Time": format_duration(time.time() - pipeline_start_time),
233 |         }
234 | 
235 |         # Pass the run parameter dictionary to the report generation function
236 |         run_report(report_config, run_parameters=run_parameter_dict)
237 | 
238 |     end_time = time.time()
239 |     logger.info(f"Step 6 completed in {format_duration(end_time - start_time)}.")
240 | 
241 |     logger.info("Pipeline completed successfully.")
242 | 


--------------------------------------------------------------------------------
/src/gsMap/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import setuptools
3 | 
4 | if __name__ == "__main__":
5 |     setuptools.setup(name="gsMap")
6 | 


--------------------------------------------------------------------------------
/src/gsMap/templates/report_template.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>{{ title }}</title>
  6 |     <meta name="viewport" content="width=device-width, initial-scale=1">
  7 |     <!-- Bootstrap CSS -->
  8 |     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
  9 |     <!-- Custom Styles -->
 10 |     <style>
 11 |         body {
 12 |             padding: 20px;
 13 |             font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
 14 |         }
 15 |         .plot-container {
 16 |             margin-bottom: 50px;
 17 |         }
 18 |         .section-description {
 19 |             color: #6c757d;
 20 |             font-size: 0.95rem;
 21 |             margin-bottom: 20px;
 22 |         }
 23 |         .scrollable-table {
 24 |             max-height: 400px;
 25 |             overflow-y: auto;
 26 |         }
 27 |         .table thead th {
 28 |             position: sticky;
 29 |             top: 0;
 30 |             background-color: #f8f9fa;
 31 |         }
 32 |         img {
 33 |             max-width: 100%;
 34 |             height: auto;
 35 |             border: 1px solid #dee2e6;
 36 |             border-radius: 5px;
 37 |         }
 38 |         .gene-select-label {
 39 |             font-weight: bold;
 40 |             margin-bottom: 10px;
 41 |         }
 42 |         .collapse-toggle {
 43 |             cursor: pointer;
 44 |             color: #0d6efd;
 45 |             text-decoration: underline;
 46 |         }
 47 |     </style>
 48 | </head>
 49 | <body>
 50 |     <div class="container-fluid">
 51 |         <h1 class="mb-4">{{ title }}</h1>
 52 | 
 53 |         <!-- Genetic Spatial Mapping Plot -->
 54 |         <div class="plot-container">
 55 |             <h2>Genetic Spatial Mapping Plot</h2>
 56 |             <p class="section-description">This plot shows the spatial genetic mapping results across different tissues.</p>
 57 |             <div class="border rounded p-3">
 58 |                 {{ genetic_mapping_plot|safe }}
 59 |             </div>
 60 |         </div>
 61 | 
 62 |         <!-- Cauchy Combination Result Table -->
 63 |         <div class="plot-container">
 64 |             <h2>Cauchy Combination Result</h2>
 65 |             <p class="section-description">This table presents the results of the Cauchy combination test, summarizing the genetic associations.</p>
 66 |             <div class="scrollable-table">
 67 |                 <table class="table table-hover table-bordered">
 68 |                     <thead class="table-light">
 69 |                         <tr>
 70 |                             <th>Annotation</th>
 71 |                             <th>P Cauchy</th>
 72 |                             <th>P Median</th>
 73 |                         </tr>
 74 |                     </thead>
 75 |                     <tbody>
 76 |                         {% for row in cauchy_table %}
 77 |                         <tr>
 78 |                             <td>{{ row.annotation }}</td>
 79 |                             <td>{{ "%.4e"|format(row.p_cauchy) }}</td>
 80 |                             <td>{{ "%.4e"|format(row.p_median) }}</td>
 81 |                         </tr>
 82 |                         {% endfor %}
 83 |                     </tbody>
 84 |                 </table>
 85 |             </div>
 86 |         </div>
 87 | 
 88 |         <!-- Manhattan Plot -->
 89 |         <div class="plot-container">
 90 |             <h2>Diagnosis Manhattan Plot</h2>
 91 |             <p class="section-description">The Manhattan plot shows the association of SNPs with the top associated gene across the genome.</p>
 92 |             <div class="border rounded p-3">
 93 |                 {{ manhattan_plot|safe }}
 94 |             </div>
 95 |         </div>
 96 | 
 97 |         <!-- Gene Expression and GSS Distribution -->
 98 |         <div class="plot-container">
 99 |             <h2>Gene Expression and GSS Distribution</h2>
100 |             <p class="section-description">Select a gene to view its expression distribution and gene specificity score (GSS).</p>
101 |             <label for="geneSelect" class="gene-select-label">Select a gene:</label>
102 |             <select id="geneSelect" class="form-select mb-4">
103 |                 {% for gene in gene_plots %}
104 |                 <option value="{{ gene.name }}">{{ gene.name }}</option>
105 |                 {% endfor %}
106 |             </select>
107 |             <div id="genePlots" class="row">
108 |                 <div class="col-md-6 mb-4">
109 |                     <h5>Expression Distribution</h5>
110 |                     <img src="{{ gene_plots[0].expression_plot }}" alt="{{ gene_plots[0].name }} Expression Distribution" id="expressionPlotImg" class="img-fluid">
111 |                 </div>
112 |                 <div class="col-md-6 mb-4">
113 |                     <h5>Gene Specificity Score (GSS)</h5>
114 |                     <img src="{{ gene_plots[0].gss_plot }}" alt="{{ gene_plots[0].name }} GSS Distribution" id="gssPlotImg" class="img-fluid">
115 |                 </div>
116 |             </div>
117 |         </div>
118 | 
119 |         <!-- Gene Diagnostic Info Table -->
120 |         <div class="plot-container">
121 |             <h2>Top 50 Gene Diagnostic Info</h2>
122 |             <p class="section-description">This table lists the top 50 genes based on diagnostic criteria, including the gene specificity score (GSS) and PCC.</p>
123 |             <div class="scrollable-table">
124 |                 <table class="table table-hover table-bordered">
125 |                     <thead class="table-light">
126 |                         <tr>
127 |                             <th>Gene</th>
128 |                             <th>Annotation</th>
129 |                             <th>Median GSS</th>
130 |                             <th>PCC</th>
131 |                         </tr>
132 |                     </thead>
133 |                     <tbody>
134 |                         {% for row in gene_diagnostic_info %}
135 |                         <tr>
136 |                             <td>{{ row.Gene }}</td>
137 |                             <td>{{ row.Annotation }}</td>
138 |                             <td>{{ "%.4f"|format(row.Median_GSS) }}</td>
139 |                             <td>{{ "%.4f"|format(row.PCC) }}</td>
140 |                         </tr>
141 |                         {% endfor %}
142 |                     </tbody>
143 |                 </table>
144 |             </div>
145 |         </div>
146 | 
147 |         <!-- Running Info (collapsible) -->
148 |         <div class="plot-container">
149 |             <h2>Running Info</h2>
150 |             <p class="section-description">Click to view detailed run information and parameters.</p>
151 |             <p class="collapse-toggle" data-bs-toggle="collapse" href="#runningInfo" role="button" aria-expanded="false" aria-controls="runningInfo">
152 |                 Show/Hide Running Info
153 |             </p>
154 |             <div class="collapse" id="runningInfo">
155 |                 <div class="card card-body">
156 |                     <p><strong>gsMap Version:</strong> {{ gsmap_version }}</p>
157 |                     <p><strong>Parameters:</strong></p>
158 |                     <ul class="mb-0">
159 |                         {% for key, value in parameters.items() %}
160 |                         <li><strong>{{ key }}:</strong> {{ value }}</li>
161 |                         {% endfor %}
162 |                     </ul>
163 |                 </div>
164 |             </div>
165 |         </div>
166 |     </div>
167 | 
168 |     <!-- JavaScript for Gene Plots -->
169 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
170 |     <script>
171 |         (function() {
172 |             const geneSelect = document.getElementById('geneSelect');
173 |             const expressionPlotImg = document.getElementById('expressionPlotImg');
174 |             const gssPlotImg = document.getElementById('gssPlotImg');
175 | 
176 |             const genePlots = {
177 |                 {% for gene in gene_plots %}
178 |                 "{{ gene.name }}": {
179 |                     expression_plot: "{{ gene.expression_plot }}",
180 |                     gss_plot: "{{ gene.gss_plot }}"
181 |                 }{% if not loop.last %},{% endif %}
182 |                 {% endfor %}
183 |             };
184 | 
185 |             geneSelect.addEventListener('change', function() {
186 |                 const selectedGene = this.value;
187 |                 const selectedGenePlots = genePlots[selectedGene];
188 | 
189 |                 // Update images
190 |                 expressionPlotImg.src = selectedGenePlots.expression_plot;
191 |                 expressionPlotImg.alt = `${selectedGene} Expression Distribution`;
192 |                 gssPlotImg.src = selectedGenePlots.gss_plot;
193 |                 gssPlotImg.alt = `${selectedGene} GSS Distribution`;
194 |             });
195 |         })();
196 |     </script>
197 | </body>
198 | </html>
199 | 


--------------------------------------------------------------------------------
/src/gsMap/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JianYang-Lab/gsMap/13b72534e0ad3d32c648025b32c50305239c517a/src/gsMap/utils/__init__.py


--------------------------------------------------------------------------------
/src/gsMap/utils/regression_read.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import logging
  3 | import os
  4 | 
  5 | import pandas as pd
  6 | 
  7 | logger = logging.getLogger("gsMap.utils.regression_read")
  8 | 
  9 | 
 10 | def _read_sumstats(fh, alleles=False, dropna=False):
 11 |     """Parse GWAS summary statistics."""
 12 |     logger.info(f"Reading summary statistics from {fh} ...")
 13 | 
 14 |     # Determine compression type
 15 |     compression = None
 16 |     if fh.endswith("gz"):
 17 |         compression = "gzip"
 18 |     elif fh.endswith("bz2"):
 19 |         compression = "bz2"
 20 | 
 21 |     # Define columns and dtypes
 22 |     dtype_dict = {"SNP": str, "Z": float, "N": float, "A1": str, "A2": str}
 23 |     usecols = ["SNP", "Z", "N"]
 24 |     if alleles:
 25 |         usecols += ["A1", "A2"]
 26 | 
 27 |     # Read the file
 28 |     try:
 29 |         sumstats = pd.read_csv(
 30 |             fh,
 31 |             sep=r"\s+",
 32 |             na_values=".",
 33 |             usecols=usecols,
 34 |             dtype=dtype_dict,
 35 |             compression=compression,
 36 |         )
 37 |     except (AttributeError, ValueError) as e:
 38 |         logger.error(f"Failed to parse sumstats file: {str(e.args)}")
 39 |         raise ValueError("Improperly formatted sumstats file: " + str(e.args)) from e
 40 | 
 41 |     # Drop NA values if specified
 42 |     if dropna:
 43 |         sumstats = sumstats.dropna(how="any")
 44 | 
 45 |     logger.info(f"Read summary statistics for {len(sumstats)} SNPs.")
 46 | 
 47 |     # Drop duplicates
 48 |     m = len(sumstats)
 49 |     sumstats = sumstats.drop_duplicates(subset="SNP")
 50 |     if m > len(sumstats):
 51 |         logger.info(f"Dropped {m - len(sumstats)} SNPs with duplicated rs numbers.")
 52 | 
 53 |     return sumstats
 54 | 
 55 | 
 56 | def _read_chr_files(base_path, suffix, expected_count=22):
 57 |     """Read chromosome files using glob pattern matching."""
 58 |     # Create the pattern to search for files
 59 |     file_pattern = f"{base_path}[1-9]*{suffix}*"
 60 | 
 61 |     # Find all matching files
 62 |     all_files = glob.glob(file_pattern)
 63 | 
 64 |     # Extract chromosome numbers
 65 |     chr_files = []
 66 |     for file in all_files:
 67 |         try:
 68 |             # Extract the chromosome number from filename
 69 |             file_name = os.path.basename(file)
 70 |             base_name = os.path.basename(base_path)
 71 |             chr_part = file_name.replace(base_name, "").split(suffix)[0]
 72 |             chr_num = int(chr_part)
 73 |             if 1 <= chr_num <= expected_count:
 74 |                 chr_files.append((chr_num, file))
 75 |         except (ValueError, IndexError):
 76 |             continue
 77 | 
 78 |     # Check if we have the expected number of chromosome files
 79 |     if len(chr_files) != expected_count:
 80 |         logger.warning(
 81 |             f"❗ SEVERE WARNING ❗ Expected {expected_count} chromosome files, but found {len(chr_files)}! "
 82 |             f"⚠️ For human GWAS data, all 22 autosomes must be present. Please verify your input files."
 83 |         )
 84 | 
 85 |     # Sort by chromosome number and return file paths
 86 |     chr_files.sort()
 87 |     return [file for _, file in chr_files]
 88 | 
 89 | 
 90 | def _read_file(file_path):
 91 |     """Read a file based on its format/extension."""
 92 |     try:
 93 |         if file_path.endswith(".feather"):
 94 |             return pd.read_feather(file_path)
 95 |         elif file_path.endswith(".parquet"):
 96 |             return pd.read_parquet(file_path)
 97 |         elif file_path.endswith(".gz"):
 98 |             return pd.read_csv(file_path, compression="gzip", sep="\t")
 99 |         elif file_path.endswith(".bz2"):
100 |             return pd.read_csv(file_path, compression="bz2", sep="\t")
101 |         else:
102 |             return pd.read_csv(file_path, sep="\t")
103 |     except Exception as e:
104 |         logger.error(f"Failed to read file {file_path}: {str(e)}")
105 |         raise
106 | 
107 | 
108 | def _read_ref_ld_v2(ld_file):
109 |     """Read reference LD scores for all chromosomes."""
110 |     suffix = ".l2.ldscore"
111 |     logger.info(f"Reading LD score annotations from {ld_file}[1-22]{suffix}...")
112 | 
113 |     # Get the chromosome files
114 |     chr_files = _read_chr_files(ld_file, suffix)
115 | 
116 |     # Read and concatenate all files
117 |     df_list = [_read_file(file) for file in chr_files]
118 | 
119 |     if not df_list:
120 |         logger.error(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
121 |         raise FileNotFoundError(f"No LD score files found matching pattern: {ld_file}*{suffix}*")
122 | 
123 |     ref_ld = pd.concat(df_list, axis=0)
124 |     logger.info(f"Loaded {len(ref_ld)} SNPs from LD score files")
125 | 
126 |     # Set SNP as index
127 |     if "index" in ref_ld.columns:
128 |         ref_ld.rename(columns={"index": "SNP"}, inplace=True)
129 |     if "SNP" in ref_ld.columns:
130 |         ref_ld.set_index("SNP", inplace=True)
131 | 
132 |     return ref_ld
133 | 
134 | 
135 | def _read_w_ld(w_file):
136 |     """Read LD weights for all chromosomes."""
137 |     suffix = ".l2.ldscore"
138 |     logger.info(f"Reading LD score annotations from {w_file}[1-22]{suffix}...")
139 | 
140 |     # Get the chromosome files
141 |     chr_files = _read_chr_files(w_file, suffix)
142 | 
143 |     if not chr_files:
144 |         logger.error(f"No LD score files found matching pattern: {w_file}*{suffix}*")
145 |         raise FileNotFoundError(f"No LD score files found matching pattern: {w_file}*{suffix}*")
146 | 
147 |     # Read and process each file
148 |     w_array = []
149 |     for file in chr_files:
150 |         x = _read_file(file)
151 | 
152 |         # Sort if possible
153 |         if "CHR" in x.columns and "BP" in x.columns:
154 |             x = x.sort_values(by=["CHR", "BP"])
155 | 
156 |         # Drop unnecessary columns
157 |         columns_to_drop = ["MAF", "CM", "Gene", "TSS", "CHR", "BP"]
158 |         columns_to_drop = [col for col in columns_to_drop if col in x.columns]
159 |         if columns_to_drop:
160 |             x = x.drop(columns=columns_to_drop, axis=1)
161 | 
162 |         w_array.append(x)
163 | 
164 |     # Concatenate and set column names
165 |     w_ld = pd.concat(w_array, axis=0)
166 |     logger.info(f"Loaded {len(w_ld)} SNPs from LD weight files")
167 | 
168 |     # Set column names
169 |     w_ld.columns = (
170 |         ["SNP", "LD_weights"] + list(w_ld.columns[2:])
171 |         if len(w_ld.columns) > 2
172 |         else ["SNP", "LD_weights"]
173 |     )
174 | 
175 |     return w_ld
176 | 


--------------------------------------------------------------------------------
/src/gsMap/visualize.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import Literal
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import plotly.express as px
  7 | import scanpy as sc
  8 | from scipy.spatial import KDTree
  9 | 
 10 | from gsMap.config import VisualizeConfig
 11 | 
 12 | 
 13 | def load_ldsc(ldsc_input_file):
 14 |     ldsc = pd.read_csv(
 15 |         ldsc_input_file,
 16 |         compression="gzip",
 17 |         dtype={"spot": str, "p": float},
 18 |         index_col="spot",
 19 |         usecols=["spot", "p"],
 20 |     )
 21 |     ldsc["logp"] = -np.log10(ldsc.p)
 22 |     return ldsc
 23 | 
 24 | 
 25 | # %%
 26 | def load_st_coord(adata, feature_series: pd.Series, annotation):
 27 |     spot_name = adata.obs_names.to_list()
 28 |     assert "spatial" in adata.obsm.keys(), "spatial coordinates are not found in adata.obsm"
 29 | 
 30 |     # to DataFrame
 31 |     space_coord = adata.obsm["spatial"]
 32 |     if isinstance(space_coord, np.ndarray):
 33 |         space_coord = pd.DataFrame(space_coord, columns=["sx", "sy"], index=spot_name)
 34 |     else:
 35 |         space_coord = pd.DataFrame(space_coord.values, columns=["sx", "sy"], index=spot_name)
 36 | 
 37 |     space_coord = space_coord[space_coord.index.isin(feature_series.index)]
 38 |     space_coord_concat = pd.concat([space_coord.loc[feature_series.index], feature_series], axis=1)
 39 |     space_coord_concat.head()
 40 |     if annotation is not None:
 41 |         annotation = pd.Series(
 42 |             adata.obs[annotation].values, index=adata.obs_names, name="annotation"
 43 |         )
 44 |         space_coord_concat = pd.concat([space_coord_concat, annotation], axis=1)
 45 |     return space_coord_concat
 46 | 
 47 | 
 48 | def estimate_point_size_for_plot(coordinates, DEFAULT_PIXEL_WIDTH=1000):
 49 |     tree = KDTree(coordinates)
 50 |     distances, _ = tree.query(coordinates, k=2)
 51 |     avg_min_distance = np.mean(distances[:, 1])
 52 |     # get the width and height of the plot
 53 |     width = np.max(coordinates[:, 0]) - np.min(coordinates[:, 0])
 54 |     height = np.max(coordinates[:, 1]) - np.min(coordinates[:, 1])
 55 | 
 56 |     scale_factor = DEFAULT_PIXEL_WIDTH / max(width, height)
 57 |     pixel_width = width * scale_factor
 58 |     pixel_height = height * scale_factor
 59 | 
 60 |     point_size = np.ceil(avg_min_distance * scale_factor)
 61 |     return (pixel_width, pixel_height), point_size
 62 | 
 63 | 
 64 | def draw_scatter(
 65 |     space_coord_concat,
 66 |     title=None,
 67 |     fig_style: Literal["dark", "light"] = "light",
 68 |     point_size: int = None,
 69 |     width=800,
 70 |     height=600,
 71 |     annotation=None,
 72 |     color_by="logp",
 73 | ):
 74 |     # Set theme based on fig_style
 75 |     if fig_style == "dark":
 76 |         px.defaults.template = "plotly_dark"
 77 |     else:
 78 |         px.defaults.template = "plotly_white"
 79 | 
 80 |     custom_color_scale = [
 81 |         (1, "#d73027"),  # Red
 82 |         (7 / 8, "#f46d43"),  # Red-Orange
 83 |         (6 / 8, "#fdae61"),  # Orange
 84 |         (5 / 8, "#fee090"),  # Light Orange
 85 |         (4 / 8, "#e0f3f8"),  # Light Blue
 86 |         (3 / 8, "#abd9e9"),  # Sky Blue
 87 |         (2 / 8, "#74add1"),  # Medium Blue
 88 |         (1 / 8, "#4575b4"),  # Dark Blue
 89 |         (0, "#313695"),  # Deep Blue
 90 |     ]
 91 |     custom_color_scale.reverse()
 92 | 
 93 |     # Create the scatter plot
 94 |     fig = px.scatter(
 95 |         space_coord_concat,
 96 |         x="sx",
 97 |         y="sy",
 98 |         color=color_by,
 99 |         symbol="annotation" if annotation is not None else None,
100 |         title=title,
101 |         color_continuous_scale=custom_color_scale,
102 |         range_color=[0, max(space_coord_concat[color_by])],
103 |     )
104 | 
105 |     # Update marker size if specified
106 |     if point_size is not None:
107 |         fig.update_traces(marker=dict(size=point_size, symbol="circle"))
108 | 
109 |     # Update layout for figure size
110 |     fig.update_layout(
111 |         autosize=False,
112 |         width=width,
113 |         height=height,
114 |     )
115 | 
116 |     # Adjusting the legend
117 |     fig.update_layout(
118 |         legend=dict(
119 |             yanchor="top",
120 |             y=0.95,
121 |             xanchor="left",
122 |             x=1.0,
123 |             font=dict(
124 |                 size=10,
125 |             ),
126 |         )
127 |     )
128 | 
129 |     # Update colorbar to be at the bottom and horizontal
130 |     fig.update_layout(
131 |         coloraxis_colorbar=dict(
132 |             orientation="h",  # Make the colorbar horizontal
133 |             x=0.5,  # Center the colorbar horizontally
134 |             y=-0.0,  # Position below the plot
135 |             xanchor="center",  # Anchor the colorbar at the center
136 |             yanchor="top",  # Anchor the colorbar at the top to keep it just below the plot
137 |             len=0.75,  # Length of the colorbar relative to the plot width
138 |             title=dict(
139 |                 text="-log10(p)" if color_by == "logp" else color_by,  # Colorbar title
140 |                 side="top",  # Place the title at the top of the colorbar
141 |             ),
142 |         )
143 |     )
144 |     # Remove gridlines, axis labels, and ticks
145 |     fig.update_xaxes(
146 |         showgrid=False,  # Hide x-axis gridlines
147 |         zeroline=False,  # Hide x-axis zero line
148 |         showticklabels=False,  # Hide x-axis tick labels
149 |         title=None,  # Remove x-axis title
150 |         scaleanchor="y",  # Link the x-axis scale to the y-axis scale
151 |     )
152 | 
153 |     fig.update_yaxes(
154 |         showgrid=False,  # Hide y-axis gridlines
155 |         zeroline=False,  # Hide y-axis zero line
156 |         showticklabels=False,  # Hide y-axis tick labels
157 |         title=None,  # Remove y-axis title
158 |     )
159 | 
160 |     # Adjust margins to ensure no clipping and equal axis ratio
161 |     fig.update_layout(
162 |         margin=dict(l=0, r=0, t=20, b=10),  # Adjust margins to prevent clipping
163 |         height=width,  # Ensure the figure height matches the width for equal axis ratio
164 |     )
165 | 
166 |     # Adjust the title location and font size
167 |     fig.update_layout(
168 |         title=dict(
169 |             y=0.98,
170 |             x=0.5,  # Center the title horizontally
171 |             xanchor="center",  # Anchor the title at the center
172 |             yanchor="top",  # Anchor the title at the top
173 |             font=dict(
174 |                 size=20  # Increase the title font size
175 |             ),
176 |         )
177 |     )
178 | 
179 |     return fig
180 | 
181 | 
182 | def run_Visualize(config: VisualizeConfig):
183 |     print(f"------Loading LDSC results of {config.ldsc_save_dir}...")
184 |     ldsc = load_ldsc(
185 |         ldsc_input_file=Path(config.ldsc_save_dir)
186 |         / f"{config.sample_name}_{config.trait_name}.csv.gz"
187 |     )
188 | 
189 |     print(f"------Loading ST data of {config.sample_name}...")
190 |     adata = sc.read_h5ad(f"{config.hdf5_with_latent_path}")
191 | 
192 |     space_coord_concat = load_st_coord(adata, ldsc, annotation=config.annotation)
193 |     fig = draw_scatter(
194 |         space_coord_concat,
195 |         title=config.fig_title,
196 |         fig_style=config.fig_style,
197 |         point_size=config.point_size,
198 |         width=config.fig_width,
199 |         height=config.fig_height,
200 |         annotation=config.annotation,
201 |     )
202 | 
203 |     # Visualization
204 |     output_dir = Path(config.output_dir)
205 |     output_dir.mkdir(parents=True, exist_ok=True, mode=0o755)
206 |     output_file_html = output_dir / f"{config.sample_name}_{config.trait_name}.html"
207 |     output_file_pdf = output_dir / f"{config.sample_name}_{config.trait_name}.pdf"
208 |     output_file_csv = output_dir / f"{config.sample_name}_{config.trait_name}.csv"
209 | 
210 |     fig.write_html(str(output_file_html))
211 |     fig.write_image(str(output_file_pdf))
212 |     space_coord_concat.to_csv(str(output_file_csv))
213 | 
214 |     print(
215 |         f"------The visualization result is saved in a html file: {output_file_html} which can interactively viewed in a web browser and a pdf file: {output_file_pdf}."
216 |     )
217 |     print(f"------The visualization data is saved in a csv file: {output_file_csv}.")
218 | 


--------------------------------------------------------------------------------
/tests/test_advanced_usage.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import shlex
  3 | import sys
  4 | from pathlib import Path
  5 | from unittest.mock import patch
  6 | 
  7 | import pandas as pd
  8 | import pytest
  9 | 
 10 | from gsMap.main import main
 11 | 
 12 | 
 13 | def parse_bash_command(command: str) -> list[str]:
 14 |     """Convert multi-line bash command to argument list for sys.argv"""
 15 |     cleaned_command = command.replace("\\\n", " ")
 16 |     cleaned_command = " ".join(cleaned_command.splitlines())
 17 |     cleaned_command = " ".join(cleaned_command.split())
 18 |     return shlex.split(cleaned_command)
 19 | 
 20 | 
 21 | @pytest.mark.real_data
 22 | @pytest.mark.parametrize("symbolic_link_results", ["conditional_config"], indirect=True)
 23 | def test_conditional_analysis(
 24 |     symbolic_link_results,
 25 |     gene_marker_scores_fixture,
 26 |     additional_baseline_dir,
 27 |     spatial_ldsc_fixture,
 28 | ):
 29 |     """Test the conditional analysis functionality by providing additional baseline annotations"""
 30 |     logger = logging.getLogger("test_conditional_analysis")
 31 |     config = symbolic_link_results  # This will have links to latent_to_gene data
 32 | 
 33 |     logger.info("Using linked fixtures for latent representations and gene marker scores")
 34 | 
 35 |     # Step 3: Generate LDScores with additional baseline annotation
 36 |     logger.info("Step 3: Generating LDScores with additional baseline annotation")
 37 |     command = f"""
 38 |     gsmap run_generate_ldscore \
 39 |         --workdir '{config.workdir}' \
 40 |         --sample_name {config.sample_name} \
 41 |         --chrom all \
 42 |         --bfile_root '{config.bfile_root}' \
 43 |         --keep_snp_root '{config.keep_snp_root}' \
 44 |         --gtf_annotation_file '{config.gtffile}' \
 45 |         --gene_window_size 50000 \
 46 |         --additional_baseline_annotation '{additional_baseline_dir}'
 47 |     """
 48 |     with patch.object(sys, "argv", parse_bash_command(command)):
 49 |         main()
 50 | 
 51 |     # Verify additional baseline annotation directory was created
 52 |     additional_baseline_dir_output = (
 53 |         Path(config.workdir) / config.sample_name / "generate_ldscore" / "additional_baseline"
 54 |     )
 55 |     assert additional_baseline_dir_output.exists(), "Additional baseline directory was not created"
 56 | 
 57 |     # Step 4: Run spatial LDSC using the additional baseline annotation
 58 |     logger.info("Step 4: Running spatial LDSC with additional baseline annotation")
 59 |     command = f"""
 60 |     gsmap run_spatial_ldsc \
 61 |         --workdir '{config.workdir}' \
 62 |         --sample_name {config.sample_name} \
 63 |         --trait_name '{config.trait_name}' \
 64 |         --sumstats_file '{config.sumstats_file}' \
 65 |         --w_file '{config.w_file}' \
 66 |         --num_processes {config.max_processes} \
 67 |         --use_additional_baseline_annotation True
 68 |     """
 69 |     with patch.object(sys, "argv", parse_bash_command(command)):
 70 |         main()
 71 | 
 72 |     # Verify LDSC results
 73 |     ldsc_result_file = config.get_ldsc_result_file(config.trait_name)
 74 |     assert ldsc_result_file.exists(), "LDSC result file was not created"
 75 | 
 76 |     logger.info("Conditional analysis test completed successfully")
 77 | 
 78 | 
 79 | @pytest.mark.real_data
 80 | @pytest.mark.parametrize("symbolic_link_results", ["biorep_config1"], indirect=True)
 81 | def test_biological_replicates(symbolic_link_results, biorep_config2, work_dir):
 82 |     """Test gsMap on biological replicates using the slice mean functionality"""
 83 |     logger = logging.getLogger("test_biological_replicates")
 84 |     config1 = symbolic_link_results
 85 |     config2 = biorep_config2  # Just use the config directly without linking
 86 | 
 87 |     slice_mean_file = work_dir / "slice_mean_test.parquet"
 88 | 
 89 |     # Step 1: Create the slice mean from multiple samples
 90 |     logger.info("Step 1: Creating slice mean from multiple samples")
 91 |     command = f"""
 92 |     gsmap create_slice_mean \
 93 |         --sample_name_list {config1.sample_name} {config2.sample_name} \
 94 |         --h5ad_list {config1.hdf5_path} {config2.hdf5_path} \
 95 |         --slice_mean_output_file {slice_mean_file} \
 96 |         --data_layer '{config1.data_layer}' \
 97 |         --homolog_file '{config1.homolog_file}'
 98 |     """
 99 |     with patch.object(sys, "argv", parse_bash_command(command)):
100 |         main()
101 | 
102 |     # Verify slice mean file was created
103 |     assert slice_mean_file.exists(), "Slice mean file was not created"
104 | 
105 |     # Rest of the test continues as before...
106 |     # Verify slice mean file contains expected data
107 |     slice_mean_df = pd.read_parquet(slice_mean_file)
108 |     assert "G_Mean" in slice_mean_df.columns, "G_Mean column not found in slice mean file"
109 |     assert "frac" in slice_mean_df.columns, "frac column not found in slice mean file"
110 |     assert len(slice_mean_df) > 0, "Slice mean file is empty"
111 | 
112 |     # Update config with slice mean file
113 |     config1.gM_slices = str(slice_mean_file)
114 | 
115 |     # Step 2: Test using the slice mean with latent_to_gene
116 |     logger.info("Step 2: Using slice mean with quick_mode")
117 |     command = f"""
118 |     gsmap run_latent_to_gene \
119 |         --workdir '{config1.workdir}' \
120 |         --sample_name {config1.sample_name} \
121 |         --annotation '{config1.annotation}' \
122 |         --latent_representation 'latent_GVAE' \
123 |         --num_neighbour {config1.num_neighbour} \
124 |         --num_neighbour_spatial {config1.num_neighbour_spatial} \
125 |         --homolog_file '{config1.homolog_file}'\
126 |         --gM_slices '{config1.gM_slices}'
127 |     """
128 |     with patch.object(sys, "argv", parse_bash_command(command)):
129 |         main()
130 | 
131 |     # Verify quick_mode results
132 |     mkscore_file = config1.mkscore_feather_path
133 |     assert mkscore_file.exists(), "Marker score file was not created in quick_mode"
134 | 
135 |     # Step 3: Run Cauchy combination across multiple samples
136 |     combined_result_file = work_dir / "combined_IQ_cauchy.csv.gz"
137 |     command = f"""
138 |     gsmap run_cauchy_combination \
139 |         --workdir '{config1.workdir}' \
140 |         --sample_name_list {config1.sample_name} {config1.sample_name} \
141 |         --trait_name '{config1.trait_name}' \
142 |         --annotation '{config1.annotation}' \
143 |         --output_file '{combined_result_file}'
144 |     """
145 |     with patch.object(sys, "argv", parse_bash_command(command)):
146 |         main()
147 | 
148 |     # Verify combined result file was created
149 |     assert combined_result_file.exists(), "Combined Cauchy result file was not created"
150 | 
151 |     logger.info("Biological replicates test completed successfully")
152 | 
153 | 
154 | @pytest.mark.real_data
155 | @pytest.mark.parametrize("symbolic_link_results", ["customlatent_config"], indirect=True)
156 | def test_customized_latent_representations(symbolic_link_results, latent_representations_fixture):
157 |     """Test using customized latent representations in gsMap"""
158 |     logger = logging.getLogger("test_customized_latent")
159 |     config = symbolic_link_results  # This will have links to latent_representation data
160 | 
161 |     logger.info("Using linked fixtures for initial latent representations")
162 | 
163 |     # Step 2: Use the PCA latent representation instead of the default GVAE
164 |     custom_latent = "latent_PCA"
165 |     logger.info(f"Step 2: Using custom latent representation: {custom_latent}")
166 |     command = f"""
167 |     gsmap run_latent_to_gene \
168 |         --workdir '{config.workdir}' \
169 |         --sample_name {config.sample_name} \
170 |         --annotation '{config.annotation}' \
171 |         --latent_representation '{custom_latent}' \
172 |         --num_neighbour {config.num_neighbour} \
173 |         --num_neighbour_spatial {config.num_neighbour_spatial} \
174 |         --homolog_file '{config.homolog_file}'
175 |     """
176 |     with patch.object(sys, "argv", parse_bash_command(command)):
177 |         main()
178 | 
179 |     # Verify mkscore file was created with the custom latent
180 |     mkscore_file = config.mkscore_feather_path
181 |     assert mkscore_file.exists(), f"Marker score file was not created with {custom_latent}"
182 | 
183 |     logger.info("Customized latent representations test completed successfully")
184 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import shlex
 3 | import sys
 4 | from unittest.mock import patch
 5 | 
 6 | import pytest
 7 | 
 8 | from gsMap.main import main
 9 | 
10 | 
11 | def parse_bash_command(command: str) -> list[str]:
12 |     """Convert multi-line bash command to argument list for sys.argv"""
13 |     cleaned_command = command.replace("\\\n", " ")
14 |     cleaned_command = " ".join(cleaned_command.splitlines())
15 |     cleaned_command = " ".join(cleaned_command.split())
16 |     return shlex.split(cleaned_command)
17 | 
18 | 
19 | @pytest.mark.real_data
20 | def test_gsmap_step_by_step_pipeline(cauchy_combination_fixture):
21 |     logger = logging.getLogger("test_gsmap_pipeline")
22 |     logger.info("Pipeline test completed successfully - using shared fixtures")
23 | 
24 | 
25 | @pytest.mark.real_data
26 | @pytest.mark.parametrize("symbolic_link_results", ["quickmode_config"], indirect=True)
27 | def test_gsmap_quick_mode(
28 |     symbolic_link_results, latent_representations_fixture, gene_marker_scores_fixture
29 | ):
30 |     """Test the gsMap quick_mode pipeline with real data"""
31 |     logger = logging.getLogger("test_gsmap_quick_mode")
32 |     logger.info("Starting quick_mode pipeline test with linked fixtures")
33 |     config = symbolic_link_results
34 | 
35 |     # Test the quick_mode command (will reuse linked directories)
36 |     command = f"""
37 |     gsmap quick_mode \
38 |         --workdir '{config.workdir}' \
39 |         --homolog_file '{config.homolog_file}' \
40 |         --sample_name {config.sample_name} \
41 |         --gsMap_resource_dir '{config.gsMap_resource_dir}' \
42 |         --hdf5_path '{config.hdf5_path}' \
43 |         --annotation '{config.annotation}' \
44 |         --data_layer '{config.data_layer}' \
45 |         --sumstats_file '{config.sumstats_file}' \
46 |         --trait_name '{config.trait_name}' \
47 |         --max_processes {config.max_processes}
48 |     """
49 | 
50 |     with patch.object(sys, "argv", parse_bash_command(command)):
51 |         main()
52 | 
53 |     # Verify output files and directories
54 |     # Verify spatial_ldsc step
55 |     spatial_ldsc_result = config.get_ldsc_result_file(config.trait_name)
56 |     assert spatial_ldsc_result.exists(), "Spatial LDSC results not created"
57 |     assert spatial_ldsc_result.stat().st_size > 0, "Spatial LDSC results file is empty"
58 | 
59 |     # Verify cauchy_combination step
60 |     cauchy_result = config.get_cauchy_result_file(config.trait_name)
61 |     assert cauchy_result.exists(), "Cauchy combination results not created"
62 |     assert cauchy_result.stat().st_size > 0, "Cauchy combination results file is empty"
63 | 
64 |     # Verify report generation
65 |     report_file = config.get_gsMap_report_file(config.trait_name)
66 |     assert report_file.exists(), "Final report not created"
67 |     assert report_file.stat().st_size > 0, "Final report file is empty"
68 | 
69 |     logger.info("Quick mode pipeline test completed successfully")
70 | 


--------------------------------------------------------------------------------
/tests/test_docs_cli_parsing.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import shlex
  3 | from pathlib import Path
  4 | 
  5 | import pytest
  6 | 
  7 | from gsMap.main import create_parser
  8 | 
  9 | 
 10 | # Import your original functions
 11 | def extract_bash_blocks(markdown_text):
 12 |     bash_block_pattern = r"""
 13 |         (?:
 14 |             (?:```|~~~)(?:bash|shell|sh){1,}\s*\n
 15 |             (.*?)
 16 |             (?:```|~~~)
 17 |         )
 18 |     """
 19 |     blocks = re.finditer(bash_block_pattern, markdown_text, re.VERBOSE | re.DOTALL)
 20 |     return [block.group(1).strip() for block in blocks]
 21 | 
 22 | 
 23 | def parse_gsmap_commands(bash_script):
 24 |     def join_multiline_commands(script):
 25 |         lines = script.split("\n")
 26 |         joined_lines = []
 27 |         current_line = ""
 28 | 
 29 |         for line in lines:
 30 |             line = line.strip()
 31 |             if not line or line.startswith("#"):
 32 |                 if current_line:
 33 |                     joined_lines.append(current_line)
 34 |                     current_line = ""
 35 |                 continue
 36 | 
 37 |             if line.endswith("\\"):
 38 |                 current_line += line[:-1].strip() + " "
 39 |             else:
 40 |                 current_line += line
 41 |                 joined_lines.append(current_line)
 42 |                 current_line = ""
 43 | 
 44 |         if current_line:
 45 |             joined_lines.append(current_line)
 46 | 
 47 |         return "\n".join(joined_lines)
 48 | 
 49 |     gsmap_pattern = r"""
 50 |         \b(?:\/?\w+\/)*gsmap
 51 |         (?:\.(?:exe|sh))?
 52 |         \s+
 53 |         (.*)
 54 |     """
 55 | 
 56 |     processed_script = join_multiline_commands(bash_script)
 57 |     matches = re.finditer(gsmap_pattern, processed_script, re.VERBOSE)
 58 | 
 59 |     gsmap_commands = []
 60 |     for match in matches:
 61 |         full_command = match.group(0).strip()
 62 |         args_str = match.group(1).strip()
 63 |         args = parse_bash_command(args_str)
 64 |         gsmap_commands.append({"full_command": full_command, "arguments": args})
 65 | 
 66 |     return gsmap_commands
 67 | 
 68 | 
 69 | def parse_markdown_gsmap_commands(markdown_text):
 70 |     all_commands = []
 71 |     bash_blocks = extract_bash_blocks(markdown_text)
 72 | 
 73 |     for i, block in enumerate(bash_blocks, 1):
 74 |         commands = parse_gsmap_commands(block)
 75 |         if commands:
 76 |             all_commands.append({"block_number": i, "commands": commands})
 77 | 
 78 |     return all_commands
 79 | 
 80 | 
 81 | def parse_bash_command(command: str) -> list[str]:
 82 |     cleaned_command = command.replace("\\\n", " ")
 83 |     cleaned_command = " ".join(cleaned_command.splitlines())
 84 |     cleaned_command = " ".join(cleaned_command.split())
 85 |     return shlex.split(cleaned_command)
 86 | 
 87 | 
 88 | # Test fixtures
 89 | @pytest.fixture
 90 | def tutorial_files():
 91 |     return [
 92 |         "docs/source/advanced_usage.md",
 93 |         "docs/source/data_format.md",
 94 |         "docs/source/quick_mode.md",
 95 |         "docs/source/step_by_step.md",
 96 |     ]
 97 | 
 98 | 
 99 | @pytest.fixture
100 | def gsmap_parser():
101 |     return create_parser()
102 | 
103 | 
104 | # Test functions
105 | def test_markdown_files_exist(tutorial_files):
106 |     """Test if all documentation files exist"""
107 |     for file_path in tutorial_files:
108 |         assert Path(file_path).exists(), f"File {file_path} does not exist"
109 | 
110 | 
111 | def test_parse_commands_from_all_docs(tutorial_files, gsmap_parser):
112 |     """Test if all gsmap commands in documentation can be parsed"""
113 |     for file_path in tutorial_files:
114 |         markdown_content = Path(file_path).read_text()
115 |         parsed_commands = parse_markdown_gsmap_commands(markdown_content)
116 |         # Test each command block
117 |         for command_block in parsed_commands:
118 |             for cmd in command_block["commands"]:
119 |                 try:
120 |                     args = gsmap_parser.parse_args(cmd["arguments"])
121 |                     assert hasattr(args, "func"), (
122 |                         f"Command missing function handler: {cmd['full_command']}"
123 |                     )
124 |                 except SystemExit as e:
125 |                     pytest.fail(f"Failed to parse command: {cmd['full_command']}\n{e}")
126 | 


--------------------------------------------------------------------------------
/visualization_web_docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/visualization_web_docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/visualization_web_docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==6
2 | gsMap[doc]
3 | 


--------------------------------------------------------------------------------
/visualization_web_docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # import gsMap
 2 | project = "gsMap portal"
 3 | copyright = "2024, Liyang, Wenhao"
 4 | author = "Liyang, Wenhao"
 5 | # release = gsMap.__version__
 6 | 
 7 | # -- General configuration ---------------------------------------------------
 8 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 9 | 
10 | 
11 | extensions = [
12 |     "sphinx.ext.autodoc",
13 |     "sphinx.ext.autosummary",
14 |     "sphinx.ext.intersphinx",
15 |     "sphinx.ext.napoleon",
16 |     "sphinx.ext.viewcode",
17 |     "sphinx.ext.mathjax",
18 |     "sphinx_autodoc_typehints",
19 |     "sphinx_copybutton",
20 |     "sphinx.ext.viewcode",
21 |     "sphinxarg.ext",
22 |     "nbsphinx",
23 |     "myst_parser",
24 |     "sphinx_charts.charts",
25 |     "sphinxcontrib.jquery",
26 |     "sphinx_inline_tabs",
27 | ]
28 | 
29 | exclude_patterns = []
30 | 
31 | 
32 | # -- Options for HTML output -------------------------------------------------
33 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
34 | 
35 | # html_theme = 'alabaster'
36 | # html_theme = 'classic'
37 | # html_theme = 'sphinx_rtd_theme'
38 | # html_theme = "pydata_sphinx_theme"
39 | html_theme = "furo"
40 | html_static_path = ["_static"]
41 | templates_path = ["_templates"]
42 | 
43 | html_theme_options = {
44 |     # "light_css_variables": {
45 |     #     "color-brand-primary": "#7C4DFF",
46 |     #     "color-brand-content": "#7C4DFF",
47 |     #     "color-code-background": "#f5f5f5",
48 |     # },
49 | }
50 | 
51 | # add plotly.js to the build
52 | html_js_files = [
53 |     "https://cdn.plot.ly/plotly-latest.min.js",
54 | ]
55 | 
56 | rst_epilog = "\n.. include:: .special.rst\n"
57 | 


--------------------------------------------------------------------------------
/visualization_web_docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | gsMap visualization portal
 2 | ===================================
 3 | We developed gsMap to integrate spatial transcriptomics (ST) data with genome-wide association study (GWAS) summary data, enabling the mapping of spatial distributions of cells associated with human complex traits. gsMap has been applied to ST data from human embryos, mouse embryos, the macaque brain cortex, and 110 GWAS complex traits. To facilitate convenient visualization and re-analysis of our results, we created an online gsMap visualization platform. This documentation guides you on how to utilize the online visualization platform.
 4 | 
 5 | Tutorial Video
 6 | ------------
 7 | [here]
 8 | 
 9 | How to use the gsMap visualization portal
10 | ------------
11 | 
12 | We provide three ST datasets: human embryo, mouse embryo, and macaque brain cortex. You can select a dataset by clicking on its name.
13 | 
14 | **Step 1**: Once you have selected an ST dataset, the left navigation bar will display all available traits. Click on a trait's name to select it. Within each ST dataset, there are multiple ST sections; select a section by clicking its name. All results can be downloaded by clicking the `download` button to save both the results and the current figure. On the top left, we provide options to adjust the number and size of spots in ST sections with many spots.
15 | 
16 | .. image:: _static/raw1_add_txt.svg
17 |    :width: 800
18 |    :alt: Model architecture
19 | 
20 | |
21 | 
22 | **Step 2**: After selecting your ST section and traits, the gsMap mapping results will be displayed on the main page. The color represents the significance of the association (−log10 P-value) between the spots and the trait. Hovering over a spot will reveal the specific cell type (or tissue) annotation, spatial coordinates, and association value for that spot. You can adjust the p-value bar to set a threshold and display only the spots that meet the specified significance level.
23 | 
24 | .. image:: _static/raw2_add_txt.svg
25 |    :width: 800
26 |    :alt: Model architecture
27 | 
28 | |
29 | 
30 | **Step 3**: On the right side, you will see the cell type or tissue annotations for this ST dataset. Click the `switch` button to change the spot colors from trait-association significance to spot annotations. Additionally, click the `cell-type` button to select spots belonging to specific annotations. You can select multiple cell types to display results for the annotations you are interested in.
31 | 
32 | .. image:: _static/raw3_add_txt.svg
33 |    :width: 800
34 |    :alt: Model architecture
35 | .. image:: _static/raw4_add_txt.svg
36 |    :width: 800
37 |    :alt: Model architecture
38 | |
39 | 
40 | **Step 4**: To compare gsMap results across different ST sections or traits, click the `compare` button and select the ST sections and traits you wish to compare.
41 | 
42 | .. image:: _static/raw5_add_txt.svg
43 |    :width: 800
44 |    :alt: Model architecture
45 | 


--------------------------------------------------------------------------------