├── .github
    ├── FUNDING.yml
    ├── release_message.sh
    └── workflows
    │   ├── main.yml
    │   └── release.yml
├── .gitignore
├── .gitmodules
├── CONTRIBUTING.md
├── Dockerfile
├── HISTORY.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── R
    ├── Celligner_helpers.R
    ├── Celligner_methods.R
    ├── DESCRIPTION
    ├── Dockerfile
    ├── NAMESPACE
    ├── README.md
    ├── global_params.R
    ├── install_packages.R
    └── mutlidataset_alignment.R
├── README.md
├── build_docker.sh
├── celligner
    ├── VERSION
    ├── __init__.py
    ├── limma.py
    └── params.py
├── celligner_output.ipynb
├── docs
    ├── Screenshot 2021-10-29 at 10.51.53.png
    ├── Screenshot 2021-10-29 at 10.53.01.png
    ├── celligner.md
    ├── celligner_diagram.png
    ├── celligner_public22q2.png
    ├── example.html
    ├── example.pdf
    ├── index.html
    ├── index.md
    └── typical_celligner.webp
├── install_submodules_and_run.sh
├── man
    ├── calc_gene_stats.Rd
    ├── calc_tumor_CL_cor.Rd
    ├── check_NAs.Rd
    ├── cluster_data.Rd
    ├── create_Seurat_object.Rd
    ├── dot-average_correction.Rd
    ├── dot-center_along_batch_vector.Rd
    ├── dot-compute_tricube_average.Rd
    ├── dot-tricube_weighted_correction.Rd
    ├── find_differentially_expressed_genes.Rd
    ├── get_cluster_averages.Rd
    ├── load_additional_data.Rd
    ├── load_data.Rd
    ├── modified_mnnCorrect.Rd
    ├── run_Celligner.Rd
    ├── run_MNN.Rd
    ├── run_cPCA.Rd
    ├── run_cPCA_analysis.Rd
    ├── run_lm_stats_limma_group.Rd
    └── run_multidataset_alignment.Rd
├── mkdocs.yml
├── requirements.txt
├── run_celligner.py
├── run_celligner_multi_dataset.py
├── run_on_sparkles.sh
├── runs
    ├── 22Q1-newmerging.ipynb
    ├── 22Q1.ipynb
    ├── 22Q2.ipynb
    ├── CCLF-manuscript.ipynb
    ├── cclf_analysisCelligner_plot_scatter.html
    ├── cclf_color_analysisCelligner_plot_scatter.html
    └── testing.ipynb
└── setup.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: [broadinstitute]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.github/release_message.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | previous_tag=$(git tag --sort=-creatordate | sed -n 2p)  # second-newest tag by creation date; empty when fewer than two tags exist
3 | git shortlog "${previous_tag:+${previous_tag}..}HEAD" | sed 's/^./    &/'  # no previous tag: summarize the full history; indent non-empty lines by four spaces
4 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # This is a basic workflow to help you get started with Actions
 2 | 
 3 | name: CI
 4 | 
 5 | # Controls when the workflow will run
 6 | on:
 7 |   # Triggers the workflow on push or pull request events but only for the main branch
 8 |   push:
 9 |     branches: [ main ]
10 |   pull_request:
11 |     branches: [ main ]
12 | 
13 |   # Allows you to run this workflow manually from the Actions tab
14 |   workflow_dispatch:
15 | 
16 | jobs:
17 |   linter:  # static checks (make lint); every test job in this workflow declares `needs: linter`
18 |     strategy:
19 |       fail-fast: false
20 |       matrix:
21 |         python-version: ["3.9"]  # quoted so YAML keeps it a string (an unquoted 3.10 would parse as 3.1)
22 |         os: [ubuntu-latest]
23 |     runs-on: ${{ matrix.os }}
24 |     steps:
25 |       - uses: actions/checkout@v2
26 |       - uses: actions/setup-python@v2
27 |         with:
28 |           python-version: ${{ matrix.python-version }}
29 |       - name: Install project
30 |         run: make install
31 |       - name: Run linter
32 |         run: make lint
33 | 
34 |   tests_linux:  # test suite on Linux; the only job that uploads coverage
35 |     needs: linter
36 |     strategy:
37 |       fail-fast: false
38 |       matrix:
39 |         python-version: ["3.9"]  # quoted so YAML keeps it a string (an unquoted 3.10 would parse as 3.1)
40 |         os: [ubuntu-latest]
41 |     runs-on: ${{ matrix.os }}
42 |     steps:
43 |       - uses: actions/checkout@v2
44 |       - uses: actions/setup-python@v2
45 |         with:
46 |           python-version: ${{ matrix.python-version }}
47 |       - name: Install project
48 |         run: make install
49 |       - name: Run tests
50 |         run: make test
51 |       - name: "Upload coverage to Codecov"
52 |         uses: codecov/codecov-action@v1  # NOTE(review): v1 is an old major version of this action; confirm it still uploads
53 |         # with:
54 |         #   fail_ci_if_error: true
55 | 
56 |   tests_mac:  # test suite on macOS (same steps as tests_linux, without the coverage upload)
57 |     needs: linter
58 |     strategy:
59 |       fail-fast: false
60 |       matrix:
61 |         python-version: ["3.9"]  # quoted so YAML keeps it a string (an unquoted 3.10 would parse as 3.1)
62 |         os: [macos-latest]
63 |     runs-on: ${{ matrix.os }}
64 |     steps:
65 |       - uses: actions/checkout@v2
66 |       - uses: actions/setup-python@v2
67 |         with:
68 |           python-version: ${{ matrix.python-version }}
69 |       - name: Install project
70 |         run: make install
71 |       - name: Run tests
72 |         run: make test
73 | 
74 |   tests_win:  # test suite on Windows; calls pip/pytest directly instead of going through the Makefile
75 |     needs: linter
76 |     strategy:
77 |       fail-fast: false
78 |       matrix:
79 |         python-version: ["3.9"]  # quoted so YAML keeps it a string (an unquoted 3.10 would parse as 3.1)
80 |         os: [windows-latest]
81 |     runs-on: ${{ matrix.os }}
82 |     steps:
83 |       - uses: actions/checkout@v2
84 |       - uses: actions/setup-python@v2
85 |         with:
86 |           python-version: ${{ matrix.python-version }}
87 |       - name: Install Pip
88 |         run: pip install --user --upgrade pip
89 |       - name: Install project
90 |         run: pip install -e .[test]
91 |       - name: run tests
92 |         run: pytest -s -vvvv -l --tb=long tests
93 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on:
 4 |   push:
 5 |     # Sequence of patterns matched against refs/tags
 6 |     tags:
 7 |       - '*'  # Push events for any tag, e.g. 1.1.0 (releases are tagged X.Y.Z)
 8 | 
 9 |   # Allows you to run this workflow manually from the Actions tab
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   release:
14 |     name: Create Release
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: actions/checkout@v2
18 |         with:
19 |           # by default, it uses a depth of 1
20 |           # this fetches all history so that we can read each commit
21 |           fetch-depth: 0
22 |       - name: Generate Changelog
23 |         run: .github/release_message.sh > release_message.md
24 |       - name: Release
25 |         uses: softprops/action-gh-release@v1
26 |         with:
27 |           body_path: release_message.md
28 | 
29 |   deploy:  # builds the sdist and wheel and uploads them with twine once the release job has succeeded
30 |     needs: release
31 |     runs-on: ubuntu-latest
32 |     steps:
33 |       - uses: actions/checkout@v2  # same action major version as the release job above
34 |       - name: Set up Python
35 |         uses: actions/setup-python@v2
36 |         with:
37 |           python-version: '3.x'
38 |       - name: Install dependencies
39 |         run: |
40 |           python -m pip install --upgrade pip
41 |           pip install setuptools wheel twine
42 |       - name: Build and publish
43 |         env:
44 |           TWINE_USERNAME: __token__
45 |           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}  # repository secret; must be named exactly PYPI_API_TOKEN
46 |         run: |
47 |           python setup.py sdist bdist_wheel
48 |           twine upload dist/*
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by https://www.gitignore.io/api/r,macos,python,sublimetext
  2 | 
  3 | .code-workspace.code-workspace
  4 | # Edit at https://www.gitignore.io/?templates=r,macos,python,sublimetext
  5 | ### PERSO ###
  6 | data/*
  7 | **.so
  8 | *.code-workspace
  9 | ### macOS ###
 10 | # General
 11 | .DS_Store
 12 | .AppleDouble
 13 | .LSOverride
 14 | .vscode
 15 | 
 16 | # output model
 17 | *.pkl
 18 | 
 19 | # sparkles
 20 | .kubeque-cached-file-hashes
 21 | .sparkles-cache
 22 | 
 23 | # Icon must end with two \r
 24 | Icon
 25 | 
 26 | # Thumbnails
 27 | ._*
 28 | temp/*
 29 | 
 30 | # Files that might appear in the root of a volume
 31 | .DocumentRevisions-V100
 32 | .fseventsd
 33 | .Spotlight-V100
 34 | .TemporaryItems
 35 | .Trashes
 36 | .VolumeIcon.icns
 37 | .com.apple.timemachine.donotpresent
 38 | 
 39 | # Directories potentially created on remote AFP share
 40 | .AppleDB
 41 | .AppleDesktop
 42 | Network Trash Folder
 43 | Temporary Items
 44 | .apdisk
 45 | 
 46 | ### Python ###
 47 | # Byte-compiled / optimized / DLL files
 48 | __pycache__/
 49 | *.py[cod]
 50 | *$py.class
 51 | 
 52 | # C extensions
 53 | *.so
 54 | 
 55 | # Distribution / packaging
 56 | .Python
 57 | build/
 58 | develop-eggs/
 59 | dist/
 60 | downloads/
 61 | eggs/
 62 | .eggs/
 63 | lib/
 64 | lib64/
 65 | parts/
 66 | sdist/
 67 | var/
 68 | wheels/
 69 | pip-wheel-metadata/
 70 | share/python-wheels/
 71 | *.egg-info/
 72 | .installed.cfg
 73 | *.egg
 74 | MANIFEST
 75 | 
 76 | # PyInstaller
 77 | #  Usually these files are written by a python script from a template
 78 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 79 | *.manifest
 80 | *.spec
 81 | 
 82 | # Installer logs
 83 | pip-log.txt
 84 | pip-delete-this-directory.txt
 85 | 
 86 | # Unit test / coverage reports
 87 | htmlcov/
 88 | .tox/
 89 | .nox/
 90 | .coverage
 91 | .coverage.*
 92 | .cache
 93 | nosetests.xml
 94 | coverage.xml
 95 | *.cover
 96 | .hypothesis/
 97 | .pytest_cache/
 98 | 
 99 | # Translations
100 | *.mo
101 | *.pot
102 | 
103 | # Django stuff:
104 | *.log
105 | local_settings.py
106 | db.sqlite3
107 | 
108 | # Flask stuff:
109 | instance/
110 | .webassets-cache
111 | 
112 | # Scrapy stuff:
113 | .scrapy
114 | 
115 | # Sphinx documentation
116 | docs/_build/
117 | 
118 | # PyBuilder
119 | target/
120 | 
121 | # Jupyter Notebook
122 | .ipynb_checkpoints
123 | 
124 | # IPython
125 | profile_default/
126 | ipython_config.py
127 | 
128 | # pyenv
129 | .python-version
130 | 
131 | # pipenv
132 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
133 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
134 | #   having no cross-platform support, pipenv may install dependencies that don’t work, or not
135 | #   install all needed dependencies.
136 | #Pipfile.lock
137 | 
138 | # celery beat schedule file
139 | celerybeat-schedule
140 | 
141 | # SageMath parsed files
142 | *.sage.py
143 | 
144 | # Environments
145 | .env
146 | .venv
147 | env/
148 | venv/
149 | ENV/
150 | env.bak/
151 | venv.bak/
152 | 
153 | # Spyder project settings
154 | .spyderproject
155 | .spyproject
156 | 
157 | # Rope project settings
158 | .ropeproject
159 | 
160 | # mkdocs documentation
161 | /site
162 | 
163 | # mypy
164 | .mypy_cache/
165 | .dmypy.json
166 | dmypy.json
167 | 
168 | # Pyre type checker
169 | .pyre/
170 | 
171 | ### R ###
172 | # History files
173 | .Rhistory
174 | .Rapp.history
175 | 
176 | # Session Data files
177 | .RData
178 | 
179 | # User-specific files
180 | .Ruserdata
181 | 
182 | # Example code in package build process
183 | *-Ex.R
184 | 
185 | # Output files from R CMD build
186 | /*.tar.gz
187 | 
188 | # Output files from R CMD check
189 | /*.Rcheck/
190 | 
191 | # RStudio files
192 | .Rproj.user/
193 | 
194 | # produced vignettes
195 | vignettes/*.html
196 | vignettes/*.pdf
197 | 
198 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
199 | .httr-oauth
200 | 
201 | # knitr and R markdown default cache directories
202 | /*_cache/
203 | /cache/
204 | 
205 | # Temporary files created by R markdown
206 | *.utf8.md
207 | *.knit.md
208 | 
209 | ### R.Bookdown Stack ###
210 | # R package: bookdown caching files
211 | /*_files/
212 | 
213 | ### SublimeText ###
214 | # Cache files for Sublime Text
215 | *.tmlanguage.cache
216 | *.tmPreferences.cache
217 | *.stTheme.cache
218 | 
219 | # Workspace files are user-specific
220 | *.sublime-workspace
221 | 
222 | # Project files should be checked into the repository, unless a significant
223 | # proportion of contributors will probably not be using Sublime Text
224 | # *.sublime-project
225 | 
226 | # SFTP configuration file
227 | sftp-config.json
228 | 
229 | # Package control specific files
230 | Package Control.last-run
231 | Package Control.ca-list
232 | Package Control.ca-bundle
233 | Package Control.system-ca-bundle
234 | Package Control.cache/
235 | Package Control.ca-certs/
236 | Package Control.merged-ca-bundle
237 | Package Control.user-ca-bundle
238 | oscrypto-ca-bundle.crt
239 | bh_unicode_properties.cache
240 | 
241 | # Sublime-github package stores a github token in this file
242 | # https://packagecontrol.io/packages/sublime-github
243 | GitHub.sublime-settings
244 | 
245 | # tmp files
246 | tmp.py
247 | 
248 | # End of https://www.gitignore.io/api/r,macos,python,sublimetext
249 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "mnnpy"]
2 | 	path = mnnpy
3 | 	url = git@github.com:DeKegel/mnnpy.git
4 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # How to develop on this project
  2 | 
  3 | celligner welcomes contributions from the community.
  4 | 
  5 | **You need PYTHON3!**
  6 | 
  7 | These instructions are for Unix-like systems (Linux, macOS, BSD, etc.).
  8 | ## Setting up your own fork of this repo.
  9 | 
 10 | - On github interface click on `Fork` button.
 11 | - Clone your fork of this repo. `git clone git@github.com:YOUR_GIT_USERNAME/celligner.git`
 12 | - Enter the directory `cd celligner`
 13 | - Add upstream repo `git remote add upstream https://github.com/broadinstitute/celligner`
 14 | 
 15 | ## Setting up your own virtual environment
 16 | 
 17 | Run `make virtualenv` to create a virtual environment.
 18 | then activate it with `source .venv/bin/activate`.
 19 | 
 20 | ## Install the project in develop mode
 21 | 
 22 | Run `make install` to install the project in develop mode.
 23 | 
 24 | ## Run the tests to ensure everything is working
 25 | 
 26 | Run `make test` to run the tests.
 27 | 
 28 | ## Create a new branch to work on your contribution
 29 | 
 30 | Run `git checkout -b my_contribution`
 31 | 
 32 | ## Make your changes
 33 | 
 34 | Edit the files using your preferred editor. (we recommend VIM or VSCode)
 35 | 
 36 | ## Format the code
 37 | 
 38 | Run `make fmt` to format the code.
 39 | 
 40 | ## Run the linter
 41 | 
 42 | Run `make lint` to run the linter.
 43 | 
 44 | ## Test your changes
 45 | 
 46 | Run `make test` to run the tests.
 47 | 
 48 | Ensure the code coverage report shows `100%` coverage; if it does not, add tests to your PR.
 49 | 
 50 | ## Build the docs locally
 51 | 
 52 | Run `make docs` to build the docs.
 53 | 
 54 | Ensure your new changes are documented.
 55 | 
 56 | ## Commit your changes
 57 | 
 58 | This project uses [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
 59 | 
 60 | Example: `fix(package): update setup.py arguments 🎉` (emojis are fine too)
 61 | 
 62 | ## Push your changes to your fork
 63 | 
 64 | Run `git push origin my_contribution`
 65 | 
 66 | ## Submit a pull request
 67 | 
 68 | On github interface, click on `Pull Request` button.
 69 | 
 70 | Wait for CI to run; one of the developers will then review your PR.
 71 | ## Makefile utilities
 72 | 
 73 | This project comes with a `Makefile` that contains a number of useful utilities.
 74 | 
 75 | ```bash 
 76 | ❯ make
 77 | Usage: make <target>
 78 | 
 79 | Targets:
 80 | help:             ## Show the help.
 81 | install:          ## Install the project in dev mode.
 82 | fmt:              ## Format code using black & isort.
 83 | lint:             ## Run pep8, black, mypy linters.
 84 | test: lint        ## Run tests and generate coverage report.
 85 | watch:            ## Run tests on every change.
 86 | clean:            ## Clean unused files.
 87 | virtualenv:       ## Create a virtual environment.
 88 | release:          ## Create a new tag for release.
 89 | docs:             ## Build the documentation.
 90 | switch-to-poetry: ## Switch to poetry package manager.
 91 | init:             ## Initialize the project based on an application template.
 92 | ```
 93 | 
 94 | ## Making a new release
 95 | 
 96 | This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z`
 97 | Every time a new tag is created and pushed to the remote repo, github actions will
 98 | automatically create a new release on github and trigger a release on PyPI.
 99 | 
100 | For this to work you need to set up a secret called `PYPI_API_TOKEN` in the project's Settings > Secrets;
101 | this token can be generated on [pypi.org](https://pypi.org/account/).
102 | 
103 | To trigger a new release, all you need to do is:
104 | 
105 | 1. If you have changes to add to the repo
106 |     * Make your changes following the steps described above.
107 |     * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
108 | 2. Run the tests to ensure everything is working.
109 | 3. Run `make release` to create a new tag and push it to the remote repo.
110 | 
111 | `make release` will ask you for the version number to use for the tag, e.g. type `0.1.1` when prompted.
112 | 
113 | > **CAUTION**: `make release` rewrites `HISTORY.md` and `celligner/VERSION` and commits them together with any changes you have already staged, so make sure your index is clean before running it.
114 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Dockerfile to create celligner image
 4 | #
 5 | # Run build_docker.sh
 6 | 
 7 | FROM python:3.8
 8 | 
 9 | # add R and CMake, then drop the apt package lists so they do not stay in the image layer
10 | RUN apt-get update && apt-get install -y r-base cmake && rm -rf /var/lib/apt/lists/*
11 | # install the limma Bioconductor package through BiocManager (CRAN mirror reached over HTTPS)
12 | RUN R -e 'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="https://cran.us.r-project.org")};BiocManager::install("limma");'
13 | 
14 | # install the Python requirements without keeping pip's download cache in the image
15 | COPY requirements.txt .
16 | RUN pip install --upgrade pip &&\
17 | 	pip install --no-cache-dir -r requirements.txt


--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
  1 | Changelog
  2 | =========
  3 | 
  4 | 
  5 | (unreleased)
  6 | ------------
  7 | - Release: version  🚀 [Jérémie Kalfon]
  8 | - Update setup.py. [Jérémie Kalfon]
  9 | - Update requirements.txt. [Jérémie Kalfon]
 10 | - Update README.md. [Jérémie Kalfon]
 11 | - Update README.md. [Jérémie Kalfon]
 12 | - Release: version 1.1.0 🚀 [jkobject]
 13 | 
 14 | 
 15 | 1.1.0 (2022-04-01)
 16 | ------------------
 17 | - Update to the notebooks. [jkobject]
 18 | - Quick debug and adding more datasets. [jkobject]
 19 | - Merge branch 'master' of https://github.com/broadinstitute/celligner.
 20 |   [jkobject]
 21 | - Update README.md. [Jérémie Kalfon]
 22 | - Some trials with the release, making new release too. [jkobject]
 23 | - Som debugs and reformating. [jkobject]
 24 | - Again. [jkobject]
 25 | - Better viz of new dataset. [jkobject]
 26 | - CCLF analysis. [jkobject]
 27 | - Updating the demo with the tests. [jkobject]
 28 | - Some small improvements. [jkobject]
 29 | - Adding a new release notebook. [jkobject]
 30 | - Format. [jkobject]
 31 | - Updating formatting. [jkobject]
 32 | - Debugging limma when different version of rpy2. [jkobject]
 33 | - Update README.md. [Jérémie Kalfon]
 34 | - Merge pull request #3 from broadinstitute/dev. [Jérémie Kalfon]
 35 | 
 36 |   Update README.md
 37 | - From javad's comment. [jkobject]
 38 | - Update README.md. [Jérémie Kalfon]
 39 | - Merge branch 'master' into dev. [Jérémie Kalfon]
 40 | - Update README.md. [Jérémie Kalfon]
 41 | - Adding a bit of doc and WIP on QC. [jkobject]
 42 | - Release: version 1.0.1 🚀 [jkobject]
 43 | 
 44 | 
 45 | 1.0.1 (2021-10-27)
 46 | ------------------
 47 | - Cleanup. [jkobject]
 48 | - Finishing examples. [jkobject]
 49 | - Release: version 1.0.0 🚀 [jkobject]
 50 | 
 51 | 
 52 | 1.0.0 (2021-10-27)
 53 | ------------------
 54 | - Finishig multidataseet alignment and final debugs. [jkobject]
 55 | - Release: version 0.9.3 🚀 [jkobject]
 56 | 
 57 | 
 58 | 0.9.3 (2021-10-25)
 59 | ------------------
 60 | - Release: version 0.9.2 🚀 [jkobject]
 61 | 
 62 | 
 63 | 0.9.2 (2021-10-25)
 64 | ------------------
 65 | - Merge pull request #2 from jkobject/master. [Jérémie Kalfon]
 66 | 
 67 |   tomerge
 68 | - Release: version 0.9.1 🚀 [jkobject]
 69 | - Merge pull request #1 from jkobject/dev. [Jérémie Kalfon]
 70 | - Merge branch 'master' of https://github.com/broadinstitute/celligner.
 71 |   [Jérémie Kalfon]
 72 | - Adding data and planning. [Jérémie Kalfon]
 73 | 
 74 | 
 75 | 0.9.1 (2021-10-25)
 76 | ------------------
 77 | - Nthi. [jkobject]
 78 | - Nothing really. [jkobject]
 79 | - Better readme. [jkobject]
 80 | - Merge branch 'dev' of https://github.com/jkobject/celligner into dev.
 81 |   [jkobject]
 82 | - Update .github/FUNDING.yml. [Jérémie Kalfon]
 83 | - Delete rename_project.yml. [Jérémie Kalfon]
 84 | - Delete rename_project.sh. [Jérémie Kalfon]
 85 | - Delete init.sh. [Jérémie Kalfon]
 86 | - Adding doc and remving snn. [jkobject]
 87 | - Release: version 0.9.0 🚀 [jkobject]
 88 | 
 89 | 
 90 | 0.9.0 (2021-10-24)
 91 | ------------------
 92 | - Updating mnn too. [jkobject]
 93 | - More on celligner. [jkobject]
 94 | - Continuing celligner debug. [jkobject]
 95 | - Debuging marioni 1/n. [jkobject]
 96 | - Updating contrastive and mnnpy. [jkobject]
 97 | - Making it more prod ready. [jkobject]
 98 | - Making a prod version. [jkobject]
 99 | - Finishing debugging. [jkobject]
100 | - Cont. [jkobject]
101 | - Coontinuing. [jkobject]
102 | - Make it more productionalized. [jkobject]
103 | - Improving. [jkobject]
104 | - Continuing adding new mnn version (trying rgular mnn) [jkobject]
105 | - Making more changes. [jkobject]
106 | - Adding too giti. [jkobject]
107 | - Finish debug yay!! [jkobject]
108 | - Finish debug yay!! [jkobject]
109 | - Continuing debugs. [jkobject]
110 | - Making demo. [jkobject]
111 | - Coninuing work. [jkobject]
112 | - Redoing. [jkobject]
113 | - Cleanup. [jkobject]
114 | - WIP on celligner python. [jkobject]
115 | - WIP on python's celigner. [jkobject]
116 | - Update README.md. [Jérémie Kalfon]
117 | - Better doc. [Jérémie Kalfon]
118 | - Update to README on multidataset. [Allie Warren]
119 | - Adding code to run Celligner with additional datasets. [Allie Warren]
120 | - Adding statements to clear unused objects to reduce memory. [Allie
121 |   Warren]
122 | - Adding code to install required packages and adding imports to
123 |   description. [Allie Warren]
124 | - Merge branch 'master' of https://github.com/broadinstitute/celligner.
125 |   [Allie Warren]
126 | - Create README. [acwarren]
127 | - Adding plots to output. [Allie Warren]
128 | - Add documentation to helper methods and add test for NAs in data.
129 |   [Allie Warren]
130 | - Add manual for methods. [Allie Warren]
131 | - Added error tests and updated documentation. [Allie Warren]
132 | - Creating repository for celligner package. [Allie Warren]
133 | 
134 | 
135 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | This is free and unencumbered software released into the public domain.
 3 | 
 4 | Anyone is free to copy, modify, publish, use, compile, sell, or
 5 | distribute this software, either in source code form or as a compiled
 6 | binary, for any purpose, commercial or non-commercial, and by any
 7 | means.
 8 | 
 9 | In jurisdictions that recognize copyright laws, the author or authors
10 | of this software dedicate any and all copyright interest in the
11 | software to the public domain. We make this dedication for the benefit
12 | of the public at large and to the detriment of our heirs and
13 | successors. We intend this dedication to be an overt act of
14 | relinquishment in perpetuity of all present and future rights to this
15 | software under copyright law.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 | OTHER DEALINGS IN THE SOFTWARE.
24 | 
25 | For more information, please refer to <https://unlicense.org>
26 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include HISTORY.md
3 | include Containerfile
4 | graft tests
5 | graft celligner
6 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | .ONESHELL:
  2 | ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')")
  3 | USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes")
  4 | 
  5 | .PHONY: help
  6 | help:             ## Show the help.
  7 | 	@echo "Usage: make <target>"
  8 | 	@echo ""
  9 | 	@echo "Targets:"
 10 | 	@grep -F "##" Makefile | grep -Fv "grep -F"  # print the annotated target lines, minus this recipe line itself
 11 | 
 12 | 
 13 | .PHONY: show
 14 | show:             ## Show the current environment.
 15 | 	@echo "Current environment:"
 16 | 	@if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi
 17 | 	@echo "Running using $(ENV_PREFIX)"
 18 | 	@$(ENV_PREFIX)python -V
 19 | 	@$(ENV_PREFIX)python -m site
 20 | 
 21 | .PHONY: install
 22 | install:          ## Install the project in dev mode.
 23 | 	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
 24 | 	@echo "Don't forget to run 'make virtualenv' if you got errors."
 25 | 	$(ENV_PREFIX)pip install -e .[test]
 26 | 
 27 | .PHONY: fmt
 28 | fmt:              ## Format code using black & isort.
 29 | 	$(ENV_PREFIX)isort celligner/
 30 | 	$(ENV_PREFIX)black -l 79 celligner/
 31 | 	$(ENV_PREFIX)black -l 79 tests/
 32 | 
 33 | .PHONY: lint
 34 | lint:             ## Run pep8, black, mypy linters.
 35 | 	$(ENV_PREFIX)flake8 celligner/
 36 | 	$(ENV_PREFIX)black -l 79 --check celligner/
 37 | 	$(ENV_PREFIX)black -l 79 --check tests/
 38 | 	$(ENV_PREFIX)mypy --ignore-missing-imports celligner/
 39 | 
 40 | .PHONY: test
 41 | test: lint        ## Run tests and generate coverage report.
 42 | 	$(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=celligner -l --tb=short --maxfail=1 tests/
 43 | 	$(ENV_PREFIX)coverage xml
 44 | 	$(ENV_PREFIX)coverage html
 45 | 
 46 | .PHONY: watch
 47 | watch:            ## Run tests on every change.
 48 | 	ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/
 49 | 
 50 | .PHONY: clean
 51 | clean:            ## Clean unused files.
 52 | 	@find ./ -name '*.pyc' -exec rm -f {} \;
 53 | 	@find ./ -name '__pycache__' -exec rm -rf {} \;
 54 | 	@find ./ -name 'Thumbs.db' -exec rm -f {} \;
 55 | 	@find ./ -name '*~' -exec rm -f {} \;
 56 | 	@rm -rf .cache
 57 | 	@rm -rf .pytest_cache
 58 | 	@rm -rf .mypy_cache
 59 | 	@rm -rf build
 60 | 	@rm -rf dist
 61 | 	@rm -rf *.egg-info
 62 | 	@rm -rf htmlcov
 63 | 	@rm -rf .tox/
 64 | 	@rm -rf docs/_build
 65 | 
 66 | .PHONY: virtualenv
 67 | virtualenv:       ## Create a virtual environment.
 68 | 	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
 69 | 	@echo "creating virtualenv ..."
 70 | 	@rm -rf .venv
 71 | 	@python3 -m venv .venv
 72 | 	@./.venv/bin/pip install -U pip
 73 | 	@./.venv/bin/pip install -e .[test]
 74 | 	@echo
 75 | 	@echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!"
 76 | 
 77 | .PHONY: release
 78 | release:          ## Create a new tag for release.
 79 | 	@echo "WARNING: This operation will create a version tag and push to github"
 80 | 	@printf "Version? (provide the next x.y.z semver) : "; read -r TAG  # POSIX prompt; TAG stays set on later lines because of .ONESHELL
 81 | 	@if [ -z "$${TAG}" ]; then echo "ERROR: no version provided, aborting." >&2; exit 1; fi
 82 | 	@echo "creating git tag : $${TAG}"; git tag "$${TAG}" || exit 1  # NOTE(review): the tag is created before the release commit below; confirm that is intended
 83 | 	@echo "$${TAG}" > celligner/VERSION
 84 | 	@$(ENV_PREFIX)gitchangelog > HISTORY.md
 85 | 	@git add celligner/VERSION HISTORY.md
 86 | 	@git commit -m "release: version $${TAG} 🚀"
 87 | 	@git push -u origin HEAD --tags
 88 | 	@echo "Github Actions will detect the new tag and release the new version."
 89 | 
 90 | .PHONY: docs
 91 | docs:             ## Build the documentation.
 92 | 	@echo "building documentation ..."
 93 | 	@jupyter nbconvert --to html Celligner_demo.ipynb --output docs/index.html
 94 | 	@$(ENV_PREFIX)mkdocs gh-deploy
 95 | 	URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL
 96 | 
 97 | .PHONY: switch-to-poetry
 98 | switch-to-poetry: ## Switch to poetry package manager.
 99 | 	@echo "Switching to poetry ..."
100 | 	@if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi
101 | 	@rm -rf .venv
102 | 	@poetry init --no-interaction --name=celligner --author=rochacbruno  # NOTE(review): author value comes from the project template; confirm
103 | 	@echo "" >> pyproject.toml
104 | 	@echo "[tool.poetry.scripts]" >> pyproject.toml
105 | 	@echo "celligner = 'celligner.__main__:main'" >> pyproject.toml
106 | 	@cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
107 | 	@cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
108 | 	@poetry install --no-interaction
109 | 	@mkdir -p .github/backup
110 | 	@mv requirements* .github/backup
111 | 	@mv setup.py .github/backup
112 | 	@echo "You have switched to https://python-poetry.org/ package manager."
113 | 	@echo "Please run 'poetry shell' or 'poetry run celligner'"
114 | 
115 | .PHONY: init
116 | init:             ## Initialize the project based on an application template.
117 | 	@if [ -x ./.github/init.sh ]; then ./.github/init.sh; else echo "ERROR: .github/init.sh not found or not executable; the template init script is not available in this checkout." >&2; exit 1; fi
118 | 
119 | 
120 | # This project has been generated from rochacbruno/python-project-template
121 | # __author__ = 'rochacbruno'
122 | # __repo__ = https://github.com/rochacbruno/python-project-template
123 | # __sponsor__ = https://github.com/sponsors/rochacbruno/
124 | 


--------------------------------------------------------------------------------
/R/Celligner_helpers.R:
--------------------------------------------------------------------------------
  1 | library(magrittr)
  2 | library(tidyverse)
  3 | 
  4 | #' Check for NAs in the expression data and remove samples with NAs
  5 | #' @name check_NAs
  6 | #'
  7 | #' @param mat matrix of gene expression data that is samples by genes
  8 | #' @return matrix of gene expression data, without the samples (rows) whose row sum is NA
  9 | #' @export
 10 | #'
 11 | check_NAs <- function(mat) {
 12 |   na_rows <- is.na(rowSums(mat))  # rowSums propagates NA, so a sample containing an NA has an NA sum
 13 |   if (any(na_rows)) {
 14 |     warning("Removing sample(s) due to NAs in the data")
 15 |     mat <- mat[!na_rows, , drop = FALSE]  # drop = FALSE keeps a matrix even when one sample is left
 16 |   }
 17 |   return(mat)
 18 | }
 19 | 
#'
#' Differentially expressed genes
#' @name run_lm_stats_limma_group
#'
#' @param mat Nxp data matrix of N cell lines and p genes
#' @param phenos independent variables for the N samples. Can be two-group labels as factors, bools, or can be numeric.
#' NOTE(review): the code reads rownames(phenos) and colnames(phenos), so a table with samples as row names
#' appears to be expected rather than a bare vector - confirm against callers.
#' @param covars optional Nxk matrix of sample covariates
#' @param weights optional N vector (or matrix) of precision weights for each data point
#' @param target_type name of the column variable in the data (default 'Gene')
#' @param limma_trend value passed as the `trend` argument of limma::eBayes (default FALSE)
#' @return table of gene level stats
#' @description  Estimate linear-model stats for a matrix of data with respect to a group of phenotype variables
#' using limma with empirical Bayes moderated F-stats for p-values
#' @export
#'
run_lm_stats_limma_group <- function (mat, phenos, covars = NULL, weights = NULL, target_type = "Gene",
          limma_trend = FALSE)
{
  require(limma)
  require(magrittr)
  require(tibble)
  require(plyr)
  require(dplyr)
  # samples shared by the expression matrix, the phenotypes and (if given) the covariates
  udata <- rownames(mat) %>% intersect(rownames(phenos))
  if (!is.null(covars)) {
    udata %<>% intersect(rownames(covars))
  }
  # design matrix built from every phenotype column
  form <- as.formula(paste("~", paste0(colnames(phenos), collapse = " + ")))
  design <- model.matrix(form, data = phenos[udata, , drop = F])
  if (!is.null(covars)) {
    # covariate design columns (without their intercept) are appended to the phenotype design
    covars <- data.frame(covars)
    form <- as.formula(paste("~", paste0(colnames(covars),
                                         collapse = " + ")))
    Cdesign <- model.matrix(form, data = covars[udata, ,
                                                drop = F])
    Cdesign <- Cdesign[, setdiff(colnames(Cdesign), "(Intercept)"),
                       drop = FALSE]
    # phenotype and covariate design columns must not share names
    stopifnot(length(intersect(colnames(Cdesign), colnames(design))) ==
                0)
    design %<>% cbind(Cdesign)
  }
  if (!is.null(weights)) {
    # restrict the weights to the shared samples; matrix weights are transposed like the data below
    if (is.matrix(weights)) {
      weights <- t(weights[udata, ])
    }
    else {
      weights <- weights[udata]
    }
  }
  # keep only design columns whose column sum is greater than 2
  design <- design[, colSums(design) > 2, drop = FALSE]
  # every remaining non-intercept column is tested jointly
  targ_coefs <- setdiff(colnames(design), "(Intercept)")
  # lmFit receives the data transposed (genes in rows)
  fit <- limma::lmFit(t(mat[udata, ]), design, weights = weights)
  fit <- limma::eBayes(fit, trend = limma_trend)
  targ_coef <- which(colnames(design) %in% targ_coefs)
  results <- limma::topTable(fit, coef = targ_coef, number = Inf,
                             sort.by = "F", genelist = colnames(mat))
  results %<>% tibble::rownames_to_column(var = target_type)
  # rename limma's columns, drop rows containing NA and remove the ProbeID column
  results %<>% magrittr::set_colnames(revalue(colnames(.), c(AveExpr = "Avg",
                                                   F = "F_stat", P.Value = "p.value", adj.P.Val = "q.value"))) %>%
    na.omit() %>% dplyr::select(-ProbeID)
  return(results)
}
 81 | 
 82 | #'
 83 | #' cPCA
 84 | #' @name run_cPCA_analysis
 85 | #'
 86 | #' @param TCGA_dat: sample by genes matrix of scaled expression data
 87 | #' @param CCLE_dat: sample by genes matrix of scaled expression data
 88 | #' @param tumor_cluster_df: table of sample metadata that includes a column 'seurat_clusters',
 89 | #'  containing transcriptional clusters in the TCGA data
 90 | #' @param CL_cluster_df: table of sample metadata that includes a column 'seurat_clusters',
 91 | #'  containing transcriptional clusters in the CCLE data
 92 | #' @param pc_dims: numbers of cPCs calculated. If set to NULL (default) all cPCs will be calculated, if set to a value
 93 | #' then that number of cPCs will be approximated. Values input should be >= 4.
 94 | #' @return contrastive principal component object containing cPC vectors and values
 95 | #' @description Run contrastive principal components analysis, first removing average cluster expression, to
 96 | # estimate the average intra-cluster covariance. If pc_dims = NULL, all cPCs are calculated. Faster cPCA can be run by setting pc_dims to a
 97 | # value >=4 and approximating just those cPCs.
 98 | #' @export
 99 | #'
100 | run_cPCA_analysis <- function(TCGA_dat, CCLE_dat, tumor_cluster_df, CL_cluster_df, pc_dims=NULL) {
101 |   tumor_clust_avgs <- get_cluster_averages(TCGA_dat, tumor_cluster_df)
102 |   CL_clust_avgs <- get_cluster_averages(CCLE_dat, CL_cluster_df)
103 | 
104 |   TCGA_subtype_ms <- TCGA_dat - tumor_clust_avgs[tumor_cluster_df$seurat_clusters,]
105 |   CCLE_subtype_ms <- CCLE_dat - CL_clust_avgs[CL_cluster_df$seurat_clusters,]
106 | 
107 |   TCGA_cov <- cov(TCGA_subtype_ms)
108 |   CCLE_cov <- cov(CCLE_subtype_ms)
109 | 
110 |   if(!is.null(pc_dims)) {
111 |     cov_diff_eig <- irlba::prcomp_irlba(TCGA_cov - CCLE_cov, n = pc_dims)
112 |   } else {
113 |     cov_diff_eig <- eigen(TCGA_cov - CCLE_cov)
114 |   }
115 |   return(cov_diff_eig)
116 | }
117 | 
#'
#' calculate the average expression per cluster
#' @name get_cluster_averages
#'
#' @param mat sample by genes matrix of expression data
#' @param cluster_df table of sample metadata that includes a factor column 'seurat_clusters',
#' containing transcriptional clusters (one entry per row of mat)
#' @return matrix of average cluster expression with one row per level of 'seurat_clusters'
#' (row names are the levels) and one column per gene
#' @description calculate the average expression per cluster
#' @export
#'
get_cluster_averages <- function(mat, cluster_df) {
  clusters <- cluster_df$seurat_clusters
  cluster_ids <- levels(clusters)
  clust_avgs <- matrix(NA, nrow = nlevels(clusters), ncol = ncol(mat),
                       dimnames = list(cluster_ids, colnames(mat)))
  for (ii in cluster_ids) {
    # drop = FALSE keeps a one-sample cluster as a 1-row matrix so colMeans() still works
    clust_avgs[ii, ] <- colMeans(mat[clusters == ii, , drop = FALSE], na.rm = TRUE)
  }
  return(clust_avgs)
}
139 | 
140 | # MNN --------------------------------------------------------------------
141 | 
#'
#' MNN
#' @name modified_mnnCorrect
#'
#' @param ref_mat matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the standard Celligner pipeline this is the cell line data.
#' @param targ_mat matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
#' In the standard Celligner pipeline this is the tumor data.
#' @param k1 the number of neighbors within the data being corrected (in standard pipeline the tumor data). By default this is 20.
#' @param k2 the number of neighbors within the reference data (in standard pipeline the cell line data). By default this is 20.
#' @param ndist A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
#' By default is 3.
#' @param subset_genes the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially
#' expressed genes is usually passed here. By default is NULL, meaning all genes are used
#' @return list with `corrected` (the corrected targ_mat) and `pairs` (table of mutual nearest neighbor pairs
#' with columns ref_ID, targ_ID and pair).
#' @description Mutual nearest neighbors correction. Modification of the scran::fastMNN (https://github.com/MarioniLab/scran).
#' Allows for separate k values per dataset, and simplifies some of the IO and doesn't use PCA reduction
#' @export
#'
modified_mnnCorrect <- function(ref_mat, targ_mat, k1 = 20, k2 = 20,
                            ndist = 3, subset_genes = NULL) {
  # fall back to every gene of the reference when no subset is given
  if (is.null(subset_genes)) subset_genes <- colnames(ref_mat)

  # the reference is the first dataset here, so its neighbor count (k2) is handed over as
  # findMutualNN's k1 and the target's neighbor count (k1) as its k2
  mnn_sets <- batchelor::findMutualNN(
    ref_mat[, subset_genes],
    targ_mat[, subset_genes],
    k1 = k2,
    k2 = k1,
    BPPARAM = BiocParallel::SerialParam()
  )

  # translate the row indices of each pair into sample IDs
  pair_tbl <- as.data.frame(mnn_sets)
  mnn_pairs <- dplyr::mutate(pair_tbl,
                             ref_ID = rownames(ref_mat)[first],
                             targ_ID = rownames(targ_mat)[second],
                             pair = seq(nrow(pair_tbl)))
  mnn_pairs <- dplyr::select(mnn_pairs, -first, -second)

  # first pass: the mean of the per-sample correction vectors is the overall batch vector
  first_pass <- .average_correction(ref_mat, mnn_sets$first, targ_mat, mnn_sets$second)
  batch_direction <- colMeans(first_pass$averaged)

  # remove the variation along the overall batch vector inside both datasets
  ref_mat <- .center_along_batch_vector(ref_mat, batch_direction)
  targ_mat <- .center_along_batch_vector(targ_mat, batch_direction)

  # second pass: recompute the correction vectors on the centered data and apply them
  second_pass <- .average_correction(ref_mat, mnn_sets$first, targ_mat, mnn_sets$second)
  corrected <- .tricube_weighted_correction(targ_mat, second_pass$averaged, second_pass$second,
                                            k = k2, ndist = ndist, subset_genes = subset_genes,
                                            BPPARAM = BiocParallel::SerialParam())

  return(list(corrected = corrected, pairs = mnn_pairs))
}
193 | 
#'
#' calculate the average correction vector
#' @name .average_correction
#'
#' @param refdata matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the standard Celligner pipeline this is the cell line data.
#' @param mnn1 row indices into refdata, one per MNN pair
#' @param curdata matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
#' In the standard Celligner pipeline this is the tumor data.
#' @param mnn2 row indices into curdata, one per MNN pair
#' @return list with `averaged` (one averaged correction vector per distinct curdata sample found in mnn2)
#' and `second` (the curdata row indices those vectors belong to)
#' @description Computes correction vectors for each MNN pair, and then averages them for each MNN-involved cell in the second batch.
#' Copied from dev version of scran (2018-10-28), with slight modifications as noted https://github.com/MarioniLab/scran
#' @export
#'
.average_correction <- function(refdata, mnn1, curdata, mnn2)
{
  # one difference vector (reference minus current) per MNN pair
  pair_diffs <- refdata[mnn1, , drop = FALSE] - curdata[mnn2, , drop = FALSE]

  # sum the differences of every pair that shares the same curdata sample ...
  summed <- rowsum(pair_diffs, mnn2)
  pair_counts <- table(mnn2)
  # ... and make sure the sums and the counts are listed in the same sample order
  stopifnot(identical(names(pair_counts), rownames(summed)))

  # sum / count = mean correction per curdata sample
  per_sample_mean <- unname(summed) / as.vector(pair_counts)
  list(averaged = per_sample_mean, second = as.integer(names(pair_counts)))
}
220 | 
221 | 
#'
#' centers samples within each batch
#' @name .center_along_batch_vector
#'
#' @param mat matrix of samples by genes
#' @param batch.vec batch vector (one entry per gene)
#' @return mat with every sample shifted along the batch vector to the mean projection of the batch
#' @description Projecting along the batch vector, and shifting all samples to the center within each batch.
#' This removes any variation along the overall batch vector within each matrix.
#' @export
#'
.center_along_batch_vector <- function(mat, batch.vec)
{
  # unit-length direction of the batch vector
  unit_vec <- batch.vec / sqrt(sum(batch.vec^2))
  # position of each sample along that direction
  sample_pos <- as.vector(mat %*% unit_vec)
  # move every sample along the direction until it sits at the mean position
  shift <- mean(sample_pos) - sample_pos
  mat + outer(shift, unit_vec, FUN = "*")
}
243 | 
244 | 
#' tricube-weighted correction
#' @name .tricube_weighted_correction
#'
#' @param curdata target matrix of samples by genes
#' @param correction matrix of correction vectors, one row per sample listed in in.mnn
#' @param in.mnn row indices of the curdata samples involved in MNN pairs
#' @param k number of nearest MNN-involved samples used per sample, default 20
#' @param ndist A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
#' By default is 3.
#' @param subset_genes genes used to identify mutual nearest neighbors
#' @param BNPARAM default NULL; not used by this implementation
#' @param BPPARAM default BiocParallel::SerialParam(); not used by this implementation
#' @return MNN corrected data
#' @description Computing tricube-weighted correction vectors for individual samples,
#' using the nearest neighbouring samples involved in MNN pairs.
#' Modified to use FNN rather than queryKNN for nearest neighbor finding
#' @export
#' @importFrom BiocNeighbors queryKNN
#' @importFrom BiocParallel SerialParam
#'
.tricube_weighted_correction <- function(curdata, correction, in.mnn, k=20, ndist=3, subset_genes, BNPARAM=NULL, BPPARAM=BiocParallel::SerialParam())
{
  # samples of curdata that take part in at least one MNN pair
  mnn_samples <- curdata[in.mnn, , drop = FALSE]
  # never ask for more neighbors than there are MNN-involved samples
  n_neighbors <- min(k, nrow(mnn_samples))

  # FNN replaces the queryKNN call of the upstream implementation; the search uses subset_genes only
  knn <- FNN::get.knnx(mnn_samples[, subset_genes], query = curdata[, subset_genes], k = n_neighbors)

  # the weighted average of the neighbors' correction vectors is added to every sample
  curdata + .compute_tricube_average(correction, knn$nn.index, knn$nn.dist, ndist = ndist)
}
278 | 
#'
#' compute tricube averages
#' @name .compute_tricube_average
#'
#' @param vals matrix of correction vectors (one row per MNN-involved sample)
#' @param indices nxk matrix of nearest neighbor indices (rows of vals) for the n samples being corrected
#' @param distances nxk matrix of the nearest neighbor Euclidean distances
#' @param bandwidth Is set at 'ndist' times the median distance, if not specified.
#' @param ndist By default is 3.
#' @return matrix with one row per row of indices and one column per column of vals, holding the
#' tricube-weighted average of the neighbors' rows of vals (all zeros when there are no neighbors)
#' @description Centralized function to compute tricube averages.
#' @export
#'
.compute_tricube_average <- function(vals, indices, distances, bandwidth=NULL, ndist=3)
{
  # Without any neighbor there is nothing to average: return a zero correction with one row
  # per sample being corrected, so the caller can still add it to its data matrix.
  if (ncol(indices) == 0L) {
    return(matrix(0, nrow(indices), ncol(vals)))
  }

  if (is.null(bandwidth)) {
    # per-sample bandwidth: 'ndist' times the distance to the middle (median) neighbor
    middle <- ceiling(ncol(indices)/2L)
    mid.dist <- distances[,middle]
    bandwidth <- mid.dist * ndist
  }
  # guard against a zero bandwidth (division by zero below)
  bandwidth <- pmax(1e-8, bandwidth)

  rel.dist <- distances/bandwidth
  rel.dist[rel.dist > 1] <- 1 # don't use pmin(), as this destroys dimensions.
  tricube <- (1 - rel.dist^3)^3
  # normalize so that the weights of each sample sum to one
  weight <- tricube/rowSums(tricube)

  # accumulate the weighted neighbor values one neighbor rank at a time
  output <- 0
  for (kdx in seq_len(ncol(indices))) {
    output <- output + vals[indices[,kdx],,drop=FALSE] * weight[,kdx]
  }
  output
}
318 | 


--------------------------------------------------------------------------------
/R/Celligner_methods.R:
--------------------------------------------------------------------------------
  1 | library(magrittr)
  2 | library(tidyverse)
  3 | 
  4 | 
  5 | #' method to load in tumor and cell line expression data and annotations
  6 | #' @name load_data
  7 | #'
  8 | #' @param cell_line_data_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
  9 | #' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data
 10 | #' @param cell_line_data_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
 11 | #' if cell_line_taiga=FALSE, then the name of the file of cell line expression data
 12 | #' @param cell_line_version: parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
 13 | #' @param cell_line_taiga: if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder
 14 | #' @param cell_line_ann_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
 15 | #' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations
 16 | #' @param cell_line_ann_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
 17 | #' if cell_line_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
 18 | #' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
 19 | #' @param cell_line_ann_version: parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL
 20 | #' @param cell_line_ann_taiga: if TRUE then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder
 21 | #' @param tumor_data_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
 22 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.
 23 | #' If pulling from taiga, assumes that the file is the already create Celligner info file used in the Celligner manuscript,
 24 | #'  if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.
 25 | #' @param tumor_data_file: if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
 26 | #' if tumor_taiga=FALSE, then the name of the file the tumor expression data
 27 | #' @param tumor_version: parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL
 28 | #' @param tumor_taiga: if TRUE then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder
 29 | #' @param tumor_ann_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
 30 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations
 31 | #' @param tumor_ann_file: if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
 32 | #' if tumor_ann_taiga=FALSE, then the name of the file the tumor annotations
 33 | #' @param tumor_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
 34 | #' @param tumor_ann_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
 35 | #' @param additional_annotations_name: if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
 36 | #' if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
 37 | #' for the cell lines. If null, assumes there are no additional annotations.
 38 | #' @param additional_annotations_file: if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
 39 | #' if additional_annotations_taiga=FALSE, then the name of the file the additional annotations. If null, assumes there are
 40 | #' no additional annotations.
 41 | #' @param additional_annotations_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
 42 | #' @param additional_annotations_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
 43 | #' @param hgnc_data_name: if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
 44 | #' if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations
 45 | #' @param hgnc_data_file: if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
 46 | #' if hgnc_taiga=FALSE, then the name of the file the HGNC gne annotations
 47 | #' @param hgnc_version: parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
 48 | #' @param hgnc_taiga: if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
 49 | #'
 50 | #' @importFrom magrittr "%>%"
 51 | #'
 52 | #' @description load expression and annotation files for cell lines and tumors
 53 | #' @return dat object with cell line and tumor expression data and annotations
 54 | #' @export
 55 | load_data <- function(cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
 56 |                       cell_line_ann_name, cell_line_ann_file, cell_line_ann_version, cell_line_ann_taiga,
 57 |                       tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
 58 |                       tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
 59 |                       additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
 60 |                       hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga) {
 61 |   if (hgnc_taiga) {
 62 |     hgnc.complete.set <- taigr::load.from.taiga(data.name = hgnc_data_name, data.version = hgnc_version, data.file = hgnc_data_file)
 63 |     if (is.null(hgnc.complete.set)) {
 64 |       stop("HGNC gene file input does not exist on taiga")
 65 |     }
 66 |   } else {
 67 |     if (file.exists(file.path(hgnc_data_name, hgnc_data_file))) {
 68 |       hgnc.complete.set <- data.table::fread(file.path(hgnc_data_name, hgnc_data_file)) %>%
 69 |         as.data.frame()
 70 |     } else {
 71 |       stop("HGNC gene file input does not exist")
 72 |     }
 73 |   }
 74 | 
 75 |   if (!all(c("symbol", "ensembl_gene_id", "locus_group") %in% colnames(hgnc.complete.set))) {
 76 |     stop("HGNC gene file does not contain expected columns (symbol, ensembl_gene_id, & locus_group)")
 77 |   }
 78 | 
 79 |   if (tumor_taiga) {
 80 |     TCGA_mat <- taigr::load.from.taiga(data.name = tumor_data_name, data.version = tumor_version, data.file = tumor_data_file)
 81 |     if (is.null(TCGA_mat)) {
 82 |       stop("tumor expression data file input does not exist on taiga")
 83 |     }
 84 |   } else {
 85 |     if (file.exists(file.path(tumor_data_name, tumor_data_file))) {
 86 |       TCGA_mat <- readr::read_tsv(file.path(tumor_data_name, tumor_data_file)) %>%
 87 |         as.data.frame() %>%
 88 |         tibble::column_to_rownames("Gene") %>%
 89 |         as.matrix() %>%
 90 |         t()
 91 |     } else {
 92 |       stop("tumor expression data file input does not exist")
 93 |     }
 94 |   }
 95 | 
 96 | 
 97 |   if (cell_line_taiga) {
 98 |     CCLE_mat <- taigr::load.from.taiga(data.name = cell_line_data_name, data.version = cell_line_version, data.file = cell_line_data_file)
 99 |     if (is.null(CCLE_mat)) {
100 |       stop("cell line expression data file input does not exist on taiga")
101 |     }
102 |   } else {
103 |     if (file.exists(file.path(cell_line_data_name, cell_line_data_file))) {
104 |       CCLE_mat <- readr::read_csv(file.path(cell_line_data_name, cell_line_data_file)) %>%
105 |         as.data.frame() %>%
106 |         tibble::column_to_rownames("X1") %>%
107 |         as.matrix()
108 |     } else {
109 |       stop("cell line data file input does not exist")
110 |     }
111 |   }
112 | 
113 |   # subset gene names to just ensembl IDs
114 |   # add test for this
115 |   colnames(CCLE_mat) <- stringr::str_match(colnames(CCLE_mat), "\\((.+)\\)")[, 2]
116 | 
117 |   # convert tumor gene names to ensembl IDs, if needed
118 |   if (length(grep("ENS", colnames(TCGA_mat))) != ncol(TCGA_mat)) {
119 |     print("converting TCGA column names from HGNC ids to ensembl ids")
120 |     common_genes <- dplyr::intersect(colnames(TCGA_mat), hgnc.complete.set$symbol)
121 |     if (length(common_genes) < 10000) {
122 |       sprint("only %s genes in overlapping between genes in columns of the tumor data and hgnc dataset")
123 |       warning("low overlap of genes in tumor data and gene symbol, either tumor data
124 |               or gene file may not be in correct format")
125 |     }
126 |     TCGA_mat <- TCGA_mat[, common_genes]
127 |     hgnc.complete.set <- dplyr::filter(hgnc.complete.set, symbol %in% common_genes)
128 |     hgnc.complete.set <- hgnc.complete.set[!duplicated(hgnc.complete.set$symbol), ]
129 |     rownames(hgnc.complete.set) <- hgnc.complete.set$symbol
130 |     hgnc.complete.set <- hgnc.complete.set[common_genes, ]
131 |     colnames(TCGA_mat) <- hgnc.complete.set$ensembl_gene_id
132 |   }
133 | 
134 | 
135 |   if (cell_line_ann_taiga) {
136 |     CCLE_ann <- taigr::load.from.taiga(data.name = cell_line_ann_name, data.version = cell_line_ann_version, data.file = cell_line_ann_file)
137 |     column_names <- c("arxspan_id", "lineage", "lineage_subtype")
138 |     if ("DepMap_ID" %in% colnames(CCLE_ann)) {
139 |       column_names[1] <- "DepMap_ID"
140 |     }
141 |     if (is.null(CCLE_ann)) {
142 |       warning("cell line annotation file does not exist on taiga, creating default annotations")
143 |       CCLE_ann <- data.frame(
144 |         sampleID = rownames(CCLE_mat),
145 |         lineage = NA,
146 |         subtype = NA,
147 |         type = "CL"
148 |       )
149 |     }
150 |     if (!all(column_names %in% colnames(CCLE_ann))) {
151 |       warning("cell line annotation file does not contain expected columns (arxspan_id or DepMap_ID, lineage, & lineage_subtype), creating default annotation file")
152 |       CCLE_ann <- data.frame(
153 |         sampleID = rownames(CCLE_mat),
154 |         lineage = NA,
155 |         subtype = NA,
156 |         type = "CL"
157 |       )
158 |     } else {
159 |       CCLE_ann <- CCLE_ann[, column_names]
160 |       colnames(CCLE_ann) <- c("sampleID", "lineage", "subtype")
161 |       CCLE_ann$type <- "CL"
162 |     }
163 |   } else {
164 |     if (file.exists(file.path(cell_line_ann_name, cell_line_ann_file))) {
165 |       CCLE_ann <- data.table::fread(file.path(cell_line_ann_name, cell_line_ann_file)) %>%
166 |         as.data.frame()
167 |     } else {
168 |       warning("cell line annotation file does not exist, creating default annotations")
169 |       CCLE_ann <- data.frame(
170 |         sampleID = rownames(CCLE_mat),
171 |         lineage = NA,
172 |         subtype = NA,
173 |         type = "CL"
174 |       )
175 |     }
176 |   }
177 | 
178 |   if (!all(c("sampleID", "lineage", "subtype", "type") %in% colnames(CCLE_ann))) {
179 |     warning("cell line annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations")
180 |     CCLE_ann <- data.frame(
181 |       sampleID = rownames(CCLE_mat),
182 |       lineage = NA,
183 |       subtype = NA,
184 |       type = "CL"
185 |     )
186 |   }
187 | 
188 |   if (tumor_ann_taiga) {
189 |     TCGA_ann <- taigr::load.from.taiga(data.name = tumor_ann_name, data.version = tumor_ann_version, data.file = tumor_ann_file)
190 |     tumor_column_names <- c("sampleID", "lineage", "subtype")
191 |     if (is.null(TCGA_ann)) {
192 |       warning("tumor annotation file does not exist on taiga, creating default annotations")
193 |       TCGA_ann <- data.frame(
194 |         sampleID = rownames(TCGA_mat),
195 |         lineage = NA,
196 |         subtype = NA,
197 |         type = "tumor"
198 |       )
199 |     }
200 |     if (!all(tumor_column_names %in% colnames(TCGA_ann))) {
201 |       warning("tumor annotation file does not contain expected columns (sampleID, lineage, & subtype), creating default tumor annotations")
202 |       TCGA_ann <- data.frame(
203 |         sampleID = rownames(TCGA_mat),
204 |         lineage = NA,
205 |         subtype = NA,
206 |         type = "tumor"
207 |       )
208 |     } else {
209 |       TCGA_ann <- TCGA_ann[, tumor_column_names]
210 |       TCGA_ann$type <- "tumor"
211 |     }
212 |   } else {
213 |     if (file.exists(file.path(tumor_ann_name, tumor_ann_file))) {
214 |       TCGA_ann <- data.table::fread(file.path(tumor_ann_name, tumor_ann_file)) %>%
215 |         as.data.frame()
216 |     } else {
217 |       warning("tumor annotation file does not exist, creating default annotations")
218 |       TCGA_ann <- data.frame(
219 |         sampleID = rownames(TCGA_mat),
220 |         lineage = NA,
221 |         subtype = NA,
222 |         type = "tumor"
223 |       )
224 |     }
225 |     if (!all(c("sampleID", "lineage", "subtype", "type") %in% colnames(TCGA_ann))) {
226 |       warning("tumor annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations")
227 |       TCGA_ann <- data.frame(
228 |         sampleID = rownames(TCGA_mat),
229 |         lineage = NA,
230 |         subtype = NA,
231 |         type = "tumor"
232 |       )
233 |     }
234 |   }
235 | 
236 |   if (!(is.null(additional_annotations_name) | is.null(additional_annotations_file))) {
237 |     if (additional_annotations_taiga) {
238 |       add_ann <- taigr::load.from.taiga(data.name = additional_annotations_name, data.version = additional_annotations_version, data.file = additional_annotations_file)
239 |       tumor_column_names <- c("sampleID", "lineage", "subtype", "type")
240 |       if (is.null(add_ann)) {
241 |         warning("additional annotation file does not exist on taiga, no additional annotations used")
242 |       }
243 |       if (!all(c("sampleID", "subtype") %in% colnames(add_ann))) {
244 |         warning("additional annotation file does not contain expected columns (sampleID & subtype), no additional annotations used")
245 |       } else {
246 |         shared_samples <- intersect(CCLE_ann$sampleID, add_ann$sampleID)
247 |         CCLE_ann[match(shared_samples, CCLE_ann$sampleID), "subtype"] <- add_ann[match(shared_samples, add_ann$sampleID), "subtype"]
248 |       }
249 |     } else {
250 |       if (file.exists(file.path(additional_annotations_name, additional_annotations_file))) {
251 |         add_ann <- data.table::fread(file.path(additional_annotations_name, additional_annotations_file)) %>%
252 |           as.data.frame()
253 |         if (!all(c("sampleID", "subtype") %in% colnames(add_ann))) {
254 |           warning("additional annotation file does not contain expected columns (sampleID & subtype), no additional annotations used")
255 |         } else {
256 |           shared_samples <- intersect(CCLE_ann$sampleID, add_ann$sampleID)
257 |           CCLE_ann[match(shared_samples, CCLE_ann$sampleID), "subtype"] <- add_ann[match(shared_samples, add_ann$sampleID), "subtype"]
258 |         }
259 |       } else {
260 |         warning("additional annotation file does not exist, no additional annotations used")
261 |       }
262 |     }
263 |   }
264 |   # check for NAs
265 |   TCGA_mat <- check_NAs(TCGA_mat)
266 |   CCLE_mat <- check_NAs(CCLE_mat)
267 | 
268 |   # subset to samples in both the annotation and gene expression matrices, and match ordering between them
269 |   common_cls <- intersect(rownames(CCLE_mat), CCLE_ann$sampleID)
270 |   if (length(setdiff(rownames(CCLE_mat), CCLE_ann$sampleID)) > 0) {
271 |     sprintf("Missing annotations for these cell lines: %s", paste(rownames(CCLE_mat), CCLE_ann$sampleID, collapse = ", "))
272 |   }
273 | 
274 |   CCLE_mat <- CCLE_mat[common_cls, ]
275 |   CCLE_ann <- CCLE_ann[match(common_cls, CCLE_ann$sampleID), ]
276 | 
277 |   common_tumors <- intersect(rownames(TCGA_mat), TCGA_ann$sampleID)
278 |   if (length(setdiff(rownames(TCGA_mat), common_tumors)) > 0) {
279 |     sprintf("Missing annotations for these tumors: %s", paste(rownames(TCGA_mat), common_tumors, collapse = ", "))
280 |   }
281 |   TCGA_mat <- TCGA_mat[common_tumors, ]
282 |   TCGA_ann <- TCGA_ann[match(common_tumors, TCGA_ann$sampleID), ]
283 | 
284 |   # subset genes to functional genes
285 |   func_genes <- dplyr::filter(hgnc.complete.set, !locus_group %in% c("non-coding RNA", "pseudogene"))$ensembl_gene_id
286 |   genes_used <- intersect(colnames(TCGA_mat), colnames(CCLE_mat))
287 |   genes_used <- intersect(genes_used, func_genes)
288 | 
289 |   TCGA_mat <- TCGA_mat[, genes_used]
290 |   CCLE_mat <- CCLE_mat[, genes_used]
291 | 
292 | 
293 |   return(list(TCGA_mat = TCGA_mat, TCGA_ann = TCGA_ann, CCLE_mat = CCLE_mat, CCLE_ann = CCLE_ann))
294 | }
295 | 
#' Method to calculate gene average expression and variance for an expression matrix
#' @name calc_gene_stats
#'
#' @param dat data object containing tumor and cell line expression data and annotations produced by running load_data.
#' Only `dat$TCGA_mat` and `dat$CCLE_mat` (samples by genes) are read here.
#' @param hgnc_data_name if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga = FALSE, then the file path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga = FALSE, then the name of the file containing the HGNC gene annotations
#' @param hgnc_version parameter to specify the version to pull from taiga for the HGNC gene annotations
#' @param hgnc_taiga if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
#'
#' @description calculate, for every gene shared by the tumor and cell line matrices, the mean and
#' standard deviation of expression in each dataset and the larger of the two standard deviations.
#' Stops with an error if the HGNC file cannot be found or lacks the columns
#' symbol, ensembl_gene_id and locus_group.
#' @return data frame with one row per shared gene (in the order the genes are shared) and the columns
#' Gene, Symbol, Tumor_SD, CCLE_SD, Tumor_mean, CCLE_mean and max_SD. Symbol is NA for genes
#' that are not present in the HGNC table.
#' @export
calc_gene_stats <- function(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga) {
  common_genes <- intersect(colnames(dat$TCGA_mat), colnames(dat$CCLE_mat))

  if (hgnc_taiga) {
    hgnc.complete.set <- taigr::load.from.taiga(data.name = hgnc_data_name, data.version = hgnc_version, data.file = hgnc_data_file)
    if (is.null(hgnc.complete.set)) {
      stop("HGNC gene file input does not exist on taiga")
    }
  } else {
    hgnc_path <- file.path(hgnc_data_name, hgnc_data_file)
    if (!file.exists(hgnc_path)) {
      stop("HGNC gene file input does not exist")
    }
    hgnc.complete.set <- as.data.frame(data.table::fread(hgnc_path))
  }

  if (!all(c("symbol", "ensembl_gene_id", "locus_group") %in% colnames(hgnc.complete.set))) {
    stop("HGNC gene file does not contain expected columns (symbol, ensembl_gene_id, & locus_group)")
  }

  # Gene -> Symbol lookup restricted to the shared genes; keep the first entry per gene.
  # dplyr verbs are namespace-qualified so the function works without dplyr being attached.
  gene_symbols <- hgnc.complete.set %>%
    dplyr::select(Gene = ensembl_gene_id, Symbol = symbol) %>%
    dplyr::filter(Gene %in% common_genes)
  gene_symbols <- gene_symbols[!duplicated(gene_symbols$Gene), ]

  # Subset and order both matrices by the shared genes so every statistic lines up
  # with the Gene column even when the two matrices have different columns or ordering.
  tumor_mat <- dat$TCGA_mat[, common_genes, drop = FALSE]
  cl_mat <- dat$CCLE_mat[, common_genes, drop = FALSE]

  gene_stats <- data.frame(
    Gene = common_genes,
    Tumor_SD = apply(tumor_mat, 2, sd, na.rm = TRUE),
    CCLE_SD = apply(cl_mat, 2, sd, na.rm = TRUE),
    Tumor_mean = colMeans(tumor_mat, na.rm = TRUE),
    CCLE_mean = colMeans(cl_mat, na.rm = TRUE),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::mutate(max_SD = pmax(Tumor_SD, CCLE_SD, na.rm = TRUE)) # larger of the two per-dataset SDs

  # Join from the statistics side so genes missing from HGNC keep their ID and stats
  # (Symbol becomes NA) instead of turning into all-NA rows; then restore the column order.
  gene_stats <- dplyr::left_join(gene_stats, gene_symbols, by = "Gene") %>%
    dplyr::select(Gene, Symbol, dplyr::everything())

  return(gene_stats)
}
352 | 
353 | 
#' Method to create seurat objects given an expression matrix and annotation table
#' @name create_Seurat_object
#'
#' @param exp_mat matrix of samples by genes, where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.
#' @param ann matrix of sample annotations. Expects column 'sampleID' which matches the rownames of exp_mat.
#' @param type optional parameter, string specifying the data type of the current data (ex. 'tumor'), which is added
#' to the Seurat meta.data as column 'type' when not NULL.
#' @description create Seurat object of expression data and annotations and run dimensionality reduction.
#' The data are mean-centered (not variance-scaled), then PCA and UMAP are run with the parameters
#' (n_PC_dims, umap_n_neighbors, umap_min_dist, distance_metric) specified in celligner_global.
#' @return Seurat object with centered expression data, PCA and UMAP reductions, and annotations stored in meta.data
#' @export
#'
create_Seurat_object <- function(exp_mat, ann, type = NULL) {
  # exp_mat is samples x genes; it is transposed so genes become the rows of the Seurat object.
  seu_obj <- Seurat::CreateSeuratObject(t(exp_mat),
    min.cells = 0,
    min.features = 0,
    meta.data = magrittr::set_rownames(ann, ann$sampleID)
  )
  if (!is.null(type)) {
    seu_obj@meta.data$type <- type
  }

  # Use every gene for centering and PCA.
  all_genes <- rownames(Seurat::GetAssayData(seu_obj))

  # mean center the data, important for PCA
  seu_obj <- Seurat::ScaleData(seu_obj, features = all_genes, do.scale = FALSE)

  # Plain assignment instead of magrittr's `%<>%`: that operator is not imported by the
  # package, so the original only worked when magrittr happened to be attached.
  seu_obj <- Seurat::RunPCA(seu_obj,
    assay = "RNA",
    features = all_genes,
    npcs = celligner_global$n_PC_dims, verbose = FALSE
  )

  seu_obj <- Seurat::RunUMAP(seu_obj,
    assay = "RNA", dims = 1:celligner_global$n_PC_dims,
    reduction = "pca",
    n.neighbors = celligner_global$umap_n_neighbors,
    min.dist = celligner_global$umap_min_dist,
    metric = celligner_global$distance_metric, verbose = FALSE
  )

  return(seu_obj)
}
394 | 
#' Method to take in a Seurat object and run default Seurat clustering algorithm
#' @name cluster_data
#'
#' @param seu_obj seurat object containing expression data and sample annotations.
#' Expects PCA for the seurat object has already been calculated.
#' @description cluster data in seurat object, using default Seurat clustering method. Clusters data
#' within PCA space using the number of dimensions provided in celligner_global$n_PC_dims and the
#' resolution provided in celligner_global$mod_clust_res.
#'
#' @return Seurat object with cluster annotations; the cluster assignment is available in meta.data
#' both as 'seurat_clusters' and as a copy named 'cluster'
#' @export
#'
cluster_data <- function(seu_obj) {
  seu_obj <- Seurat::FindNeighbors(seu_obj,
    reduction = "pca",
    dims = 1:celligner_global$n_PC_dims,
    k.param = 20,
    force.recalc = TRUE,
    verbose = FALSE
  )

  # Plain assignment instead of magrittr's `%<>%`, which the package never imports.
  # NOTE(review): `reduction` does not appear to be a documented FindClusters argument;
  # it is kept exactly as in the original call - confirm whether it is needed.
  seu_obj <- Seurat::FindClusters(seu_obj,
    reduction = "pca",
    resolution = celligner_global$mod_clust_res
  )

  # Copy under the column name 'cluster' alongside Seurat's own 'seurat_clusters'.
  seu_obj@meta.data$cluster <- seu_obj@meta.data$seurat_clusters

  return(seu_obj)
}
424 | 
#' Method to find genes that are differentially expressed between clusters within the expression data
#' @name find_differentially_expressed_genes
#'
#' @param seu_obj seurat object containing expression data and sample annotations. Expects data in the Seurat object
#' slot scale.data and a column 'seurat_clusters' within the meta.data of the Seurat object.
#' @description find genes that are differentially expressed between clusters within the expression data.
#' With more than two clusters the statistic is the F statistic from run_lm_stats_limma_group, with exactly
#' two clusters it is the absolute t statistic from run_lm_stats_limma, and with fewer than two clusters
#' no test is run and the statistic is NA for every gene.
#'
#' @return table with one row per gene and the columns Gene and gene_stat
#' @export
#'
find_differentially_expressed_genes <- function(seu_obj) {
  # genes x samples matrix; fetched once and reused by every branch
  scaled_expr <- Seurat::GetAssayData(seu_obj, assay = "RNA", slot = "scale.data")
  if (nrow(scaled_expr) == 0) {
    stop("Seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }
  if (!"seurat_clusters" %in% colnames(seu_obj@meta.data)) {
    stop("Seurat object doesn't contain the column 'seurat_clusters', run 'cluster_data' first")
  }
  n_clusts <- nlevels(seu_obj@meta.data$seurat_clusters)
  if (n_clusts > 2) {
    cur_DE_genes <- run_lm_stats_limma_group(
      t(scaled_expr),
      seu_obj@meta.data %>% dplyr::select(seurat_clusters),
      limma_trend = TRUE
    ) %>%
      dplyr::select(Gene, gene_stat = F_stat)
  } else if (n_clusts == 2) {
    # Use the validated 'seurat_clusters' column; the original read 'cluster', which
    # is only present when the object went through cluster_data().
    cur_DE_genes <- run_lm_stats_limma(t(scaled_expr),
      seu_obj@meta.data$seurat_clusters,
      limma_trend = TRUE
    ) %>%
      dplyr::mutate(gene_stat = abs(t_stat)) %>%
      dplyr::select(Gene, gene_stat)
  } else {
    # Fewer than two clusters: nothing to compare. Genes are the ROWS of the scaled
    # matrix; the original used colnames(seu_obj), which are the sample names.
    cur_DE_genes <- data.frame(
      Gene = rownames(scaled_expr),
      gene_stat = NA_real_,
      stringsAsFactors = FALSE
    )
  }

  return(cur_DE_genes)
}
463 | 
#' Method to run contrastive principal components analysis
#' @name run_cPCA
#'
#' @param TCGA_obj seurat object containing expression data and sample annotations, usually the tumor data
#' @param CCLE_obj seurat object containing expression data and sample annotations, usually the cell line data
#' @param pc_dims the number of cPCs calculated. If set to NULL then all cPCs will be calculated (this is quite slow),
#' but if set to some value >= 4 then an approximate cPCA will be calculated, which just calculates the input number
#' of contrastive principal components, which is quicker.
#' @description run contrastive principal components analysis on the scale.data of the two Seurat objects.
#' Set pc_dims to a value >= 4 to run fast cPCA by just calculating the top contrastive principal components.
#' Stops with an error if either object has no data in scale.data.
#'
#' @return object containing cPC vectors and values, as returned by run_cPCA_analysis
#' @export
#'
run_cPCA <- function(TCGA_obj, CCLE_obj, pc_dims = NULL) {
  # Pull each genes x samples matrix once and validate it before use.
  tumor_scaled <- Seurat::GetAssayData(TCGA_obj, assay = "RNA", slot = "scale.data")
  if (nrow(tumor_scaled) == 0) {
    stop("TCGA seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }

  cell_line_scaled <- Seurat::GetAssayData(CCLE_obj, assay = "RNA", slot = "scale.data")
  if (nrow(cell_line_scaled) == 0) {
    stop("CCLE seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }

  # run_cPCA_analysis receives samples x genes matrices, hence the transposes.
  run_cPCA_analysis(
    t(tumor_scaled),
    t(cell_line_scaled),
    TCGA_obj@meta.data,
    CCLE_obj@meta.data,
    pc_dims = pc_dims
  )
}
492 | 
#' Method to run mutual nearest neighbors batch correction
#' @name run_MNN
#'
#' @param CCLE_cor matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the default Celligner pipeline this is the cell line data.
#' @param TCGA_cor matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected
#' onto the reference data. In the default Celligner pipeline this is the tumor data.
#' @param k1 the number of neighbors within the data being corrected (by default the tumor data). By default this
#' pulls from the celligner_global parameter mnn_k_tumor, which by default is 50.
#' @param k2 the number of neighbors within the reference data (by default the cell line data). By default this
#' pulls from the celligner_global parameter mnn_k_CL, which by default is 5.
#' @param ndist A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing
#' correction vectors. By default this pulls from the celligner_global parameter mnn_ndist, which by default is 3.
#' @param subset_genes the subset of genes used for identifying mutual nearest neighbors within the datasets.
#' The set of differentially expressed genes is usually passed here.
#' @description run MNN batch correction to align data to a reference dataset; thin wrapper that forwards
#' all arguments to modified_mnnCorrect.
#'
#' @return mutual nearest neighbors object with corrected data for the second dataset provided as input and the
#' mutual nearest neighbors
#' @export
#'
run_MNN <- function(CCLE_cor, TCGA_cor, k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
                    subset_genes) {
  # Reference data first, data to be corrected second.
  alignment <- modified_mnnCorrect(
    CCLE_cor,
    TCGA_cor,
    k1 = k1,
    k2 = k2,
    ndist = ndist,
    subset_genes = subset_genes
  )
  return(alignment)
}
522 | 
#' Method to calculate the correlation between cell lines and tumor in the Celligner aligned data
#' @name calc_tumor_CL_cor
#'
#' @param Celligner_aligned_data Celligner aligned data matrix of samples (cell lines and tumors) by genes,
#' with sample IDs as row names
#' @param Celligner_info annotation table of cell line and tumor samples with a column 'type' marking samples as either
#' cell lines ("CL") or tumors ("tumor") and a column 'sampleID' that matches the row names of Celligner_aligned_data
#' @description calculate the Pearson correlation between every tumor sample and every cell line sample in the
#' Celligner aligned data, using pairwise complete observations
#'
#' @return matrix of correlations that is tumors (rows) by cell lines (columns)
#' @export
#'
calc_tumor_CL_cor <- function(Celligner_aligned_data, Celligner_info) {
  # `%in%` (rather than `==`) drops rows whose type is NA; as.character() ensures the
  # matrix is indexed by row NAME even if sampleID is stored as a factor.
  tumor_samples <- as.character(Celligner_info$sampleID[Celligner_info$type %in% "tumor"])
  cl_samples <- as.character(Celligner_info$sampleID[Celligner_info$type %in% "CL"])

  # The original assigned `tumors_samples` but then read `tumor_samples`, so it always
  # failed. drop = FALSE keeps a matrix (and its sample name) when a group has one sample.
  tumor_CL_cor <- cor(
    t(Celligner_aligned_data[tumor_samples, , drop = FALSE]),
    t(Celligner_aligned_data[cl_samples, , drop = FALSE]),
    use = "pairwise.complete.obs"
  )

  return(tumor_CL_cor)
}
544 | 
545 | 
#' All methods to run Celligner and save the output, if desired
#' @name run_Celligner
#'
#' @param cell_line_data_name if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
#' if cell_line_taiga = FALSE, then the file path to the local folder containing the cell line expression data
#' @param cell_line_data_file if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
#' if cell_line_taiga = FALSE, then the name of the file of cell line expression data
#' @param cell_line_version parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
#' @param cell_line_taiga if TRUE (default) then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder
#' @param cell_line_ann_name if cell_line_ann_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
#' if cell_line_ann_taiga = FALSE, then the file path to the local folder containing the cell line annotations
#' @param cell_line_ann_file if cell_line_ann_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
#' if cell_line_ann_taiga = FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
#' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
#' @param cell_line_ann_version parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL
#' @param cell_line_ann_taiga if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder
#' @param tumor_data_name if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
#' if tumor_taiga = FALSE, then the file path to the local folder containing the tumor expression data.
#' @param tumor_data_file if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
#' if tumor_taiga = FALSE, then the name of the file containing the tumor expression data
#' @param tumor_version parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL
#' @param tumor_taiga if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder
#' @param tumor_ann_name if tumor_ann_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
#' if tumor_ann_taiga = FALSE, then the file path to the local folder containing the tumor annotations
#' @param tumor_ann_file if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
#' if tumor_ann_taiga = FALSE, then the name of the file containing the tumor annotations. If pulling from taiga, assumes that the file is the already created
#' Celligner info file used in the Celligner manuscript, if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.
#' @param tumor_ann_version parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
#' @param tumor_ann_taiga if TRUE (default) then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
#' @param additional_annotations_name if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
#' if additional_annotations_taiga = FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
#' for the cell lines. If null, assumes there are no additional annotations.
#' @param additional_annotations_file if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
#' if additional_annotations_taiga = FALSE, then the name of the file containing the additional annotations. If null, assumes there are
#' no additional annotations.
#' @param additional_annotations_version parameter to specify the version to pull from taiga for the additional annotations, default set to NULL
#' @param additional_annotations_taiga if TRUE (default) then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder
#' @param hgnc_data_name if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga = FALSE, then the file path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga = FALSE, then the name of the file containing the HGNC gene annotations
#' @param hgnc_version parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
#' @param hgnc_taiga if TRUE (default) then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
#' @param save_output by default is NULL and won't save output, to save output pass in the path of an existing folder where the output should be saved
#'
#' @importFrom magrittr "%>%"
#'
#' @description run all parts of the Celligner pipeline: load the data, compute gene statistics, cluster each
#' dataset and pick differentially expressed genes, remove the selected contrastive principal components,
#' align the tumor data to the cell line data with MNN, embed and cluster the combined data, print the
#' UMAP plots and, if save_output is an existing folder, write the result tables and plots into it.
#'
#' @return seurat object of the Celligner-aligned data
#' @export
#'
run_Celligner <- function(cell_line_data_name = "public-20q4-a4b3", cell_line_data_file = "CCLE_expression_full", cell_line_version = NULL, cell_line_taiga = TRUE,
                          cell_line_ann_name = "arxspan-cell-line-export-f808", cell_line_ann_file = "ACH", cell_line_ann_version = NULL, cell_line_ann_taiga = TRUE,
                          tumor_data_name = "celligner-input-9827", tumor_data_file = "tumor_expression", tumor_version = NULL, tumor_taiga = TRUE,
                          tumor_ann_name = "celligner-input-9827", tumor_ann_file = "tumor_annotations", tumor_ann_version = NULL, tumor_ann_taiga = TRUE,
                          additional_annotations_name = "celligner-input-9827", additional_annotations_file = "CCLE_annotations", additional_annotations_version = NULL, additional_annotations_taiga = TRUE,
                          hgnc_data_name = "hgnc-87ab", hgnc_data_file = "hgnc_complete_set", hgnc_version = NULL, hgnc_taiga = TRUE,
                          save_output = NULL) {
  # Kept from the original: other functions in this package may call magrittr/tidyverse
  # functions without a namespace prefix and therefore rely on these being attached.
  require(magrittr)
  require(tidyverse)

  dat <- load_data(
    cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
    cell_line_ann_name, cell_line_ann_file, cell_line_ann_version, cell_line_ann_taiga,
    tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
    tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
    additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
    hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga
  )

  gene_stats <- calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)

  # Combined annotations, tumors first then cell lines; this matches the row order of
  # combined_mat built further down.
  comb_ann <- rbind(
    dat$TCGA_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = "tumor"),
    dat$CCLE_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = "CL")
  )

  TCGA_obj <- create_Seurat_object(dat$TCGA_mat, dat$TCGA_ann, type = "tumor")
  CCLE_obj <- create_Seurat_object(dat$CCLE_mat, dat$CCLE_ann, type = "CL")

  TCGA_obj <- cluster_data(TCGA_obj)
  CCLE_obj <- cluster_data(CCLE_obj)

  tumor_DE_genes <- find_differentially_expressed_genes(TCGA_obj)
  CL_DE_genes <- find_differentially_expressed_genes(CCLE_obj)

  # Rank genes within each dataset (rank 1 = largest statistic) and keep the better rank.
  DE_genes <- dplyr::full_join(tumor_DE_genes, CL_DE_genes, by = "Gene", suffix = c("_tumor", "_CL")) %>%
    dplyr::mutate(
      tumor_rank = dplyr::dense_rank(-gene_stat_tumor),
      CL_rank = dplyr::dense_rank(-gene_stat_CL),
      best_rank = pmin(tumor_rank, CL_rank, na.rm = TRUE)
    ) %>%
    dplyr::left_join(gene_stats, by = "Gene")

  # genes ranked below celligner_global$top_DE_genes_per in either dataset, used for
  # finding mutual nearest neighbors
  DE_gene_set <- DE_genes %>%
    dplyr::filter(best_rank < celligner_global$top_DE_genes_per) %>%
    .[["Gene"]]

  cov_diff_eig <- run_cPCA(TCGA_obj, CCLE_obj, celligner_global$fast_cPCA)

  # The component vectors are read from `$vectors` when the full cPCA was run
  # (fast_cPCA is NULL) and from `$rotation` otherwise.
  if (is.null(celligner_global$fast_cPCA)) {
    cur_vecs <- cov_diff_eig$vectors[, celligner_global$remove_cPCA_dims, drop = FALSE]
  } else {
    cur_vecs <- cov_diff_eig$rotation[, celligner_global$remove_cPCA_dims, drop = FALSE]
  }

  # clear unused objects
  rm(TCGA_obj)
  rm(CCLE_obj)
  gc()

  # Regress the selected cPCs out of each dataset and keep the residuals.
  rownames(cur_vecs) <- colnames(dat$TCGA_mat)
  TCGA_cor <- resid(lm(t(dat$TCGA_mat) ~ 0 + cur_vecs)) %>% t()
  CCLE_cor <- resid(lm(t(dat$CCLE_mat) ~ 0 + cur_vecs)) %>% t()

  # clear unused objects
  rm(dat)
  gc()

  mnn_res <- run_MNN(CCLE_cor, TCGA_cor,
    k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
    subset_genes = DE_gene_set
  )

  # corrected tumors first, then the cell lines (same order as comb_ann)
  combined_mat <- rbind(mnn_res$corrected, CCLE_cor)

  comb_obj <- create_Seurat_object(combined_mat, comb_ann)
  comb_obj <- cluster_data(comb_obj)

  Celligner_res <- Seurat::Embeddings(comb_obj, reduction = "umap") %>%
    as.data.frame() %>%
    magrittr::set_colnames(c("UMAP_1", "UMAP_2")) %>%
    tibble::rownames_to_column(var = "sampleID") %>%
    dplyr::left_join(comb_obj@meta.data, by = "sampleID")

  # Lineages that do not get a text label on the plot.
  unlabeled_lineages <- c(
    "adrenal_cortex", "embryo", "endocrine", "engineered", "engineered_blood",
    "engineered_breast", "engineered_central_nervous_system", "engineered_kidney",
    "engineered_lung", "engineered_ovary", "engineered_prostate", "epidermoid_carcinoma",
    "nasopharynx", "nerve", "pineal", "teratoma", "unknown"
  )

  # Label position = median UMAP coordinates of each remaining lineage.
  lineage_averages <- Celligner_res %>%
    dplyr::filter(!lineage %in% unlabeled_lineages) %>%
    dplyr::group_by(lineage) %>%
    dplyr::summarise(
      UMAP_1 = median(UMAP_1, na.rm = TRUE),
      UMAP_2 = median(UMAP_2, na.rm = TRUE)
    )
  lineage_averages$lineage <- gsub("_", " ", lineage_averages$lineage)
  lineage_lab_aes <- ggplot2::geom_text(
    data = lineage_averages,
    mapping = ggplot2::aes(x = UMAP_1, y = UMAP_2, label = lineage),
    size = 3, color = "#000000"
  )

  # Scalar condition, so use short-circuit `&&` rather than vectorised `&`.
  has_both_types <- "type" %in% colnames(Celligner_res) &&
    "tumor" %in% Celligner_res$type &&
    "CL" %in% Celligner_res$type

  if (has_both_types) {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha = 0.7, pch = 21, ggplot2::aes(color = type, fill = lineage, size = type)) +
      ggplot2::scale_color_manual(values = c(tumor = "white", CL = "black")) +
      ggplot2::scale_size_manual(values = c(tumor = 0.75, CL = 1.5)) +
      ggplot2::xlab("UMAP 1") +
      ggplot2::ylab("UMAP 2") +
      ggplot2::guides(
        # "none" replaces the deprecated `fill = FALSE` for hiding a legend
        fill = "none",
        color = ggplot2::guide_legend(override.aes = list(color = c("black", "white"), fill = c("white", "black")))
      ) +
      ggplot2::theme_classic()
  } else {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha = 0.7, pch = 21, size = 1, ggplot2::aes(fill = lineage)) +
      ggplot2::xlab("UMAP 1") +
      ggplot2::ylab("UMAP 2") +
      ggplot2::theme_classic() +
      ggplot2::theme(legend.position = "none")
  }

  print(celligner_plot)
  print(celligner_plot + lineage_lab_aes)

  if (!is.null(save_output)) {
    # save_output must be a folder: file.exists() would also accept a regular file
    # and the writes below would then fail.
    if (dir.exists(save_output)) {
      print("calculating tumor/cell line correlation")
      tumor_CL_cor <- calc_tumor_CL_cor(combined_mat, comb_obj@meta.data)

      print("saving files")
      write.csv(tumor_CL_cor, file.path(save_output, "tumor_CL_cor.csv"))
      write.csv(combined_mat, file.path(save_output, "Celligner_aligned_data.csv"))
      readr::write_csv(Celligner_res, file.path(save_output, "Celligner_info.csv"))
      write.csv(cur_vecs, file.path(save_output, "cPCs.csv"))
      readr::write_csv(DE_genes, file.path(save_output, "DE_genes.csv"))
      ggplot2::ggsave(file.path(save_output, "Celligner_plot.png"), celligner_plot, device = "png", width = 8, height = 6)
      ggplot2::ggsave(file.path(save_output, "labeled_Celligner_plot.png"), celligner_plot + lineage_lab_aes, device = "png", width = 8, height = 6)
    } else {
      warning("can't save output, folder does not exist")
    }
  }

  return(comb_obj)
}


--------------------------------------------------------------------------------
/R/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: celligner
 2 | Title: Celligner pipeline
 3 | Version: 0.0.0.9000
 4 | Authors@R: 
 5 |     person(given = "Allie",
 6 |            family = "Warren",
 7 |            role = c("aut", "cre"),
 8 |            email = "alliecwarren@gmail.com",
 9 |            comment = c(ORCID = ""))
10 | Description: R package to run the Celligner method using datasets from taiga. 
11 | Imports: 
12 | 	tidyverse,
13 | 	reshape2,
14 | 	plyr,
15 | 	data.table,
16 | 	Seurat,
17 | 	pdist,
18 | 	FNN,
19 | 	irlba,
20 | 	limma,
21 | 	batchelor,
22 | 	BiocParallel,
23 | 	BiocManager,
24 | 	taigr
25 | License: `use_mit_license()`, `use_gpl3_license()` or friends to
26 |     pick a license
27 | Encoding: UTF-8
28 | LazyData: true
29 | Roxygen: list(markdown = TRUE)
30 | RoxygenNote: 7.1.1
31 | 


--------------------------------------------------------------------------------
/R/Dockerfile:
--------------------------------------------------------------------------------
 1 | ## celligner
 2 | FROM debian:latest
 3 | MAINTAINER Jeremie Kalfon
 4 | 
 5 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils &&
 6 |   apt-get install -y sudo &&
 7 |   sudo apt-get install -y wget libterm-readline-gnu-perl &&
 8 | 
 9 |   # all nice packages
10 |   ## install the [tools](https://www.datacamp.com/community/tutorials/google-cloud-data-science) sudo apt-get -y install htop parallel curl  tar  vim  nano  bzip2  unzip libssl-dev  make cmake libcurl4-openssl-dev  default-jre  && sudo apt-get -y install dirmngr apt-transport-https  ca-certificates  gnupg2  software-properties-common  zlib1g-dev  libbz2-dev  liblzma-dev  openssh-server  default-libmysqlclient-dev  acl  g++
11 |   ## sudo apt install git libmagickwand-dev libtool libexpat1-dev ghostscript graphviz libgraphviz-dev pkg-config libxml-simple-perl zlib1g-dev
12 |   sudo apt-get -y install \
13 |     htop \
14 |     parallel \
15 |     curl \
16 |     tar \
17 |     vim \
18 |     nano \
19 |     bc \
20 |     bzip2 \
21 |     unzip \
22 |     libssl-dev \
23 |     make \
24 |     cmake \
25 |     libcurl4-openssl-dev \
26 |     default-jre &&
27 |   sudo apt-get -y install dirmngr apt-transport-https \
28 |     ca-certificates \
29 |     gnupg2 \
30 |     software-properties-common \
31 |     zlib1g-dev \
32 |     libbz2-dev \
33 |     liblzma-dev \
34 |     libxml2-dev \
35 |     openssh-server \
36 |     default-libmysqlclient-dev \
37 |     acl \
38 |     g++ \
39 |     autoconf \
40 |     automake \
41 |     git libmagickwand-dev libtool \
42 |     libexpat1-dev \
43 |     ghostscript \
44 |     graphviz \
45 |     libgraphviz-dev \
46 |     pkg-config \
47 |     libxml-simple-perl \
48 |     zlib1g-dev &&
49 | 
50 |   # install R	sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' && echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list && echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list && sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' && sudo apt update && sudo apt -y install r-base && sudo apt -y install python3 python3-pip &&
51 |   sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' &&
52 |   echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list &&
53 |   echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list &&
54 |   sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' &&
55 |   sudo apt update &&
56 |   sudo apt -y install r-base &&
57 |   sudo apt -y install python3 python3-pip &&
58 |   # all python config pip3 install numpy pandas && pip3 install MACS2 && pip3 install dxpy  jupytext  scikit-learn  google-api-core  igv  igv-jupyter firecloud-dalmatian  awscli  seaborn  pipreqs  pysradb  nbstripout  bokeh  matplotlib  deeptools  tensorflow  cutadapt ipykernel jupyter_contrib_nbextensions && jupyter contrib nbextension install && nbstripout --install --global && ipykernel install && nbstripout --install --global
59 |   pip3 install numpy pandas &&
60 | 
61 |   # search history
62 |   touch ~/.inputrc &&
63 |   echo "$include /etc/inputrc" >~/.inputrc &&
64 |   echo ""\e[A":history-search-backward" >~/.inputrc &&
65 |   echo ""\e[B":history-search-forward" >~/.inputrc &&
66 | 
67 |   # all R config
68 |   export R_LIBS="~/R/x86_64-pc-linux-gnu-library/3.6" &&
69 |   # install all nice R packages R -e "install.packages(c('plyr','dplyr','tidyverse','magrittr','reshape2','useful','ggplot2','ggthemes','ggrepel','gridExtra','ggridges','GGally','plotly','VennDiagram','RColorBrewer','extrafont','cowplot', 'network','data.table','DT','readr','readxl','clues','mclust','pheatmap','Rtsne','NMF','hash', 'stringr', 'irr', 'zoo', 'devtools', 'scales', 'rlang', 'rmarkdown','lsa','BiocManager'), dependencies=TRUE, repos='http://cran.rstudio.com/');  font_import(); loadfonts(); BiocManager::install(c('GSEABase','limma','org.Hs.eg.db','GenomicRanges','DESeq2'));	  print('if can't use broad intranet, install from source with [R CMD INSTALL .] for 'taigr',	    'cdsr','svacd', 'cell_line_mapping/celllinemapr')"
70 |   R -e "install.packages(c('plyr','dplyr','tidyverse','magrittr','reshape2','useful','ggplot2','ggthemes',\
71 |     'ggrepel','gridExtra','ggridges','GGally','plotly','VennDiagram','RColorBrewer','extrafont','cowplot',\
72 |     'network','data.table','DT','readr','readxl','clues','mclust','pheatmap','Rtsne','NMF','hash',\
73 |     'stringr', 'irr', 'zoo', 'devtools', 'scales', 'rlang', 'rmarkdown','lsa','BiocManager'), \
74 |     dependencies=TRUE, repos='http://cran.rstudio.com/'); \
75 |     font_import();\
76 |     loadfonts();\
77 |     BiocManager::install(c('GSEABase','Seurat','batchelor','limma','org.Hs.eg.db','GenomicRanges','DESeq2'));\
78 |     devtools::install_github('broadinstitute/taigr');\
79 |     devtools::install_github('broadinstitute/celligner');\
80 |     " &&
81 |   R -e "library(devtools);\
82 |           devtools::install_github('broadinstitute/celligner')"
83 | 


--------------------------------------------------------------------------------
/R/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(.average_correction)
 4 | export(.center_along_batch_vector)
 5 | export(.compute_tricube_average)
 6 | export(.tricube_weighted_correction)
 7 | export(calc_gene_stats)
 8 | export(calc_tumor_CL_cor)
 9 | export(check_NAs)
10 | export(cluster_data)
11 | export(create_Seurat_object)
12 | export(find_differentially_expressed_genes)
13 | export(get_cluster_averages)
14 | export(load_additional_data)
15 | export(load_data)
16 | export(modified_mnnCorrect)
17 | export(run_Celligner)
18 | export(run_MNN)
19 | export(run_cPCA)
20 | export(run_cPCA_analysis)
21 | export(run_lm_stats_limma_group)
22 | export(run_multidataset_alignment)
23 | importFrom(BiocNeighbors,queryKNN)
24 | importFrom(BiocParallel,SerialParam)
25 | importFrom(magrittr,"%>%")
26 | 


--------------------------------------------------------------------------------
/R/README.md:
--------------------------------------------------------------------------------
  1 | # Celligner
  2 | 
  3 | ![](docs/typical_celligner.webp)
  4 | 
  5 | celligner is a computational project to align multiple cancer datasets across sequencing modalities, tissue conditions (media, perturbations..) and format (CL/tumor/organoids/spheroids)
  6 | 
  7 | see our latest paper on aligning CCLE cell lines with TCGA tumors:
  8 | [2020 paper](https://www.nature.com/articles/s41467-020-20294-x)
  9 | 
 10 | 
 11 | ## Install
 12 | 
 13 | ### Local
 14 | 
 15 | ``` r
 16 | library(devtools)
 17 | devtools::install_github("broadinstitute/celligner/R")
 18 | ```
 19 | 
 20 | if you could not install taigr:
 21 | ```r
 22 | devtools::install_github("broadinstitute/taigr")
 23 | ```
 24 | 
 25 | ### Docker
 26 | 
 27 | a docker image is available at: [jkobject/celligner](https://hub.docker.com/r/jkobject/celligner)
 28 | 
 29 | the Dockerfile is listed in this repo.
 30 | 
 31 | for now you will need to copy your taiga secret file to the docker first
 32 | 
 33 | ## run Celligner
 34 | 
 35 | The package can be loaded by calling
 36 | ``` r
 37 | library(celligner)
 38 | ```
 39 | 
 40 | please note that celligner might use a significant amount of RAM (around 50GBs)
 41 | 
 42 | The entire pipeline can be run by calling **run_Celligner()**.
 43 | 
 44 | ### parameters
 45 |   - *cell_line_data_name* : if *cell_line_taiga* = TRUE, then the data.name of the taiga file containing the cell line expression data, 
 46 |   if *cell_line_taiga*=FALSE, then the file path to the local folder containing the cell line expression data. To run the pipeline on
 47 |   new DepMap data this is the only parameter that should need to be updated (to refer to the new virtual dataset for the relevant release).
 48 |   - *cell_line_data_file* : if *cell_line_taiga* = TRUE, then the data.file of the taiga file containing the cell line expression data,
 49 |   if *cell_line_taiga*=FALSE, then the name of the file of cell line expression data. By default uses the virtual dataset data file 'CCLE_expression_full'.
 50 |   - *cell_line_version* : (optional) parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
 51 |   - *cell_line_taiga*: if TRUE (default) then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder
 52 |   - *cell_line_ann_name* : if *cell_line_ann_taiga* = TRUE, then the data.name of the taiga file containing the cell line annotations,
 53 |   if *cell_line_ann_taiga*=FALSE, then the file path to the local folder containing the cell line annotations. By default pulls the arxspan data from taiga.
 54 |   - *cell_line_ann_file* : if *cell_line_ann_taiga* = TRUE, then the data.file of the taiga file containing the cell line annotations,
 55 |   if *cell_line_ann_taiga*=FALSE, then the name of the file of cell line annotations. If pulling from taiga (default), assumes that the file is the arxspan
 56 |   file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
 57 |   - *cell_line_ann_version* : (optional) parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL
 58 |   - *cell_line_ann_taiga* : if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder
 59 |   - *tumor_data_name* : if *tumor_taiga* = TRUE (default), then the data.name of the taiga file containing the tumor expression data,
 60 |   if *tumor_taiga*=FALSE, then the file path to the local folder containing the tumor expression data. By default, pulls the TCGA+ (TCGA, TARGET, & Treehouse data 
 61 |   downloaded from xena browser)
 62 |   - *tumor_data_file* : if *tumor_taiga* = TRUE (default), then the data.file of the taiga file containing the tumor expression data,
 63 |   if *tumor_taiga*=FALSE, then the name of the file of tumor expression data
 64 |   - *tumor_version* : (optional) parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL
 65 |   - *tumor_taiga* : if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder
 66 |   - *tumor_ann_name* : if *tumor_ann_taiga* = TRUE (default), then the data.name of the taiga file containing the tumor annotations,
 67 |   if *tumor_ann_taiga*=FALSE, then the file path to the local folder containing the tumor annotations
 68 |   - *tumor_ann_file* : if *tumor_ann_taiga* = TRUE (default), then the data.file of the taiga file containing the tumor annotations,
 69 |   if *tumor_ann_taiga*=FALSE, then the name of the file the tumor annotations. If pulling from taiga, assumes that the file is the already 
 70 |   created Celligner info file used in the Celligner manuscript, if not then assumes it is a local file containing the columns 
 71 |   sampleID, lineage, subtype, and type=='tumor'.
 72 |   - *tumor_ann_version* : (optional) parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
 73 |   - *tumor_ann_taiga* : if TRUE (default) then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
 74 |   - *additional_annotations_name* : if *additional_annotations_taiga* = TRUE (default), then the data.name of the taiga file containing the additional annotations,
 75 |   if *additional_annotations_taiga*=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
 76 |   for the cell lines. If NULL, assumes there are no additional annotations. 
 77 |   - *additional_annotations_file* : if *additional_annotations_taiga* = TRUE (default), then the data.file of the taiga file containing the additional annotations,
 78 |   if *additional_annotations_taiga*=FALSE, then the name of the file the additional annotations. If null, assumes there are
 79 |   no additional annotations. By default pulls the Celligner_info file, used in the Celligner manuscript.
 80 |   - *additional_annotations_version* : (optional) parameter to specify the version to pull from taiga for the additional annotations, default set to NULL
 81 |   - *additional_annotations_taiga* : if TRUE (default) then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder
 82 |   - *hgnc_data_name* : if *hgnc_taiga* = TRUE (default), then the data.name of the taiga file containing the HGNC gene annotations,
 83 |   if *hgnc_taiga*=FALSE, then the file path to the local folder containing the HGNC gene annotations
 84 |   - *hgnc_data_file* : if *hgnc_taiga* = TRUE (default), then the data.file of the taiga file containing the HGNC gene annotations,
 85 |   if *hgnc_taiga*=FALSE, then the name of the file of HGNC gene annotations
 86 |   - *hgnc_version* : (optional) parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
 87 |   - *hgnc_taiga* : if TRUE (default) then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
 88 |   - *save_output* : by default is NULL and won't save output, to save output pass in a filepath of where to save the output
 89 | 
 90 | ### Returns
 91 |   - a seurat object containing the Celligner-aligned data, the UMAP dimensionality reduction of the Celligner-aligned data, clustering, sample metadata
 92 |   - some plots
 93 |   - if *save_output* is given a filepath then output files will be saved to the folder specified
 94 |   
 95 | ## Multidataset alignment
 96 | 
 97 | **run_multidataset_alignment()** is similar to **run_Celligner()**, but also aligns the _Met500_ dataset and two _PDX_ datasets, by default pulling them from taiga. See more notes on multidataset alignment in [this google document](https://docs.google.com/document/d/11FvwosKXieYT0sRuyOkjCG1ZiyxYDQohx5-dyrY6LVg) 
 98 | 
 99 | Follow our Slack discussion on the Broad Institute's [#tumor-to-cl](#) channel
100 | 
101 | Follow the project on our [Asana page](https://app.asana.com/0/482696339531494/list)
102 | 
103 | Please use _github issues_ for any problem related to the tool.
104 | 
105 | Maintainers:
106 | 
107 | Jérémie Kalfon @jkobject
108 | James McFarland
109 | Javad Noorbak @jnoorbak
110 | 
111 | Created by: 
112 | 
113 | Allie Warren @awarren
114 | 
115 | 


--------------------------------------------------------------------------------
/R/global_params.R:
--------------------------------------------------------------------------------
 1 | # Default parameters shared by the Celligner pipeline steps
 2 | celligner_global <- list(
 3 |   n_genes = "all", # "all" uses every protein coding gene found in both datasets
 4 |   umap_n_neighbors = 10, # nearest-neighbor count used to create the UMAP plot
 5 |   umap_min_dist = 0.5, # min distance used to create the UMAP plot
 6 |   mnn_k_CL = 5, # number of nearest neighbors of tumors in the cell line data
 7 |   mnn_k_tumor = 50, # number of nearest neighbors of cell lines in the tumor data
 8 |   top_DE_genes_per = 1000, # differentially expressed genes ranked better than this in the cell line
 9 |   # or tumor data are used to identify mutual nearest neighbors in the MNN alignment step
10 |   remove_cPCA_dims = c(1, 2, 3, 4), # cPCA dimensions to regress out of the data
11 |   distance_metric = "euclidean", # distance metric used for the UMAP projection
12 |   mod_clust_res = 5, # resolution parameter used when clustering the data
13 |   mnn_ndist = 3, # ndist parameter passed to MNN
14 |   n_PC_dims = 70, # number of PCs used for dimensionality reduction
15 |   reduction.use = "umap", # 2D projection used for plotting
16 |   fast_cPCA = 10 # a value >= 4 runs fast cPCA (approximates the cPCA eigenvectors instead of computing all)
17 | )
18 | # Hex color for each tissue label; engineered_* labels reuse the color of their base tissue
19 | tissue_colors <- c(central_nervous_system = "#f5899e", engineered_central_nervous_system = "#f5899e",
20 |                    teratoma = "#f5899e",
21 |                    bone = "#9f55bb",
22 |                    pancreas = "#b644dc",
23 |                    soft_tissue = "#5fdb69",
24 |                    skin = "#6c55e2",
25 |                    liver = "#9c5e2b",
26 |                    blood = "#da45bb",
27 |                    lymphocyte = "#abd23f",
28 |                    peripheral_nervous_system = "#73e03d",
29 |                    ovary = "#56e79d", engineered_ovary = "#56e79d",
30 |                    adrenal = "#e13978", adrenal_cortex = "#e13978",
31 |                    upper_aerodigestive = "#5da134",
32 |                    kidney = "#1f8fff", engineered_kidney = "#1f8fff",
33 |                    gastric = "#dfbc3a",
34 |                    eye = "#349077",
35 |                    nasopharynx = "#a9e082",
36 |                    nerve = "#c44c90",
37 |                    unknown = "#999999",
38 |                    cervix = "#5ab172",
39 |                    thyroid = "#d74829",
40 |                    lung = "#51d5e0", engineered_lung = "#51d5e0",
41 |                    rhabdoid = "#d04850",
42 |                    germ_cell = "#75dfbb", embryo = "#75dfbb",
43 |                    colorectal = "#96568e",
44 |                    endocrine = "#d1d684",
45 |                    bile_duct = "#c091e3",
46 |                    pineal = "#949031",
47 |                    thymus = "#659fd9",
48 |                    mesothelioma = "#dc882d",
49 |                    prostate = "#3870c9", engineered_prostate = "#3870c9",
50 |                    uterus = "#e491c1",
51 |                    breast = "#45a132", engineered_breast = "#45a132",
52 |                    urinary_tract = "#e08571",
53 |                    esophagus = "#6a6c2c",
54 |                    fibroblast = "#d8ab6a",
55 |                    plasma_cell = "#e6c241")
56 | 


--------------------------------------------------------------------------------
/R/install_packages.R:
--------------------------------------------------------------------------------
 1 | options(repos = c("https://cran.cnr.berkeley.edu")) # replaced by the repository pair set just below
 2 | options(repos = c(
 3 |       "https://iwww.broadinstitute.org/~datasci/R-packages",
 4 |       "https://cran.rstudio.com/"))
 5 | cran_packages <- c("tidyverse", "reshape2", "plyr", "data.table", "Seurat", "pdist", "FNN", "irlba")
 6 | new_cran_packages <- setdiff(cran_packages, installed.packages()[, "Package"])
 7 | if (length(new_cran_packages) > 0) install.packages(new_cran_packages) # only what is not installed yet
 8 | 
 9 | bioconductor_packages <- c("limma", "batchelor", "BiocParallel")
10 | new_bioconductor_packages <- setdiff(bioconductor_packages, installed.packages()[, "Package"])
11 | if (length(new_bioconductor_packages) > 0) {
12 |   # BiocManager is required to install from Bioconductor; fetch it first when absent
13 |   if (!requireNamespace("BiocManager", quietly = TRUE)) {
14 |     install.packages("BiocManager")
15 |   }
16 |   BiocManager::install(new_bioconductor_packages) }
17 | 


--------------------------------------------------------------------------------
/R/mutlidataset_alignment.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Load additional expression and annotation data
  3 | #' @name load_additional_data
  4 | #'
  5 | #' @param data_name if data_taiga = TRUE, then the data.name of the taiga file containing the expression data,
  6 | #' if data_taiga = FALSE, then the file path to the local folder containing the expression data. Assumes that genes
  7 | #' are labeled using ensembl IDs and that there are fewer samples than genes in the matrix; a matrix with more rows
  8 | #' than columns is transposed so that rows are samples and columns are genes.
  9 | #' @param data_file if data_taiga = TRUE, then the data.file of the taiga file containing the expression data,
 10 | #' if data_taiga = FALSE, then the name of the local csv file of expression data (its first column supplies the row names)
 11 | #' @param data_version (optional) parameter to specify the version to pull from taiga for the expression data, default set to NULL
 12 | #' @param data_taiga if TRUE (default) then pulls the expression data from taiga, if FALSE then finds expression data in local folder
 13 | #' @param ann_name if ann_taiga = TRUE, then the data.name of the taiga file containing the data annotations,
 14 | #' if ann_taiga = FALSE, then the file path to the local folder containing the annotations
 15 | #' @param ann_file if ann_taiga = TRUE, then the data.file of the taiga file containing the data annotations,
 16 | #' if ann_taiga = FALSE, then the name of the file of data annotations
 17 | #' @param ann_version (optional) parameter to specify the version to pull from taiga for the annotations, default set to NULL
 18 | #' @param ann_taiga if TRUE (default) then pulls the annotations from taiga, if FALSE then finds annotations in local folder
 19 | #' @param data_type string written to the type column of taiga or placeholder annotations (a valid local file keeps its own type), default is ""
 20 | #' @description load additional expression and annotation files
 21 | #'
 22 | #' @return list with elements mat (the expression matrix) and ann (the annotations table)
 23 | #' @export
 24 | #'
 25 | load_additional_data <- function(data_name, data_file, data_version = NULL, data_taiga = TRUE,
 26 |                      ann_name, ann_file, ann_version = NULL, ann_taiga = TRUE, data_type = "") {
 27 | 
 28 |   # ---- expression matrix ----
 29 |   if(data_taiga) {
 30 |     data_mat <- taigr::load.from.taiga(data.name = data_name, data.version = data_version, data.file = data_file)
 31 |     if(is.null(data_mat)) {
 32 |       stop("expression data file input does not exist on taiga")
 33 |     }
 34 |   } else {
 35 |     data_path <- file.path(data_name, data_file)
 36 |     if(!file.exists(data_path)) {
 37 |       stop('expression data file input does not exist')
 38 |     }
 39 |     data_df <- as.data.frame(readr::read_csv(data_path))
 40 |     # The first column holds the row identifiers. readr names a blank header 'X1' in
 41 |     # older releases and '...1' from readr 2.0 on, so pick the column by position
 42 |     # instead of hard-coding either name.
 43 |     data_mat <- as.matrix(tibble::column_to_rownames(data_df, colnames(data_df)[1]))
 44 |   }
 45 | 
 46 |   # transpose matrix, if needed, so rownames are samples and column names are genes
 47 |   if(nrow(data_mat) > ncol(data_mat)) {
 48 |     warning('more rows than columns, taking transpose of expression matrix')
 49 |     data_mat <- t(data_mat)
 50 |   }
 51 | 
 52 |   # placeholder annotations (one row per row of data_mat) used when no usable annotation table is found
 53 |   default_ann <- function() {
 54 |     data.frame(sampleID =  rownames(data_mat),
 55 |                lineage = NA,
 56 |                subtype = NA,
 57 |                type = data_type)
 58 |   }
 59 | 
 60 |   # ---- annotations ----
 61 |   if(ann_taiga) {
 62 |     ann <- taigr::load.from.taiga(data.name = ann_name, data.version = ann_version, data.file = ann_file)
 63 |     column_names <- c('sampleID', 'lineage', 'subtype')
 64 |     # fall back to the placeholder table when the taiga file is missing or lacks the expected columns
 65 |     if(is.null(ann)) {
 66 |       warning('annotation file does not exist on taiga, creating default annotations')
 67 |       ann <- default_ann()
 68 |     } else if(!all(column_names %in% colnames(ann))) {
 69 |       warning('annotation file does not contain expected columns (sampleID, lineage, & subtype), creating default annotations')
 70 |       ann <- default_ann()
 71 |     } else {
 72 |       # keep only the expected columns and label every row with data_type
 73 |       ann <- ann[,column_names]
 74 |       ann$type <- data_type
 75 |     }
 76 |   } else {
 77 |     ann_path <- file.path(ann_name, ann_file)
 78 |     if(file.exists(ann_path)) {
 79 |       ann <- as.data.frame(data.table::fread(ann_path))
 80 |     } else {
 81 |       warning('annotation file does not exist, creating default annotations')
 82 |       ann <- default_ann()
 83 |     }
 84 |     # unlike the taiga branch, a local annotation file must already carry its own 'type' column
 85 |     if(!all(c('sampleID', 'lineage', 'subtype', 'type') %in% colnames(ann))) {
 86 |       warning('annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations')
 87 |       ann <- default_ann()
 88 |     }
 89 |   }
 90 | 
 91 |   # mat: expression matrix (possibly transposed above); ann: annotation data frame
 92 |   return(list(mat = data_mat, ann = ann))
 93 | 
 94 | }
 95 | 
 96 | 
 97 | 
 98 | #' All methods to run Celligner, with additional alignment of Met500 and PDX data, and save the output, if desired
 99 | #' @name run_multidataset_alignment
100 | #'
101 | #' @param cell_line_data_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
102 | #' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data
103 | #' @param cell_line_data_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
104 | #' if cell_line_taiga=FALSE, then the name of the file of cell line expression data
105 | #' @param cell_line_version: parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
106 | #' @param cell_line_taiga: if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder
107 | #' @param cell_line_ann_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
108 | #' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations
109 | #' @param cell_line_ann_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
110 | #' if cell_line_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
111 | #' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
112 | #' @param cell_line_ann_version: parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL
113 | #' @param cell_line_ann_taiga: if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder
114 | #' @param tumor_data_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
115 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.
116 | #' @param tumor_data_file: if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
117 | #' if tumor_taiga=FALSE, then the name of the file the tumor expression data
118 | #' @param tumor_version: parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL
119 | #' @param tumor_taiga: if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder
120 | #' @param tumor_ann_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
121 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations
122 | #' @param tumor_ann_file: if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
123 | #' if tumor_ann_taiga=FALSE, then the name of the file the tumor annotations. If pulling from taiga, assumes that the file is the already create Celligner info file used in the Celligner manuscript,
124 | #'  if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.
125 | #' @param tumor_ann_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
126 | #' @param tumor_ann_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
127 | #' @param additional_annotations_name: if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
128 | #' if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
129 | #' for the cell lines. If null, assumes there are no additional annotations.
130 | #' @param additional_annotations_file: if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
131 | #' if additional_annotations_taiga=FALSE, then the name of the file the additional annotations. If null, assumes there are
132 | #' no additional annotations.
133 | #' @param additional_annotations_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
134 | #' @param additional_annotations_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
135 | #' @param hgnc_data_name: if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
136 | #' if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations
137 | #' @param hgnc_data_file: if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
138 | #' if hgnc_taiga=FALSE, then the name of the file the HGNC gne annotations
139 | #' @param hgnc_version: parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
140 | #' @param hgnc_taiga: if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
141 | #' @param met500_data_name: Met500 expression, default pulls from taiga, this the data_name of the taiga dataset, or path to folder if using met500_taiga=F
142 | #' @param met500_data_file: default pulls from taiga, this the data_file of the taiga dataset, or name of local file if using met500_taiga=F
143 | #' @param met500_version: default NULL, used to specify version of taiga dataset
144 | #' @param met500_taiga: if TRUE (default) pulls Met500 expression from taiga dataset, if FALSE reads from local
145 | #' @param met500_ann_name: Met500 annotations, default pulls from taiga, this the data_name of the taiga dataset, or path to folder is using met500_ann_taiga=F
146 | #' @param met500_ann_file: Met500 annotations, default pulls from taiga, this the data_file of the taiga dataset, or name of local file is using met500_ann_taiga=F
147 | #' @param met500_ann_version: default NULL, used to specify version of taiga dataset
148 | #' @param met500_ann_taiga: if TRUE (default) pulls met500 annotations from taiga dataset, if FALSE reads from local
149 | #' @param Novartis_PDX_data_name: Novartis PDX expression, default pulls from taiga, this the data_name of the taiga dataset, or path to folder if using Novartis_PDX_taiga=F
150 | #' @param Novartis_PDX_data_file: default pulls from taiga, this the data_file of the taiga dataset, or name of local file if using Novartis_PDX_taiga=F
151 | #' @param Novartis_PDX_version: default NULL, used to specify version of taiga dataset
152 | #' @param Novartis_PDX_taiga: if TRUE (default) pulls Novartis PDX expression from taiga dataset, if FALSE reads from local
153 | #' @param Novartis_PDX_ann_name: Novartis PDX annotations, default pulls from taiga, this the data_file of the taiga dataset, or path to folder is using met500_ann_taiga=F
154 | #' @param Novartis_PDX_ann_file: Novartis PDX annotations, default pulls from taiga, this the data_name of the taiga dataset, or name of local file is using Novartis_PDX_ann_taiga=F
155 | #' @param Novartis_PDX_ann_version: default NULL, used to specify version of taiga dataset
156 | #' @param Novartis_PDX_ann_taiga: if TRUE (default) pulls Novartis PDX annotations from taiga dataset, if FALSE reads from local
157 | #' @param pediatric_PDX_data_name: pediatric PDX expression, default pulls from taiga, this the data_name of the taiga dataset, or path to folder if using pediatric_PDX_taiga=F
158 | #' @param pediatric_PDX_data_file: default pulls from taiga, this the data_file of the taiga dataset, or name of local file if using pediatric_PDX_taiga=F
159 | #' @param pediatric_PDX_version: default NULL, used to specify version of taiga dataset
160 | #' @param pediatric_PDX_taiga: if TRUE (default) pulls pediatric PDX expression from taiga dataset, if FALSE reads from local
161 | #' @param pediatric_PDX_ann_name: Pediatric PDX annotations, default pulls from taiga, this the data_name of the taiga dataset, or path to folder is using pediatric_PDX_ann_taiga=F
162 | #' @param pediatric_PDX_ann_file: Pediatric PDX annotations, default pulls from taiga, this the data_file of the taiga dataset, or name of local file is using pediatric_PDX_ann_taiga=F
163 | #' @param pediatric_PDX_ann_version: default NULL, used to specify version of taiga dataset
164 | #' @param pediatric_PDX_ann_taiga: if TRUE (default) pulls pediatric PDX annotations from taiga dataset, if FALSE reads from local
165 | #' @param save_output: by default is NULL and won't save output, to save output pass in a filepath of where to save the output
166 | #'
167 | #' @importFrom magrittr "%>%"
168 | #'
169 | #' @description run all parts of the Celligner pipeline, with alignment of additional datasets
170 | #'
171 | #' @return seurat object of the Celligner-aligned data
172 | #' @export
173 | #'
174 | run_multidataset_alignment <- function(cell_line_data_name='public-20q4-a4b3', cell_line_data_file = 'CCLE_expression_full', cell_line_version = NULL, cell_line_taiga=TRUE,
175 |                           cell_line_ann_name='arxspan-cell-line-export-f808', cell_line_ann_file = 'ACH',cell_line_ann_version = NULL, cell_line_ann_taiga=TRUE,
176 |                           tumor_data_name = 'celligner-input-9827', tumor_data_file = 'tumor_expression', tumor_version = NULL, tumor_taiga = TRUE,
177 |                           tumor_ann_name = 'celligner-input-9827', tumor_ann_file = 'tumor_annotations', tumor_ann_version = NULL, tumor_ann_taiga = TRUE,
178 |                           additional_annotations_name = 'celligner-input-9827', additional_annotations_file = 'CCLE_annotations', additional_annotations_version = NULL, additional_annotations_taiga = TRUE,
179 |                           hgnc_data_name = 'hgnc-87ab', hgnc_data_file='hgnc_complete_set', hgnc_version= NULL, hgnc_taiga = TRUE,
180 |                           met500_data_name = 'met500-fc3c', met500_data_file = 'met500_TPM', met500_version = NULL, met500_taiga = TRUE,
181 |                           met500_ann_name = 'met500-fc3c', met500_ann_file = 'met500_ann', met500_ann_version = NULL, met500_ann_taiga = TRUE,
182 |                           Novartis_PDX_data_name = 'pdx-data-3d29', Novartis_PDX_data_file = 'Novartis_PDX_TPM', Novartis_PDX_version = NULL, Novartis_PDX_taiga = TRUE,
183 |                           Novartis_PDX_ann_name = 'pdx-data-3d29', Novartis_PDX_ann_file = 'Novartis_PDX_ann', Novartis_PDX_ann_version = NULL, Novartis_PDX_ann_taiga = TRUE,
184 |                           pediatric_PDX_data_name = 'pdx-data-3d29', pediatric_PDX_data_file = 'pediatric_PDX_TPM', pediatric_PDX_version = NULL, pediatric_PDX_taiga = TRUE,
185 |                           pediatric_PDX_ann_name = 'pdx-data-3d29', pediatric_PDX_ann_file = 'pediatric_PDX_ann', pediatric_PDX_ann_version = NULL, pediatric_PDX_ann_taiga = TRUE,
186 |                           save_output = NULL) {
187 | 
188 |   require(magrittr)
189 |   require(tidyverse)
190 | 
191 |   dat <- load_data(cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
192 |                    cell_line_ann_name, cell_line_ann_file,cell_line_ann_version, cell_line_ann_taiga,
193 |                    tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
194 |                    tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
195 |                    additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
196 |                    hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)
197 | 
198 |   met500 <- load_additional_data(met500_data_name, met500_data_file, met500_data_version, met500_data_taiga,
199 |                                  met500_ann_name, met500_ann_file, met500_ann_version, met500_ann_taiga)
200 |   Novartis_PDX <- load_additional_data(Novartis_PDX_data_name, Novartis_PDX_data_file, Novartis_PDX_data_version, Novartis_PDX_data_taiga,
201 |                                        Novartis_PDX_ann_name, Novartis_PDX_ann_file, Novartis_PDX_ann_version, Novartis_PDX_ann_taiga)
202 | 
203 |   pediatric_PDX <- load_additional_data(pediatric_PDX_data_name, pediatric_PDX_data_file, pediatric_PDX_data_version, pediatric_PDX_data_taiga,
204 |                                        pediatric_PDX_ann_name, pediatric_PDX_ann_file, pediatric_PDX_ann_version, pediatric_PDX_ann_taiga)
205 | 
206 |   shared_genes <- intersect(colnames(dat$TCGA_mat), colnames(dat$CCLE_mat)) %>%
207 |     intersect(colnames(met500$mat)) %>%
208 |     intersect(colnames(Novartis_PDX$mat)) %>%
209 |     intersect(colnames(pediatric_PDX$mat))
210 | 
211 |   dat$TCGA_mat <- dat$TCGA_mat[,shared_genes]
212 |   dat$CCLE_mat <- dat$CCLE_mat[,shared_genes]
213 |   met500$mat <- met500$mat[,shared_genes]
214 |   Novartis_PDX$mat <- Novartis_PDX$mat[,shared_genes]
215 |   pediatric_PDX$mat <- pediatric_PDX$mat[,shared_genes]
216 | 
217 |   gene_stats <- calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)
218 | 
219 |   comb_ann <- rbind(
220 |     dat$TCGA_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
221 |       dplyr::mutate(type = 'tumor'),
222 |     dat$CCLE_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
223 |       dplyr::mutate(type = 'CL')
224 |   )
225 | 
226 |   TCGA_obj <- create_Seurat_object(dat$TCGA_mat, dat$TCGA_ann, type='tumor')
227 |   CCLE_obj <- create_Seurat_object(dat$CCLE_mat, dat$CCLE_ann, type='CL')
228 | 
229 |   TCGA_obj <- cluster_data(TCGA_obj)
230 |   CCLE_obj <- cluster_data(CCLE_obj)
231 | 
232 |   tumor_DE_genes <- find_differentially_expressed_genes(TCGA_obj)
233 |   CL_DE_genes <- find_differentially_expressed_genes(CCLE_obj)
234 | 
235 |   DE_genes <- full_join(tumor_DE_genes, CL_DE_genes, by = 'Gene', suffix = c('_tumor', '_CL')) %>%
236 |     mutate(
237 |       tumor_rank = dplyr::dense_rank(-gene_stat_tumor),
238 |       CL_rank = dplyr::dense_rank(-gene_stat_CL),
239 |       best_rank = pmin(tumor_rank, CL_rank, na.rm=T)) %>%
240 |     dplyr::left_join(gene_stats, by = 'Gene')
241 | 
242 |   # take genes that are ranked in the top 1000 from either dataset, used for finding mutual nearest neighbors
243 |   DE_gene_set <- DE_genes %>%
244 |     dplyr::filter(best_rank < celligner_global$top_DE_genes_per) %>%
245 |     .[['Gene']]
246 | 
247 | 
248 |   cov_diff_eig <- run_cPCA(TCGA_obj, CCLE_obj, celligner_global$fast_cPCA)
249 | 
250 |   if(is.null(celligner_global$fast_cPCA)) {
251 |     cur_vecs <- cov_diff_eig$vectors[, celligner_global$remove_cPCA_dims, drop = FALSE]
252 |   } else {
253 |     cur_vecs <- cov_diff_eig$rotation[, celligner_global$remove_cPCA_dims, drop = FALSE]
254 |   }
255 | 
256 |   rownames(cur_vecs) <- colnames(dat$TCGA_mat)
257 |   TCGA_cor <- resid(lm(t(dat$TCGA_mat) ~ 0 + cur_vecs)) %>% t()
258 |   CCLE_cor <- resid(lm(t(dat$CCLE_mat) ~ 0 + cur_vecs)) %>% t()
259 | 
260 |   mnn_res <- run_MNN(CCLE_cor, TCGA_cor,  k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
261 |                      subset_genes = DE_gene_set)
262 | 
263 |   combined_mat <- rbind(mnn_res$corrected, CCLE_cor)
264 | 
265 |   # clear unused objects
266 |   rm(TCGA_obj); rm(CCLE_obj); rm(cov_diff_eig); rm(TCGA_cor); rm(CCLE_cor); gc()
267 | 
268 |   # Met500 alignment
269 |   met500_cor <- resid(lm(t(met500$mat) ~ 0 + cur_vecs)) %>% t()
270 | 
271 |   mnn_res <- run_MNN(combined_mat, met500_cor,  k1 = 20, k2 = 50, ndist = celligner_global$mnn_ndist,
272 |                      subset_genes = DE_gene_set)
273 |   combined_mat <- rbind(combined_mat, mnn_res$corrected)
274 | 
275 |   ## align PDX datasets
276 | 
277 |   ### PDX - Novartis
278 |   Novartis_PDX_cor <- resid(lm(t(Novartis_PDX$mat) ~ 0 + cur_vecs)) %>% t()
279 | 
280 |   mnn_res_Novartis_PDX <- run_MNN(combined_mat, Novartis_PDX_cor, k1 = 10, k2 = 50, ndist = 3,
281 |                          subset_genes = DE_gene_set)
282 | 
283 |   combined_mat <- rbind(combined_mat, mnn_res_Novartis_PDX$corrected)
284 | 
285 |   ### PDX - pediatric
286 |   pediatric_PDX_cor <- resid(lm(t(pediatric_PDX$mat) ~ 0 + cur_vecs)) %>% t()
287 | 
288 |   mnn_res_pediatric_PDX <- run_MNN(combined_mat[-which(rownames(combined_mat) %in% rownames(pediatric_PDX_cor)),],
289 |                                pediatric_PDX_cor, k1 = 10, k2 = 50, ndist = 3,
290 |                                subset_genes = DE_gene_set)
291 | 
292 |   combined_mat <- t(rbind(combined_mat, mnn_res_pediatric_PDX$corrected))
293 | 
294 |   # combine all output
295 |   comb_ann <- rbind.data.frame(comb_ann[,c('sampleID', 'lineage', 'subtype', 'type')],
296 |                                met500$ann[,c('sampleID', 'lineage', 'subtype', 'type')],
297 |                                Novartis_PDX$ann[,c('sampleID', 'lineage', 'subtype', 'type')],
298 |                                pediatric_PDX$ann[,c('sampleID', 'lineage', 'subtype', 'type')])
299 |   rownames(comb_ann) <- comb_ann$sampleID
300 |   comb_ann <- comb_ann[colnames(combined_mat),]
301 | 
302 |   # clear unused object
303 |   rm(met500); rm(Novartis_PDX); rm(pediatric_PDX); rm(met500_cor); rm(Novartis_PDX_cor); rm(pediatric_PDX_cor); gc()
304 | 
305 |    # create seurat object
306 |   comb_obj <- create_Seurat_object(combined_mat, comb_ann)
307 |   comb_obj <- cluster_data(comb_obj)
308 | 
309 |   Celligner_res <- Seurat::Embeddings(comb_obj, reduction = 'umap') %>%
310 |     as.data.frame() %>%
311 |     magrittr::set_colnames(c('UMAP_1', 'UMAP_2')) %>%
312 |     tibble::rownames_to_column(var = 'sampleID') %>%
313 |     dplyr::left_join(comb_obj@meta.data, by = 'sampleID')
314 | 
315 |   lineage_averages <- Celligner_res %>%
316 |     dplyr::filter(!lineage %in% c('adrenal_cortex', 'embryo', 'endocrine', 'engineered', 'engineered_blood',
317 |                                   'engineered_breast', 'engineered_central_nervous_system', 'engineered_kidney',
318 |                                   'engineered_lung', 'engineered_ovary', 'engineered_prostate', 'epidermoid_carcinoma',
319 |                                   'nasopharynx', 'nerve','pineal', 'teratoma', 'unknown')) %>%
320 |     dplyr::group_by(lineage) %>%
321 |     dplyr::summarise(UMAP_1 = median(UMAP_1, na.rm=T),
322 |                      UMAP_2 = median(UMAP_2, na.rm=T))
323 |   lineage_averages$lineage <- gsub("_", " ", lineage_averages$lineage)
324 |   lineage_lab_aes <- ggplot2::geom_text(data = lineage_averages, mapping = aes(x = UMAP_1, y = UMAP_2, label = lineage), size = 3, color="#000000")
325 | 
326 | 
327 |   if('type' %in% colnames(Celligner_res) & 'tumor' %in% Celligner_res$type & 'CL' %in% Celligner_res$type) {
328 |     celligner_plot <- ggplot2::ggplot(Celligner_res,  ggplot2::aes(UMAP_1, UMAP_2)) +
329 |       ggplot2::geom_point(alpha=0.7, pch=21,  ggplot2::aes(color = type, fill = lineage, size = type)) +
330 |       ggplot2::scale_color_manual(values = c(tumor = 'white', CL = 'black')) +
331 |       ggplot2::scale_size_manual(values=c(tumor=0.75, CL=1.5)) +
332 |       ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
333 |       ggplot2::guides(fill=FALSE,
334 |                       color = ggplot2::guide_legend(override.aes = list(color=c('black', 'white'), fill = c('white','black')))) +
335 |       ggplot2::theme_classic()
336 |   } else {
337 |     celligner_plot <-  ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
338 |       ggplot2::geom_point(alpha=0.7, pch=21, size = 1, ggplot2::aes(fill = lineage)) +
339 |       ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
340 |       ggplot2::theme_classic() + ggplot2::theme(legend.position = 'none')
341 |   }
342 | 
343 |   print(celligner_plot)
344 |   print(celligner_plot + lineage_lab_aes)
345 | 
346 | 
347 |   if(!is.null(save_output)) {
348 |     if(file.exists(save_output)) {
349 |       print('saving files')
350 |       write.csv(combined_mat, file.path(save_output, 'Celligner_multidataset_aligned_data.csv'))
351 |       readr::write_csv(Celligner_res, file.path(save_output, 'Celligner_multidataset_info.csv'))
352 |       ggplot2::ggsave(file.path(save_output, 'Celligner_multidataset_plot.png'), celligner_plot, device='png', width = 8, height = 6)
353 |       ggplot2::ggsave(file.path(save_output, 'labeled_Celligner_multidataset_plot.png'), celligner_plot + lineage_lab_aes, device='png', width = 8, height = 6)
354 | 
355 |     } else{
356 |       warning("can't save output, folder does not exist")
357 |     }
358 |   }
359 | 
360 |   return(comb_obj)
361 | }
362 | 
363 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Celligner
  2 | 
  3 | ![](docs/celligner_public22q2.png)
  4 | 
  5 | __Celligner__ is a computational approach for aligning tumor and cell line transcriptional profiles.
  6 | 
  7 | To learn more, see the [paper](https://www.nature.com/articles/s41467-020-20294-x)
  8 | 
  9 | ## Remark
 10 | 
 11 | __Celligner__ was originally an R project, which you can find in the `R/` folder.
 12 | 
 13 | A Python version was made that performs the same computations as the R version, but the results may differ slightly due to small implementation differences in the Louvain clustering and contrastive PCA steps.
 14 | 
 15 | ## Overview
 16 | 
 17 | A **reference** expression dataset (e.g. CCLE cell lines) should be fit using the `fit()` function, and a **target** expression dataset (e.g. TCGA+ tumor samples) can then be aligned to this reference using the `transform()` function. See the `run_celligner.py` script for example usage. Celligner is unsupervised and does not require annotations to be run; as such they are not used in this version of the model but can be added post-hoc to aid in interpretation of the output. See the `celligner_output.ipynb` notebook for an example of how to draw an output UMAP.
 18 | 
 19 | The Celligner output can be explored at: [https://depmap.org/portal/celligner/](https://depmap.org/portal/celligner/)
 20 | 
 21 | ## Install
 22 | 
 23 | > To see the old R package installation instruction, see the `R/` folder.
 24 | 
 25 | Before running pip, make sure that you have R installed.
 26 | 
 27 | To install the latest version of Celligner in dev mode, run the following (note that Celligner requires the specific version of mnnpy that is associated with the repository as a submodule):
 28 | 
 29 | ```bash
 30 | git clone https://github.com/broadinstitute/celligner.git
 31 | cd celligner
 32 | git checkout new_dev
 33 | pip install -e .
 34 | cd mnnpy 
 35 | pip install .
 36 | ```
 37 | 
 38 | A Dockerfile and build script are also provided.
 39 | 
 40 | 
 41 | ## Using Celligner
 42 | 
 43 | Celligner has `fit()` and `transform()` functions in the style of scikit-learn models.
 44 | 
 45 | A reference expression dataset (e.g. CCLE cell lines TPM expression) should first be fit:
 46 | 
 47 | ```python
 48 | from celligner import Celligner
 49 | 
 50 | my_celligner = Celligner()
 51 | my_celligner.fit(CCLE_expression)
 52 | ```
 53 | 
 54 | A target expression dataset (e.g. TCGA+ tumor samples) can then be aligned to this reference using the transform function:
 55 | 
 56 | ```python
 57 | my_celligner.transform(TCGA_expression)
 58 | ```
 59 | 
 60 | The combined transformed expression matrix can then be accessed via `my_celligner.combined_output`. Clusters, UMAP coordinates and tumor-model distances for all samples can be computed with `my_celligner.computeMetricsForOutput()`. There are also functions to save/load a fitted Celligner model as a .pkl file.
 61 | 
 62 | ### Aligning the target dataset to a new reference dataset
 63 | This use case is for the scenario where you want to align the same target dataset to a new reference dataset (which might be the same reference dataset as before with some new samples). In this case you can call transform without the target dataset to re-use the previous target dataset and skip re-doing some computation (see diagram below).
 64 | 
 65 | ```python
 66 | my_celligner.fit(new_reference_expression)
 67 | my_celligner.transform()
 68 | ```
 69 | 
 70 | ### Aligning a third dataset to the previous combined output
 71 | This use case is for the scenario where you have a third dataset (e.g. Met500 tumor samples) that you want to align to the previously aligned (e.g. CCLE+TCGA) dataset. This is the current approach for multi-dataset alignment taken by the Celligner app.
 72 | 
 73 | ```python
 74 | my_celligner.makeNewReference()
 75 | # The value of k1 should be selected based on the size of the new dataset. 
 76 | # We use k1=20 for Met500 (n=~850), and k1=10 for the PDX datasets (n=~250-450).
 77 | my_celligner.mnn_kwargs.update({"k1":20, "k2":50}) 
 78 | my_celligner.transform(met500_TPM, compute_cPCs=False)
 79 | ```
 80 | 
 81 | ### Diagram 
 82 | This diagram provides an overview of how Celligner works, including for the different use cases described above.
 83 | 
 84 | ![](docs/celligner_diagram.png)
 85 | 
 86 | ### Computational complexity
 87 | 
 88 | Depending on the dataset, Celligner can be quite memory hungry.
 89 | For TCGA, expect at least _50-60Gb_ of memory to be used. You might need a powerful computer and lots of _swap_, and you may have to increase R's default _maximum allowed memory_.
 90 | 
 91 | You can also use the `low_mem=True` option to reduce the memory used by Celligner in the memory intensive `PCA` & `cPCA` methods.
 92 | 
 93 | 
 94 | # R Celligner
 95 | 
 96 | For the original R version of Celligner, please check the R/README.md file here: [https://github.com/broadinstitute/celligner/tree/master/R/README.md](https://github.com/broadinstitute/celligner/tree/master/R/README.md)
 97 | 
 98 | ---
 99 | 
100 | __Initial project:__
101 | 
102 | Allie Warren @awarren
103 | 
104 | __Initial python version:__
105 | 
106 | Jérémie Kalfon @jkobject
107 | 
108 | __Current maintainer:__
109 | 
110 | Barbara De Kegel @bdekegel
111 | 


--------------------------------------------------------------------------------
/build_docker.sh:
--------------------------------------------------------------------------------
1 | # Builds the celligner docker image for linux/amd64 and pushes it to us.gcr.io/bdekegel/celligner:latest
2 | # Note that this docker image does not have taigapy or mnnpy installed
3 | 
4 | docker buildx build --platform linux/amd64 --push -t us.gcr.io/bdekegel/celligner:latest .


--------------------------------------------------------------------------------
/celligner/VERSION:
--------------------------------------------------------------------------------
1 | 1.1.1


--------------------------------------------------------------------------------
/celligner/__init__.py:
--------------------------------------------------------------------------------
  1 | from celligner.params import *
  2 | from celligner import limma
  3 | 
  4 | from sklearn.decomposition import PCA, IncrementalPCA
  5 | from sklearn.linear_model import LinearRegression
  6 | import sklearn.metrics as metrics
  7 | import umap.umap_ as umap
  8 | 
  9 | import scanpy as sc
 10 | from anndata import AnnData
 11 | 
 12 | import os
 13 | import pickle
 14 | import gc
 15 | 
 16 | import pandas as pd
 17 | import numpy as np
 18 | 
 19 | #from contrastive import CPCA
 20 | import mnnpy
 21 | 
 22 | 
 23 | class Celligner(object):
 24 |     def __init__(
 25 |         self,
 26 |         topKGenes=TOP_K_GENES,
 27 |         pca_ncomp=PCA_NCOMP,
 28 |         cpca_ncomp=CPCA_NCOMP,
 29 |         louvain_kwargs=LOUVAIN_PARAMS,
 30 |         mnn_kwargs=MNN_PARAMS,
 31 |         umap_kwargs=UMAP_PARAMS,
 32 |         mnn_method="mnn_marioni",
 33 |         low_mem=False,
 34 |     ):
 35 |         """
 36 |         Initialize Celligner object
 37 | 
 38 |         Args:
 39 |             topKGenes (int, optional): see params.py. Defaults to 1000.
 40 |             pca_ncomp (int, optional): see params.py. Defaults to 70.
 41 |             cpca_ncomp (int, optional): see params.py. Defaults to 4.
 42 |             louvain_kwargs (dict, optional): see params.py
 43 |             mnn_kwargs (dict, optional): see params.py 
 44 |             umap_kwargs (dict, optional): see params.py
 45 |             mnn_method (str, optional): Only default "mnn_marioni" supported right now.
 46 |             low_mem (bool, optional): adviced if you have less than 32Gb of RAM. Defaults to False.
 47 |         """
 48 |         
 49 |         self.topKGenes = topKGenes
 50 |         self.pca_ncomp = pca_ncomp
 51 |         self.cpca_ncomp = cpca_ncomp
 52 |         self.louvain_kwargs = louvain_kwargs
 53 |         self.mnn_kwargs = mnn_kwargs
 54 |         self.umap_kwargs = umap_kwargs
 55 |         self.mnn_method = mnn_method
 56 |         self.low_mem = low_mem
 57 | 
 58 |         self.ref_input = None
 59 |         self.ref_clusters = None
 60 |         self.ref_de_genes = None
 61 |         
 62 |         self.target_input = None
 63 |         self.target_clusters = None
 64 |         self.target_de_genes = None
 65 | 
 66 |         self.de_genes = None
 67 |         self.cpca_loadings = None
 68 |         self.cpca_explained_var = None
 69 |         self.combined_output = None
 70 |         
 71 |         self.umap_reduced = None
 72 |         self.output_clusters = None
 73 |         self.tumor_CL_dist = None
 74 | 
 75 | 
 76 |     def __checkExpression(self, expression, is_reference):
 77 |         """
 78 |         Checks gene overlap with self.common_genes, checks for NaNs, then mean-centers each gene.
 79 | 
 80 |         Args:
 81 |             expression (pd.Dataframe): expression data as samples (rows) x genes (columns)
 82 |             is_reference (bool): whether the expression is a reference or target; only selects the error message
 83 | 
 84 |         Raises:
 85 |             ValueError: if some common genes are missing from the expression dataset
 86 |             ValueError: if the expression matrix contains nan values
 87 | 
 88 |         Returns:
 89 |             (pd.Dataframe): float expression matrix restricted to self.common_genes, with each column mean-centered
 90 |         """
 91 |         # Check gene overlap
 92 |         if expression.loc[:, expression.columns.isin(self.common_genes)].shape[1] < len(self.common_genes):  # fewer matching columns than common genes
 93 |             if not is_reference:
 94 |                 raise ValueError("Some genes from reference dataset not found in target dataset")
 95 |             else:
 96 |                 raise ValueError("Some genes from previously fit target dataset not found in new reference dataset")
 97 |         
 98 |         expression = expression.loc[:, self.common_genes].astype(float)  # subset and reorder columns to self.common_genes
 99 |         
100 |         # Raise issue if there are any NaNs in the expression dataframe
101 |         if expression.isnull().values.any():
102 |             raise ValueError("Expression dataframe contains NaNs")
103 | 
104 |         # Mean center the expression dataframe
105 |         expression = expression.sub(expression.mean(0), 1)  # subtract each gene's (column's) mean over samples
106 |         
107 |         return expression
108 | 
109 | 
110 |     def __cluster(self, expression):
111 |         """
112 |         Cluster samples in self.pca_ncomp-dimensional PCA space using Louvain clustering on a nearest-neighbor graph
113 | 
114 |         Args:
115 |             expression (pd.Dataframe): expression data as samples (rows) x genes (columns)
116 | 
117 |         Returns:
118 |             (array of int): cluster label for each sample
119 |         """
120 |         # Create anndata object
121 |         adata = AnnData(expression, dtype='float64')
122 | 
123 |         # Find PCs
124 |         print("Doing PCA..")
125 |         sc.tl.pca(adata, n_comps=self.pca_ncomp, zero_center=True, svd_solver='arpack')
126 | 
127 |         # Find shared nearest neighbors (SNN) in PC space
128 |         # Might produce different results from the R version as ScanPy and Seurat differ in their implementation.
129 |         print("Computing neighbors..")
130 |         sc.pp.neighbors(adata, knn=True, use_rep='X_pca', n_neighbors=20, n_pcs=self.pca_ncomp)  # n_neighbors=20 is hard-coded here
131 |         
132 |         print("Clustering..")
133 |         sc.tl.louvain(adata, use_weights=True, **self.louvain_kwargs)
134 |         fit_clusters = adata.obs["louvain"].values.astype(int)  # labels are read from adata.obs["louvain"] and cast to int
135 |         
136 |         del adata  # drop the AnnData object before returning to free memory
137 |         gc.collect()
138 | 
139 |         return fit_clusters
140 | 
141 | 
142 |     def __runDiffExprOnClusters(self, expression, clusters):
143 |         """
144 |         Runs limma (R) on the clustered data, using one indicator column per cluster as the design matrix.
145 | 
146 |         Args:
147 |             expression (pd.Dataframe): expression data as samples (rows) x genes (columns)
148 |             clusters (array): the cluster labels (per sample); samples labelled -1 are excluded
149 | 
150 |         Returns:
151 |             (pd.Dataframe): limmapy results, sorted by the "F" column in descending order
152 |         """
153 | 
154 |         n_clusts = len(set(clusters))  # count used for the log message only; includes -1 if present
155 |         print("Running differential expression on " + str(n_clusts) + " clusters..")
156 |         clusts = set(clusters) - set([-1])  # -1 is excluded - presumably an "unassigned" label; confirm
157 |         
158 |         # make a design matrix
159 |         design_matrix = pd.DataFrame(
160 |             index=expression.index,
161 |             data=np.array([clusters == i for i in clusts]).T,  # one boolean membership column per cluster
162 |             columns=["C" + str(i) + "C" for i in clusts],
163 |         )
164 |         design_matrix.index = design_matrix.index.astype(str).str.replace("-", ".")  # presumably mirrors R's name mangling - confirm
165 |         design_matrix = design_matrix[design_matrix.sum(1) > 0]  # drop samples that belong to none of the kept clusters
166 |         
167 |         # creating the matrix
168 |         data = expression.T  # genes (rows) x samples (columns)
169 |         data = data[data.columns[clusters != -1].tolist()]  # keep only samples with a cluster label other than -1
170 |         
171 |         # running limmapy
172 |         print("Running limmapy..")
173 |         res = (
174 |             limma.limmapy()
175 |             .lmFit(data, design_matrix)
176 |             .eBayes(trend=False)
177 |             .topTable(number=len(data)) 
178 |             .iloc[:, len(clusts) :]  # skip the first len(clusts) columns - presumably the per-cluster coefficients; confirm
179 |         )
180 |         return res.sort_values(by="F", ascending=False)
181 |     
182 | 
183 |     def __runCPCA(self, centered_ref_input, centered_target_input):
184 |         """
185 |         Perform contrastive PCA: fit a PCA on (target covariance - reference covariance) of the centered datasets
186 | 
187 |         Args:
188 |             centered_ref_input (pd.DataFrame): reference expression matrix where the cluster mean has been subtracted
189 |             centered_target_input (pd.DataFrame): target expression matrix where the cluster mean has been subtracted
190 | 
191 |         Returns:
192 |             (ndarray, ncomponents x ngenes): principal axes in feature space
193 |             (ndarray, ncomponents,): variance explained by each component
194 | 
195 |         """
196 |         target_cov = centered_target_input.cov()  # covariance between columns (genes)
197 |         ref_cov = centered_ref_input.cov()
198 |         if not self.low_mem:
199 |             pca = PCA(self.cpca_ncomp, svd_solver="randomized", copy=False)
200 |         else: 
201 |             pca = IncrementalPCA(self.cpca_ncomp, copy=False, batch_size=1000)  # low-memory path: fit in batches of 1000 rows
202 |         
203 |         pca.fit(target_cov - ref_cov)  # the PCA input is the covariance difference, not the expression data
204 |         return pca.components_, pca.explained_variance_
205 | 
206 | 
207 |     def fit(self, ref_expr):
208 |         """
209 |         Fit the model to the reference expression dataset - cluster + find differentially expressed genes.
210 | 
211 |         Args:
212 |             ref_expr (pd.Dataframe): reference expression matrix of samples (rows) by genes (columns), 
213 |                 where genes are ensembl gene IDs. Data should be log2(X+1) TPM data. 
214 |                 In the standard Celligner pipeline this is the cell line data.
215 | 
216 |         Raises:
217 |             ValueError: if only 1 cluster is found in the PCs of the expression
218 |         """
219 |         
220 |         self.common_genes = list(ref_expr.columns)  # the reference's genes become the gene set checked by __checkExpression
221 |         self.ref_input = self.__checkExpression(ref_expr, is_reference=True)
222 |         
223 |         # Cluster and find differential expression for reference data
224 |         self.ref_clusters = self.__cluster(self.ref_input)
225 |         if len(set(self.ref_clusters)) < 2:
226 |             raise ValueError("Only one cluster found in reference data, no differential expression possible")
227 |         self.ref_de_genes = self.__runDiffExprOnClusters(self.ref_input, self.ref_clusters)
228 | 
229 |         return self  # returns the instance itself
230 | 
231 | 
232 |     def transform(self, target_expr=None, compute_cPCs=True):
233 |         """
234 |         Align samples in the target dataset to samples in the reference dataset
235 | 
236 |         Args:
237 |             target_expr (pd.Dataframe, optional): target expression matrix of samples (rows) by genes (columns), 
238 |                 where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.
239 |                 In the standard Celligner pipeline this the tumor data (TCGA). 
240 |                 Set to None if re-running transform with new reference data.
241 |             compute_cPCs (bool, optional): if True, compute cPCs from the fitted reference and target expression. Defaults to True.
242 | 
243 |         Raises:
244 |             ValueError: if compute_cPCs is True but there is no reference input (fit has not been run)
245 |             ValueError: if compute_cPCs is False but there are no previously computed cPCs available (transform has not been previously run)
246 |             ValueError: if no target expression is provided and there is no previously provided target data
247 |             ValueError: if no target expression is provided and compute_cPCs is true; there is no use case for this
248 |             ValueError: if there are not enough clusters to compute DE genes for the target dataset
249 |         """
250 | 
251 |         if self.ref_input is None and compute_cPCs:
252 |             raise ValueError("Need fitted reference dataset to compute cPCs, run fit function first")
253 | 
254 |         if not compute_cPCs and self.cpca_loadings is None:
255 |             raise ValueError("No cPCs found, transform needs to be run with compute_cPCs==True at least once")
256 | 
257 |         if target_expr is None and self.target_input is None:
258 |             raise ValueError("No previous data found for target, transform needs to be run with target expression at least once")
259 | 
260 |         if not compute_cPCs and target_expr is None:
261 |             raise ValueError("No use case for running transform without new target data when compute_cPCs==True")
262 | 
263 |         if compute_cPCs:
264 |             
265 |             if target_expr is not None:
266 |                 
267 |                 self.target_input = self.__checkExpression(target_expr, is_reference=False)
268 | 
269 |                 # Cluster and find differential expression for target data
270 |                 self.target_clusters = self.__cluster(self.target_input)
271 |                 if len(set(self.target_clusters)) < 2:
272 |                     raise ValueError("Only one cluster found in reference data, no differential expression possible")
273 |                 self.target_de_genes = self.__runDiffExprOnClusters(self.target_input, self.target_clusters)
274 | 
275 |                 # Union of the top 1000 differentially expressed genes in each dataset
276 |                 self.de_genes = pd.Series(list(self.ref_de_genes[:self.topKGenes].index) +
277 |                                           list(self.target_de_genes[:self.topKGenes].index)).drop_duplicates().to_list()
278 | 
279 |             else:
280 |                 print("INFO: No new target expression provided, using previously provided target dataset")
281 | 
282 |             # Subtract cluster average from cluster samples
283 |             centered_ref_input = pd.concat(
284 |                 [
285 |                     self.ref_input.loc[self.ref_clusters == val] - self.ref_input.loc[self.ref_clusters == val].mean(axis=0)
286 |                     for val in set(self.ref_clusters)
287 |                 ]
288 |             ).loc[self.ref_input.index]
289 |             
290 |             centered_target_input = pd.concat(
291 |                 [
292 |                     self.target_input.loc[self.target_clusters == val] - self.target_input.loc[self.target_clusters == val].mean(axis=0)
293 |                     for val in set(self.target_clusters)
294 |                 ]
295 |             ).loc[self.target_input.index]
296 |             
297 |             # Compute contrastive PCs
298 |             print("Running cPCA..")
299 |             self.cpca_loadings, self.cpca_explained_var = self.__runCPCA(centered_ref_input, centered_target_input)
300 | 
301 |             del centered_ref_input, centered_target_input
302 |             gc.collect()
303 | 
304 |             print("Regressing top cPCs out of reference dataset..")
305 |              # Take the residuals of the linear regression of ref_input with the cpca_loadings
306 |             transformed_ref = (self.ref_input - 
307 |                 LinearRegression(fit_intercept=False)
308 |                     .fit(self.cpca_loadings.T, self.ref_input.T)
309 |                     .predict(self.cpca_loadings.T)
310 |                     .T
311 |             )
312 | 
313 |         # Using previously computed cPCs - for multi-dataset alignment
314 |         else:
315 |             
316 |             # Allow some genes to be missing in new target dataset
317 |             missing_genes = list(self.ref_input.loc[:, ~self.ref_input.columns.isin(target_expr.columns)].columns)
318 |             if len(missing_genes) > 0:
319 |                 print('WARNING: %d genes from reference dataset not found in new target dataset, subsetting to overlap' % (len(missing_genes)))
320 |                 # Get index of dropped genes
321 |                 drop_idx = [self.ref_input.columns.get_loc(g) for g in missing_genes]
322 |                 
323 |                 # Filter refence dataset
324 |                 self.ref_input = self.ref_input.loc[:, self.ref_input.columns.isin(target_expr.columns)]
325 |                 self.common_genes = list(self.ref_input.columns)
326 | 
327 |                 # Drop cPCA loadings for genes that were filtered out
328 |                 self.cpca_loadings = np.array([np.delete(self.cpca_loadings[n], drop_idx) for n in range(self.cpca_ncomp)])
329 |                 
330 |                 # Check if genes need to be dropped from DE list
331 |                 overlap = self.ref_input.loc[:, self.ref_input.columns.isin(self.de_genes)]
332 |                 if overlap.shape[1] < len(self.de_genes):
333 |                     print('WARNING: dropped genes include %d differentially expressed genes that may be important' % (len(self.de_genes) - overlap.shape[1]))
334 |                     temp = pd.Series(self.de_genes)
335 |                     self.de_genes = temp[temp.isin(self.ref_input.columns)].to_list()
336 | 
337 |             self.target_input = self.__checkExpression(target_expr, is_reference=False)
338 |             transformed_ref = self.ref_input
339 |         
340 |         # Only need to regress out of target dataset if using previously computed cPCs
341 |         print("Regressing top cPCs out of target dataset..")
342 |         transformed_target = (self.target_input - 
343 |             LinearRegression(fit_intercept=False)
344 |                 .fit(self.cpca_loadings.T, self.target_input.T)
345 |                 .predict(self.cpca_loadings.T)
346 |                 .T
347 |         )
348 | 
349 |         # Do MNN 
350 |         print("Doing the MNN analysis using Marioni et al. method..")
351 |         # Use top DE genes only
352 |         varsubset = np.array([1 if i in self.de_genes else 0 for i in self.target_input.columns]).astype(bool)
353 |         target_corrected, self.mnn_pairs = mnnpy.marioniCorrect(
354 |             transformed_ref,
355 |             transformed_target,
356 |             var_index=list(range(len(self.ref_input.columns))),
357 |             var_subset=varsubset,
358 |             **self.mnn_kwargs,
359 |         )
360 | 
361 |         if compute_cPCs:
362 |             self.combined_output =  pd.concat([target_corrected, transformed_ref])
363 |         else: # Append at the end for multi-dataset alignment case
364 |             self.combined_output =  pd.concat([transformed_ref, target_corrected])
365 |         
366 |         del target_corrected
367 |         gc.collect()
368 | 
369 |         print('Done')
370 | 
371 |         return self
372 | 
373 | 
374 |     def computeMetricsForOutput(self, umap_rand_seed=14, UMAP_only=False, model_ids=None, tumor_ids=None):
375 |         """
376 |         Compute UMAP embedding and optionally clusters and tumor - model distance.
377 |         
378 |         Args:
379 |             UMAP_only (bool, optional): Only recompute the UMAP. Defaults to False.
380 |             umap_rand_seed (int, optional): Set seed for UMAP, to try an alternative. Defaults to 14.
381 |             model_ids (list, optional): model IDs for computing tumor-CL distance. Defaults to None, in which case the reference index is used.
382 |             tumor_ids (list, optional): tumor IDs for computing tumor-CL distance. Defaults to None, in which case the target index is used.
383 |         
384 |         Raises:
385 |             ValueError: if there is no corrected expression matrix
386 |         """
387 |         if self.combined_output is None:
388 |             raise ValueError("No corrected expression matrix found, run this function after transform()")
389 | 
390 |         print("Computing UMAP embedding...")
391 |         # Compute UMAP embedding for results
392 |         pca = PCA(self.pca_ncomp)
393 |         pcs = pca.fit_transform(self.combined_output)
394 |         
395 |         umap_reduced = umap.UMAP(**self.umap_kwargs, random_state=umap_rand_seed).fit_transform(pcs)
396 |         self.umap_reduced = pd.DataFrame(umap_reduced, index=self.combined_output.index, columns=['umap1','umap2'])
397 | 
398 |         if not UMAP_only:
399 |             
400 |             print('Computing clusters..')
401 |             self.output_clusters = self.__cluster(self.combined_output)
402 | 
403 |             print("Computing tumor-CL distance..")
404 |             pcs = pd.DataFrame(pcs, index=self.combined_output.index)
405 |             if model_ids is None: model_ids = self.ref_input.index
406 |             if tumor_ids is None: tumor_ids = self.target_input.index
407 |             model_pcs = pcs[pcs.index.isin(model_ids)]
408 |             tumor_pcs = pcs[pcs.index.isin(tumor_ids)]
409 |             
410 |             self.tumor_CL_dist = pd.DataFrame(metrics.pairwise_distances(tumor_pcs, model_pcs), index=tumor_pcs.index, columns=model_pcs.index)
411 |         
412 |         return self
413 | 
414 | 
415 |     def makeNewReference(self):
416 |         """
417 |         Make a new reference dataset from the previously transformed reference+target datasets. 
418 |         Used for multi-dataset alignment with previously computed cPCs and DE genes.
419 |         
420 |         """
421 |         self.ref_input = self.combined_output
422 |         self.target_input = None
423 |         return self
424 |     
425 |     
426 |     def save(self, file_name):
427 |         """
428 |         Save the model as a pickle file
429 | 
430 |         Args:
431 |             file_name (str): name of file in which to save the model
432 |         """
433 |         # save the model
434 |         with open(os.path.normpath(file_name), "wb") as f:
435 |             pickle.dump(self, f)
436 | 
437 | 
438 |     def load(self, file_name):
439 |         """
440 |         Load the model from a pickle file
441 | 
442 |         Args:
443 |             file_name (str): pickle file to load the model from
444 |         """
445 |         with open(os.path.normpath(file_name), "rb") as f:
446 |             model = pickle.load(f)
447 |             self.__dict__.update(model.__dict__)
448 |         return self


--------------------------------------------------------------------------------
/celligner/limma.py:
--------------------------------------------------------------------------------
 1 | ###########################################################
 2 | #
 3 | # limmapy
 4 | #
 5 | ##################################################################
 6 | 
 7 | from __future__ import print_function
 8 | import numpy as np
 9 | import rpy2.robjects as robjects
10 | from rpy2.robjects import pandas2ri
11 | pandas2ri.activate()
12 | import rpy2
13 | from rpy2.robjects.packages import importr
14 | limma = importr('limma')
15 | from rpy2.robjects.conversion import localconverter
16 | import rpy2.robjects as ro
17 | import sys
18 | 
19 | to_dataframe = robjects.r('function(x) data.frame(x)')
20 | 
21 | 
class limmapy:
    '''
    Thin wrapper that drives the R limma package through rpy2.

    Typical use is a chain: limmapy().lmFit(counts, design).eBayes().topTable(...)

    input:
    count_matrix: a pandas dataframe with one column of counts per sample, and an id column for gene id
        example:
        id    sampleA    sampleB
        geneA    5         1
        geneB    4         5
        geneC    1         2
    design_matrix: a design matrix as a pandas dataframe (see the limma manual), with sample names as row names
                            treatment1, treatment2, ...
    sampleA        A          A          B
    sampleA        A          A          B
    sampleB        B          B          A
    sampleB        B          A          B

    Both dataframes are cast to int before being handed to R.
    '''

    def __init__(self):
        # Set here but not read or written by any other method of this class
        self.limma_result = None

    def lmFit(self, count_matrix, design_matrix, **kwargs):
        '''
        Fit the limma linear model and keep the fit on self.fit.

        Extra keyword arguments are forwarded to limma's lmFit. Returns self so calls can be chained.
        '''
        # Convert both pandas inputs to R objects under the pandas-aware converter
        with localconverter(ro.default_converter + pandas2ri.converter):
            r_counts = pandas2ri.py2rpy(count_matrix.astype(int))
            r_design = pandas2ri.py2rpy(design_matrix.astype(int))
        self.fit = limma.lmFit(r_counts, r_design, **kwargs)
        return self

    def eBayes(self, **kwargs):
        '''
        Run limma's eBayes on the stored fit (lmFit must have been called first) and store the result.

        Extra keyword arguments are forwarded to limma's eBayes. Returns self so calls can be chained.
        '''
        moderated = limma.eBayes(self.fit, **kwargs)
        self.fit = moderated
        return self

    def topTable(self, **kwargs):
        '''
        Call limma's topTable on the stored fit.

        An R data.frame result is converted to its pandas equivalent; any other result is returned as is.
        '''
        table = limma.topTable(self.fit, **kwargs)
        if type(table) != robjects.vectors.DataFrame:
            return table
        with localconverter(ro.default_converter + pandas2ri.converter):
            return ro.conversion.rpy2py(table)
61 | 


--------------------------------------------------------------------------------
/celligner/params.py:
--------------------------------------------------------------------------------
  1 | # Oncotree tissue colors
  2 | TISSUE_COLOR_OT = {
  3 |     "Adrenal Gland": "#E13978",
  4 |     "Ampulla of Vater": "#F5899E",
  5 |     "Biliary Tract": "#C091E3",
  6 |     "Bladder/Urinary Tract":"#E08571",
  7 |     "Bone": "#9F55BB",
  8 |     "Breast":"#45A132",
  9 |     "Bowel":"#96568E",
 10 |     "CNS/Brain": "#F5899E",
 11 |     "Cervix":"#5AB172",
 12 |     "Esophagus/Stomach": "#DFBC3A",
 13 |     "Eye": "#349077",
 14 |     "Fibroblast": "#D8AB6A",
 15 |     "Embryonal":"#75DFBB",
 16 |     "Head and Neck": "#5DA134",
 17 |     "Kidney": "#1F8FFF",
 18 |     "Liver": "#9C5E2B",
 19 |     "Lung": "#51D5E0",
 20 |     "Lymphoid": "#ABD23F",
 21 |     "Myeloid": "#DA45BB",
 22 |     "Normal":"#555555",
 23 |     "Ovary/Fallopian Tube": "#56E79D",
 24 |     "Pancreas": "#B644DC",
 25 |     "Peripheral Nervous System": "#73E03D",
 26 |     "Pleura": "#F5899E", ###
 27 |     "Prostate": "#3870C9",
 28 |     "Skin": "#6C55E2",
 29 |     "Soft Tissue": "#5FDB69",
 30 |     "Testis": "#F5899E", ###
 31 |     "Thymus": "#659FD9", 
 32 |     "Thyroid": "#D74829",
 33 |     "Other/Unknown": "#bdbdbd",
 34 |     "Uterus": "#E491C1",
 35 |     "Vulva/Vagina":"#E491C1"
 36 | }
 37 | 
# Hex color per tissue / lineage label (lowercase snake_case keys).
# Colors are not unique: e.g. "bile_duct" and "engineered_bile_duct" share one,
# as do "kidney" and "engineered_kidney", and "pancreas" and "stomach".
TISSUE_COLOR = {
    "engineered": "#bcdfbd",
    "fibroblast": "#9eAeAe",
    "other": "#A3969d",
    "skin": "#969696",
    "soft_tissue": "#cedb9c",
    "sarcomatoid": "#cdcdbd",
    "unknown": "#bdbdbd",
    "NS": "#becdbd",
    "teratoma": "#252525",
    "germ_cell": "#c7c7c7",
    "embryo": "#7f7f7f",
    "bone": "#aec7e8",
    "lymphocyte": "#17becf",
    "plasma_cell": "#9edae5",
    "blood": "#1f77b4",
    "engineered_blood": "#2f87b4",
    "central_nervous_system": "#ff7f0e",
    "engineered_central_nervous_system": "#ff8f3f",
    "peripheral_nervous_system": "#ffbb78",
    "nerve": "#dbdb8d",
    "autonomic_ganglia": "#ebcb8d",
    "eye": "#bcbd22",
    "lung": "#d62728",
    "engineered_lung": "#ee2e3e",
    "upper_aerodigestive": "#ff9896",
    "esophagus": "#e7969c",
    "nasopharynx": "#f7b6d2",
    "oral": "#feceee",
    "parotid": "#fdbf6f",
    "stomach": "#e377c2",
    "gall_bladder": "#ff7f0e",
    "bile_duct": "#a55194",
    "engineered_bile_duct": "#a55194",
    "ampulla_of_vater": "#ad3184",
    "pancreas": "#e377c2",
    "liver": "#9467bd",
    "gastric": "#c49c94",
    "small_intestine": "#9e5e6e",
    "colon": "#8c564b",
    "ovary": "#2ca02c",
    "engineered_ovary": "#4eae4e",
    "uterus": "#98df8a",
    "cervix": "#5ab172",
    "breast": "#393b79",
    "engineered_breast": "#4e3e7e",
    "kidney": "#386cb0",
    "engineered_kidney": "#386cb0",
    "bladder": "#397cb9",
    "urinary_tract": "#b644dc",
    "prostate": "#637939",
    "engineered_prostate": "#6e7e3e",
    "testis": "#8c6d31",
    "thyroid": "#8f7e3e",
    "endocrine": "#bd9e39",
    "biliary_tract": "#e7ba52",
    "adrenal": "#8ca252",
    "thymus": "#659fd9"
}
 97 | 
# Second tissue palette keyed by lowercase snake_case tissue label.
# Every "engineered_*" key here reuses the color of its base tissue, and several values
# equal the TISSUE_COLOR_OT colors in lowercase (e.g. "bone", "lung", "breast").
# NOTE(review): the _R suffix presumably refers to the palette of the R implementation - confirm.
TISSUE_COLOR_R = {
    "central_nervous_system": "#f5899e",
    "engineered_central_nervous_system": "#f5899e",
    "teratoma": "#f5899e",
    "bone": "#9f55bb",
    "pancreas": "#b644dc",
    "soft_tissue": "#5fdb69",
    "skin": "#6c55e2",
    "liver": "#9c5e2b",
    "blood": "#da45bb",
    "lymphocyte": "#abd23f",
    "peripheral_nervous_system": "#73e03d",
    "ovary": "#56e79d",
    "engineered_ovary": "#56e79d",
    "adrenal": "#e13978",
    "adrenal_cortex": "#e13978",
    "upper_aerodigestive": "#5da134",
    "kidney": "#1f8fff",
    "engineered_kidney": "#1f8fff",
    "gastric": "#dfbc3a",
    "eye": "#349077",
    "nasopharynx": "#a9e082",
    "nerve": "#c44c90",
    "unknown": "#999999",
    "cervix": "#5ab172",
    "thyroid": "#d74829",
    "lung": "#51d5e0",
    "engineered_lung": "#51d5e0",
    "rhabdoid": "#d04850",
    "germ_cell": "#75dfbb",
    "embryo": "#75dfbb",
    "colorectal": "#96568e",
    "endocrine": "#d1d684",
    "bile_duct": "#c091e3",
    "pineal": "#949031",
    "thymus": "#659fd9",
    "mesothelioma": "#dc882d",
    "prostate": "#3870c9",
    "engineered_prostate": "#3870c9",
    "uterus": "#e491c1",
    "breast": "#45a132",
    "engineered_breast": "#45a132",
    "urinary_tract": "#e08571",
    "esophagus": "#6a6c2c",
    "fibroblast": "#d8ab6a",
    "plasma_cell": "#e6c241",
}
145 | 
146 | 
147 | 
# Disabled setting, kept for reference:
# mnn_ndist = 3  # ndist parameter used for MNN

# Genes whose differential-expression rank is better than this in either the cell line
# or the tumor data are used to identify mutual nearest neighbors in the MNN alignment step
TOP_K_GENES = 1000

# Number of PCs to use for dimensionality reduction
PCA_NCOMP = 70

# Number of cPCA dimensions to regress out of the data
CPCA_NCOMP = 4

# @see https://scanpy.readthedocs.io/en/latest/generated/scanpy.tl.louvain.html
LOUVAIN_PARAMS = {
    # resolution parameter used for clustering the data
    "resolution": 5,
}

# For the Marioni method (default)
MNN_PARAMS = {
    # number of nearest neighbors of tumors in the cell line data
    "k1": 5,
    # number of nearest neighbors of cell lines in the tumor data
    "k2": 50,
    "cosine_norm": False,
    "fk": 5,
}

UMAP_PARAMS = {
    # num nearest neighbors used to create UMAP plot
    "n_neighbors": 10,
    "n_components": 2,
    # distance metric used for the UMAP projection
    "metric": "euclidean",
    # min distance used to create UMAP plot
    "min_dist": 0.5,
}
179 | 


--------------------------------------------------------------------------------
/docs/Screenshot 2021-10-29 at 10.51.53.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/Screenshot 2021-10-29 at 10.51.53.png


--------------------------------------------------------------------------------
/docs/Screenshot 2021-10-29 at 10.53.01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/Screenshot 2021-10-29 at 10.53.01.png


--------------------------------------------------------------------------------
/docs/celligner.md:
--------------------------------------------------------------------------------
1 | # Reference
2 | 
3 | ::: celligner
4 |   selection:
5 |     docstring_style: google
6 |   rendering:
7 |     show_source: true


--------------------------------------------------------------------------------
/docs/celligner_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/celligner_diagram.png


--------------------------------------------------------------------------------
/docs/celligner_public22q2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/celligner_public22q2.png


--------------------------------------------------------------------------------
/docs/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/example.pdf


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Welcome to MkDocs
 2 | 
 3 | For full documentation visit [mkdocs.org](https://www.mkdocs.org).
 4 | 
 5 | ## Commands
 6 | 
 7 | * `mkdocs new [dir-name]` - Create a new project.
 8 | * `mkdocs serve` - Start the live-reloading docs server.
 9 | * `mkdocs build` - Build the documentation site.
10 | * `mkdocs -h` - Print help message and exit.
11 | 
12 | ## Project layout
13 | 
14 |     mkdocs.yml    # The configuration file.
15 |     docs/
16 |         index.md  # The documentation homepage.
17 |         ...       # Other markdown pages, images and other files.
18 | 


--------------------------------------------------------------------------------
/docs/typical_celligner.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/typical_celligner.webp


--------------------------------------------------------------------------------
/install_submodules_and_run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #install submodules
 4 | echo "INSTALLING SUBMODULES..."
 5 | 
 6 | #upgrade pip
 7 | pip install --upgrade pip
 8 | 
 9 | #setup other dependencies
10 | pip install taigapy
11 | cd mnnpy; pip install .; cd ..
12 | 
13 | #run QC
14 | echo "RUNNING CELLIGNER..."
15 | python "$@"


--------------------------------------------------------------------------------
/man/calc_gene_stats.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{calc_gene_stats}
 4 | \alias{calc_gene_stats}
 5 | \title{Method to calculate gene average expression and variance for an expression matrix}
 6 | \usage{
 7 | calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)
 8 | }
 9 | \arguments{
10 | \item{dat:}{data object containing tumor and cell line expression data and annotations produced by running load_data}
11 | 
12 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
13 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations}
14 | 
15 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
16 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations}
17 | 
18 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL}
19 | 
20 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder}
21 | }
22 | \value{
23 | gene stats matrix
24 | }
25 | \description{
26 | calculate the average gene expression and variance
27 | }
28 | 


--------------------------------------------------------------------------------
/man/calc_tumor_CL_cor.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{calc_tumor_CL_cor}
 4 | \alias{calc_tumor_CL_cor}
 5 | \title{Method to calculate the correlation between cell lines and tumor in the Celligner aligned data}
 6 | \usage{
 7 | calc_tumor_CL_cor(Celligner_aligned_data, Celligner_info)
 8 | }
 9 | \arguments{
10 | \item{Celligner_aligned_data:}{Celligner aligned data matrix of samples (cells line and tumors) by genes}
11 | 
12 | \item{Celligner_info:}{annotation file of cell line and tumor samples with a column 'type' marking samples as either
13 | cell lines or tumors and a column 'sampleID' that matches the row names of Celligner_aligned_data}
14 | }
15 | \value{
16 | matrix of correlations that is tumors by cell lines
17 | }
18 | \description{
19 | calculate the correlation between cell line and tumor samples in the Celligner aligned data
20 | }
21 | 


--------------------------------------------------------------------------------
/man/check_NAs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{check_NAs}
 4 | \alias{check_NAs}
 5 | \title{check for NAs in the expression data and remove samples with NAs}
 6 | \usage{
 7 | check_NAs(mat)
 8 | }
 9 | \arguments{
10 | \item{mat:}{matrix of gene expression data that is samples by genes}
11 | }
12 | \value{
13 | matrix of gene expression data, removing samples that have NAs
14 | }
15 | \description{
16 | check for NAs in the expression data and remove samples with NAs
17 | }
18 | 


--------------------------------------------------------------------------------
/man/cluster_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{cluster_data}
 4 | \alias{cluster_data}
 5 | \title{Method to take in a Seurat object and run default Seurat clustering algorithm}
 6 | \usage{
 7 | cluster_data(seu_obj)
 8 | }
 9 | \arguments{
10 | \item{seu_obj:}{seurat object containing expression data and sample annotations.
11 | Expects PCA for the seurat object has already been calculated.}
12 | }
13 | \value{
14 | Seurat object with cluster annotations
15 | }
16 | \description{
17 | cluster data in seurat object, using default Seurat clustering method. Clusters data
18 | within PCA space using the number of dimensions provided in celligner_global$n_PC_dims (default is 70)
19 | }
20 | 


--------------------------------------------------------------------------------
/man/create_Seurat_object.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{create_Seurat_object}
 4 | \alias{create_Seurat_object}
 5 | \title{Method to create seurat objects given an expression matrix and annotation table}
 6 | \usage{
 7 | create_Seurat_object(exp_mat, ann, type = NULL)
 8 | }
 9 | \arguments{
10 | \item{exp_mat:}{matrix of samples by genes, where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.}
11 | 
12 | \item{ann:}{matrix of sample annotations. Expects column 'sampleID' which matches the rownames of exp_mat.}
13 | 
14 | \item{type:}{optional parameter, string specifying the data type of the current data (ex. 'tumor'), which is added to the annotation matrix.}
15 | }
16 | \value{
17 | Seurat object with scaled expression data and annotations stored in meta.data
18 | }
19 | \description{
20 | create Seurat object of expression data and annotations and run dimensionality reduction.
21 | Dimensionality reductions will be run with the parameters (n_PC_dims, umap_n_neighbors, umap_min_dist, distance_metric) specified in celligner_global.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/dot-average_correction.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{.average_correction}
 4 | \alias{.average_correction}
 5 | \title{calculate the average correction vector}
 6 | \usage{
 7 | .average_correction(refdata, mnn1, curdata, mnn2)
 8 | }
 9 | \arguments{
10 | \item{refdata:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
11 | In the standard Celligner pipeline this is the cell line data.}
12 | 
13 | \item{mnn1:}{mnn1 pairs}
14 | 
15 | \item{curdata:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
16 | In the standard Celligner pipeline this is the tumor data.}
17 | 
18 | \item{mnn2:}{mnn2 pairs}
19 | }
20 | \value{
21 | correction vector and pairs
22 | }
23 | \description{
24 | Computes correction vectors for each MNN pair, and then averages them for each MNN-involved cell in the second batch.
25 | Copied from dev version of scran (2018-10-28), with slight modifications as noted https://github.com/MarioniLab/scran
26 | }
27 | 


--------------------------------------------------------------------------------
/man/dot-center_along_batch_vector.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{.center_along_batch_vector}
 4 | \alias{.center_along_batch_vector}
 5 | \title{centers samples within each batch}
 6 | \usage{
 7 | .center_along_batch_vector(mat, batch.vec)
 8 | }
 9 | \arguments{
10 | \item{mat:}{matrix of samples by genes}
11 | 
12 | \item{batch.vec:}{batch vector}
13 | }
14 | \value{
15 | correction vector and pairs
16 | }
17 | \description{
18 | Projecting along the batch vector, and shifting all samples to the center within each batch.
19 | This removes any variation along the overall batch vector within each matrix.
20 | }
21 | 


--------------------------------------------------------------------------------
/man/dot-compute_tricube_average.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{.compute_tricube_average}
 4 | \alias{.compute_tricube_average}
 5 | \title{compute tricube averages}
 6 | \usage{
 7 | .compute_tricube_average(vals, indices, distances, bandwidth = NULL, ndist = 3)
 8 | }
 9 | \arguments{
10 | \item{values:}{correction vector}
11 | 
12 | \item{indices:}{nxk matrix for the nearest neighbor indices}
13 | 
14 | \item{distances:}{nxk matrix for the nearest neighbor Euclidean distances}
15 | 
16 | \item{bandwidth:}{Is set at 'ndist' times the median distance, if not specified.}
17 | 
18 | \item{ndist:}{By default is 3.}
19 | }
20 | \description{
21 | Centralized function to compute tricube averages.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/dot-tricube_weighted_correction.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{.tricube_weighted_correction}
 4 | \alias{.tricube_weighted_correction}
 5 | \title{tricube-weighted correction}
 6 | \usage{
 7 | .tricube_weighted_correction(
 8 |   curdata,
 9 |   correction,
10 |   in.mnn,
11 |   k = 20,
12 |   ndist = 3,
13 |   subset_genes,
14 |   BNPARAM = NULL,
15 |   BPPARAM = BiocParallel::SerialParam()
16 | )
17 | }
18 | \arguments{
19 | \item{curdata:}{target matrix of samples by genes}
20 | 
21 | \item{correction:}{corrected vector}
22 | 
23 | \item{in.mnn:}{mnn pairs}
24 | 
25 | \item{k:}{k values, default 20}
26 | 
27 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
28 | By default is 3.}
29 | 
30 | \item{subset_genes:}{genes used to identify mutual nearest neighbors}
31 | 
32 | \item{BNPARAM:}{default NULL}
33 | 
34 | \item{BPPARAM:}{default BiocParallel::SerialParam()}
35 | }
36 | \value{
37 | MNN corrected data
38 | }
39 | \description{
40 | Computing tricube-weighted correction vectors for individual samples,
41 | using the nearest neighbouring samples involved in MNN pairs.
42 | Modified to use FNN rather than queryKNN for nearest neighbor finding
43 | }
44 | 


--------------------------------------------------------------------------------
/man/find_differentially_expressed_genes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{find_differentially_expressed_genes}
 4 | \alias{find_differentially_expressed_genes}
 5 | \title{Method to find genes that are differentially expressed between clusters within the expression data}
 6 | \usage{
 7 | find_differentially_expressed_genes(seu_obj)
 8 | }
 9 | \arguments{
10 | \item{seu_obj:}{seurat object containing expression data and sample annotations. Expects data in the Seurat object
11 | slot scale.data and a column 'seurat_clusters' within the meta.data of the Seurat object.}
12 | }
13 | \value{
14 | table with gene level stats
15 | }
16 | \description{
17 | find genes that are differentially expressed between clusters within the expression data
18 | }
19 | 


--------------------------------------------------------------------------------
/man/get_cluster_averages.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{get_cluster_averages}
 4 | \alias{get_cluster_averages}
 5 | \title{calculate the average expression per cluster}
 6 | \usage{
 7 | get_cluster_averages(mat, cluster_df)
 8 | }
 9 | \arguments{
10 | \item{mat:}{sample by genes matrix of expression data}
11 | 
12 | \item{cluster_df:}{table of sample metadata that includes a column 'seurat_clusters',
13 | containing transcriptional clusters}
14 | }
15 | \value{
16 | average cluster expression
17 | }
18 | \description{
19 | calculate the average expression per cluster
20 | }
21 | 


--------------------------------------------------------------------------------
/man/load_additional_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/mutlidataset_alignment.R
 3 | \name{load_additional_data}
 4 | \alias{load_additional_data}
 5 | \title{Load additional expression and annotation data}
 6 | \usage{
 7 | load_additional_data(
 8 |   data_name,
 9 |   data_file,
10 |   data_version = NULL,
11 |   data_taiga = TRUE,
12 |   ann_name,
13 |   ann_file,
14 |   ann_version = NULL,
15 |   ann_taiga = TRUE,
16 |   data_type = ""
17 | )
18 | }
19 | \arguments{
20 | \item{data_name:}{if data_taiga = TRUE, then the data.name of the taiga file containing the expression data,
21 | if data_taiga=FALSE, then the file path to the local folder containing the expression data. Assumes that genes
22 | are labeled using ensembl IDs and that there are fewer samples than genes in the matrix, will transpose the matrix
23 | so that rows are samples and columns are genes.}
24 | 
25 | \item{data_file:}{if data_taiga = TRUE, then the data.file of the taiga file containing the expression data,
26 | if data_taiga = FALSE, then the name of the file of expression data}
27 | 
28 | \item{data_version:}{(optional) parameter to specify the version to pull from taiga for the expression data, default set to NULL}
29 | 
30 | \item{data_taiga:}{if TRUE then pulls the expression data from taiga, if FALSE then finds expression data in local folder}
31 | 
32 | \item{ann_name:}{if ann_taiga = TRUE, then the data.name of the taiga file containing the data annotations,
33 | if ann_taiga=FALSE, then the file path to the local folder containing the annotations}
34 | 
35 | \item{ann_file:}{if ann_taiga = TRUE, then the data.file of the taiga file containing the data annotations,
36 | if ann_taiga=FALSE, then the name of the file of data annotations}
37 | 
38 | \item{ann_version:}{(optional) parameter to specify the version to pull from taiga for the annotations, default set to NULL}
39 | 
40 | \item{ann_taiga:}{if TRUE (default) then pulls the annotations from taiga, if FALSE then finds cell line annotations in local folder}
41 | 
42 | \item{data_type:}{string added to the annotation file under the column type to specify the data, default is ""}
43 | }
44 | \value{
45 | object containing expression matrix and annotations table
46 | }
47 | \description{
48 | load additional expression and annotation files
49 | }
50 | 


--------------------------------------------------------------------------------
/man/load_data.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/Celligner_methods.R
  3 | \name{load_data}
  4 | \alias{load_data}
  5 | \title{method to load in tumor and cell line expression data and annotations}
  6 | \usage{
  7 | load_data(
  8 |   cell_line_data_name,
  9 |   cell_line_data_file,
 10 |   cell_line_version,
 11 |   cell_line_taiga,
 12 |   cell_line_ann_name,
 13 |   cell_line_ann_file,
 14 |   cell_line_ann_version,
 15 |   cell_line_ann_taiga,
 16 |   tumor_data_name,
 17 |   tumor_data_file,
 18 |   tumor_version,
 19 |   tumor_taiga,
 20 |   tumor_ann_name,
 21 |   tumor_ann_file,
 22 |   tumor_ann_version,
 23 |   tumor_ann_taiga,
 24 |   additional_annotations_name,
 25 |   additional_annotations_file,
 26 |   additional_annotations_version,
 27 |   additional_annotations_taiga,
 28 |   hgnc_data_name,
 29 |   hgnc_data_file,
 30 |   hgnc_version,
 31 |   hgnc_taiga
 32 | )
 33 | }
 34 | \arguments{
 35 | \item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
 36 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data}
 37 | 
 38 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
 39 | if cell_line_taiga=FALSE, then the name of the file of cell line expression data}
 40 | 
 41 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL}
 42 | 
 43 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder}
 44 | 
 45 | \item{cell_line_ann_name:}{if cell_line_ann_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
 46 | if cell_line_ann_taiga=FALSE, then the file path to the local folder containing the cell line annotations}
 47 | 
 48 | \item{cell_line_ann_file:}{if cell_line_ann_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
 49 | if cell_line_ann_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
 50 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.}
 51 | 
 52 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL}
 53 | 
 54 | \item{cell_line_ann_taiga:}{if TRUE then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder}
 55 | 
 56 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
 57 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.
 58 | If pulling from taiga, assumes that the file is the already created Celligner info file used in the Celligner manuscript,
 59 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.}
 60 | 
 61 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
 62 | if tumor_taiga=FALSE, then the name of the file of the tumor expression data}
 63 | 
 64 | \item{tumor_taiga:}{if TRUE then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder}
 65 | 
 66 | \item{tumor_ann_name:}{if tumor_ann_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
 67 | if tumor_ann_taiga=FALSE, then the file path to the local folder containing the tumor annotations}
 68 | 
 69 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
 70 | if tumor_ann_taiga=FALSE, then the name of the file of the tumor annotations}
 71 | 
 72 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL}
 73 | 
 74 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder}
 75 | 
 76 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
 77 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
 78 | for the cell lines. If null, assumes there are no additional annotations.}
 79 | 
 80 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
 81 | if additional_annotations_taiga=FALSE, then the name of the file of the additional annotations. If null, assumes there are
 82 | no additional annotations.}
 83 | 
 84 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL}
 85 | 
 86 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder}
 87 | 
 88 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
 89 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations}
 90 | 
 91 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
 92 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations}
 93 | 
 94 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL}
 95 | 
 96 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder}
 97 | }
 98 | \value{
 99 | dat object with cell line and tumor expression data and annotations
100 | }
101 | \description{
102 | load expression and annotation files for cell lines and tumors
103 | }
104 | 


--------------------------------------------------------------------------------
/man/modified_mnnCorrect.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{modified_mnnCorrect}
 4 | \alias{modified_mnnCorrect}
 5 | \title{MNN}
 6 | \usage{
 7 | modified_mnnCorrect(
 8 |   ref_mat,
 9 |   targ_mat,
10 |   k1 = 20,
11 |   k2 = 20,
12 |   ndist = 3,
13 |   subset_genes = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{ref_mat:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
18 | In the standard Celligner pipeline this is the cell line data.}
19 | 
20 | \item{targ_mat:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
21 | In the standard Celligner pipeline this is the tumor data.}
22 | 
23 | \item{k1:}{the number of neighbors within the data being corrected (in standard pipeline the tumor data). By default this is 20.}
24 | 
25 | \item{k2:}{the number of neighbors within the reference data (in standard pipeline the cell line data). By default this is 20.}
26 | 
27 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
28 | By default is 3.}
29 | 
30 | \item{subset_genes:}{the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially
31 | expressed genes is usually passed here. By default is NULL, meaning all genes are used}
32 | }
33 | \value{
34 | MNN object, containing the targ_mat corrected data and the mutual nearest neighbor pairs.
35 | }
36 | \description{
37 | Mutual nearest neighbors correction. Modification of the scran::fastMNN (https://github.com/MarioniLab/scran).
38 | Allows for separate k values per dataset, and simplifies some of the IO and doesn't use PCA reduction
39 | }
40 | 


--------------------------------------------------------------------------------
/man/run_Celligner.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/Celligner_methods.R
  3 | \name{run_Celligner}
  4 | \alias{run_Celligner}
  5 | \title{All methods to run Celligner and save the output, if desired}
  6 | \usage{
  7 | run_Celligner(
  8 |   cell_line_data_name = "public-20q4-a4b3",
  9 |   cell_line_data_file = "CCLE_expression_full",
 10 |   cell_line_version = NULL,
 11 |   cell_line_taiga = TRUE,
 12 |   cell_line_ann_name = "arxspan-cell-line-export-f808",
 13 |   cell_line_ann_file = "ACH",
 14 |   cell_line_ann_version = NULL,
 15 |   cell_line_ann_taiga = TRUE,
 16 |   tumor_data_name = "celligner-input-9827",
 17 |   tumor_data_file = "tumor_expression",
 18 |   tumor_version = NULL,
 19 |   tumor_taiga = TRUE,
 20 |   tumor_ann_name = "celligner-input-9827",
 21 |   tumor_ann_file = "tumor_annotations",
 22 |   tumor_ann_version = NULL,
 23 |   tumor_ann_taiga = TRUE,
 24 |   additional_annotations_name = "celligner-input-9827",
 25 |   additional_annotations_file = "CCLE_annotations",
 26 |   additional_annotations_version = NULL,
 27 |   additional_annotations_taiga = TRUE,
 28 |   hgnc_data_name = "hgnc-87ab",
 29 |   hgnc_data_file = "hgnc_complete_set",
 30 |   hgnc_version = NULL,
 31 |   hgnc_taiga = TRUE,
 32 |   save_output = NULL
 33 | )
 34 | }
 35 | \arguments{
 36 | \item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
 37 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data}
 38 | 
 39 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
 40 | if cell_line_taiga=FALSE, then the name of the file of cell line expression data}
 41 | 
 42 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL}
 43 | 
 44 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder}
 45 | 
 46 | \item{cell_line_ann_name:}{if cell_line_ann_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
 47 | if cell_line_ann_taiga=FALSE, then the file path to the local folder containing the cell line annotations}
 48 | 
 49 | \item{cell_line_ann_file:}{if cell_line_ann_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
 50 | if cell_line_ann_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
 51 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.}
 52 | 
 53 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL}
 54 | 
 55 | \item{cell_line_ann_taiga:}{if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder}
 56 | 
 57 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
 58 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.}
 59 | 
 60 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
 61 | if tumor_taiga=FALSE, then the name of the file of the tumor expression data}
 62 | 
 63 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL}
 64 | 
 65 | \item{tumor_taiga:}{if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder}
 66 | 
 67 | \item{tumor_ann_name:}{if tumor_ann_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
 68 | if tumor_ann_taiga=FALSE, then the file path to the local folder containing the tumor annotations}
 69 | 
 70 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
 71 | if tumor_ann_taiga=FALSE, then the name of the file of the tumor annotations. If pulling from taiga, assumes that the file is the already created Celligner info file used in the Celligner manuscript,
 72 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.}
 73 | 
 74 | \item{tumor_ann_version:}{parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL}
 75 | 
 76 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder}
 77 | 
 78 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
 79 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
 80 | for the cell lines. If null, assumes there are no additional annotations.}
 81 | 
 82 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
 83 | if additional_annotations_taiga=FALSE, then the name of the file of the additional annotations. If null, assumes there are
 84 | no additional annotations.}
 85 | 
 86 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL}
 87 | 
 88 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder}
 89 | 
 90 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
 91 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations}
 92 | 
 93 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
 94 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations}
 95 | 
 96 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL}
 97 | 
 98 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder}
 99 | 
100 | \item{save_output:}{by default is NULL and won't save output, to save output pass in a filepath of where to save the output}
101 | }
102 | \value{
103 | seurat object of the Celligner-aligned data
104 | }
105 | \description{
106 | run all parts of the Celligner pipeline
107 | }
108 | 


--------------------------------------------------------------------------------
/man/run_MNN.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{run_MNN}
 4 | \alias{run_MNN}
 5 | \title{Method to run mutual nearest neighbors batch correction}
 6 | \usage{
 7 | run_MNN(
 8 |   CCLE_cor,
 9 |   TCGA_cor,
10 |   k1 = celligner_global$mnn_k_tumor,
11 |   k2 = celligner_global$mnn_k_CL,
12 |   ndist = celligner_global$mnn_ndist,
13 |   subset_genes
14 | )
15 | }
16 | \arguments{
17 | \item{CCLE_cor:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
18 | In the default Celligner pipeline this is the cell line data.}
19 | 
20 | \item{TCGA_cor:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
21 | In the default Celligner pipeline this is the tumor data.}
22 | 
23 | \item{k1:}{the number of neighbors within the data being corrected (by default the tumor data). By default this
24 | pulls from the celligner_global parameter mnn_k_tumor, which by default is 50.}
25 | 
26 | \item{k2:}{the number of neighbors within the reference data (by default the cell line data). By default this
27 | pulls from the celligner_global parameter mnn_k_CL, which by default is 5.}
28 | 
29 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
30 | By default this pulls from the celligner_global parameter mnn_ndist, which by default is 3.}
31 | 
32 | \item{subset_genes:}{the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially
33 | expressed genes is usually passed here.}
34 | }
35 | \value{
36 | mutual nearest neighbors object with corrected data for the second dataset provided as input and the mutual nearest neighbors
37 | }
38 | \description{
39 | run MNN batch correction to align data to a reference dataset
40 | }
41 | 


--------------------------------------------------------------------------------
/man/run_cPCA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_methods.R
 3 | \name{run_cPCA}
 4 | \alias{run_cPCA}
 5 | \title{Method to run contrastive principal components analysis}
 6 | \usage{
 7 | run_cPCA(TCGA_obj, CCLE_obj, pc_dims = NULL)
 8 | }
 9 | \arguments{
10 | \item{TCGA_obj:}{seurat object containing expression data and sample annotations, usually the tumor data}
11 | 
12 | \item{CCLE_obj:}{seurat object containing expression data and sample annotations, usually the cell line data}
13 | 
14 | \item{pc_dims:}{the number of cPCs calculated. If set to null then all cPCs will be calculated (this is quite slow), but if set to
15 | some value >=4 then an approximate cPCA will be calculated, which just calculates the input number of contrastive principal components,
16 | which is quicker.}
17 | }
18 | \value{
19 | object containing cPC vectors and values
20 | }
21 | \description{
22 | run contrastive principal components analysis.
23 | Set pc_dims to a value >= 4 to run fast cPCA by just calculating the top contrastive principal components
24 | }
25 | 


--------------------------------------------------------------------------------
/man/run_cPCA_analysis.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{run_cPCA_analysis}
 4 | \alias{run_cPCA_analysis}
 5 | \title{cPCA}
 6 | \usage{
 7 | run_cPCA_analysis(
 8 |   TCGA_dat,
 9 |   CCLE_dat,
10 |   tumor_cluster_df,
11 |   CL_cluster_df,
12 |   pc_dims = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{TCGA_dat:}{sample by genes matrix of scaled expression data}
17 | 
18 | \item{CCLE_dat:}{sample by genes matrix of scaled expression data}
19 | 
20 | \item{tumor_cluster_df:}{table of sample metadata that includes a column 'seurat_clusters',
21 | containing transcriptional clusters in the TCGA data}
22 | 
23 | \item{CL_cluster_df:}{table of sample metadata that includes a column 'seurat_clusters',
24 | containing transcriptional clusters in the CCLE data}
25 | 
26 | \item{pc_dims:}{numbers of cPCs calculated. If set to NULL (default) all cPCs will be calculated, if set to a value
27 | then that number of cPCs will be approximated. Values input should be >= 4.}
28 | }
29 | \value{
30 | contrastive principal component object containing cPC vectors and values
31 | }
32 | \description{
33 | Run contrastive principal components analysis, first removing average cluster expression.
34 | }
35 | 


--------------------------------------------------------------------------------
/man/run_lm_stats_limma_group.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Celligner_helpers.R
 3 | \name{run_lm_stats_limma_group}
 4 | \alias{run_lm_stats_limma_group}
 5 | \title{Differentially expressed genes}
 6 | \usage{
 7 | run_lm_stats_limma_group(
 8 |   mat,
 9 |   phenos,
10 |   covars = NULL,
11 |   weights = NULL,
12 |   target_type = "Gene",
13 |   limma_trend = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{mat:}{Nxp data matrix of N cell lines and p genes}
18 | 
19 | \item{phenos:}{N vector of independent variables. Can be two-group labels as factors, bools, or can be numeric}
20 | 
21 | \item{covars:}{optional Nxk matrix of sample covariates}
22 | 
23 | \item{weights:}{optional N vector of precision weights for each data point}
24 | 
25 | \item{target_type:}{name of the column variable in the data (default 'Gene')}
26 | }
27 | \value{
28 | table of gene level stats
29 | }
30 | \description{
31 | Estimate linear-model stats for a matrix of data with respect to a group of phenotype variables
32 | }
33 | 


--------------------------------------------------------------------------------
/man/run_multidataset_alignment.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/mutlidataset_alignment.R
  3 | \name{run_multidataset_alignment}
  4 | \alias{run_multidataset_alignment}
  5 | \title{All methods to run Celligner, with additional alignment of Met500 and PDX data, and save the output, if desired}
  6 | \usage{
  7 | run_multidataset_alignment(
  8 |   cell_line_data_name = "public-20q4-a4b3",
  9 |   cell_line_data_file = "CCLE_expression_full",
 10 |   cell_line_version = NULL,
 11 |   cell_line_taiga = TRUE,
 12 |   cell_line_ann_name = "arxspan-cell-line-export-f808",
 13 |   cell_line_ann_file = "ACH",
 14 |   cell_line_ann_version = NULL,
 15 |   cell_line_ann_taiga = TRUE,
 16 |   tumor_data_name = "celligner-input-9827",
 17 |   tumor_data_file = "tumor_expression",
 18 |   tumor_version = NULL,
 19 |   tumor_taiga = TRUE,
 20 |   tumor_ann_name = "celligner-input-9827",
 21 |   tumor_ann_file = "tumor_annotations",
 22 |   tumor_ann_version = NULL,
 23 |   tumor_ann_taiga = TRUE,
 24 |   additional_annotations_name = "celligner-input-9827",
 25 |   additional_annotations_file = "CCLE_annotations",
 26 |   additional_annotations_version = NULL,
 27 |   additional_annotations_taiga = TRUE,
 28 |   hgnc_data_name = "hgnc-87ab",
 29 |   hgnc_data_file = "hgnc_complete_set",
 30 |   hgnc_version = NULL,
 31 |   hgnc_taiga = TRUE,
 32 |   met500_data_name = "met500-fc3c",
 33 |   met500_data_file = "met500_TPM",
 34 |   met500_version = NULL,
 35 |   met500_taiga = TRUE,
 36 |   met500_ann_name = "met500-fc3c",
 37 |   met500_ann_file = "met500_ann",
 38 |   met500_ann_version = NULL,
 39 |   met500_ann_taiga = TRUE,
 40 |   Novartis_PDX_data_name = "pdx-data-3d29",
 41 |   Novartis_PDX_data_file = "Novartis_PDX_TPM",
 42 |   Novartis_PDX_version = NULL,
 43 |   Novartis_PDX_taiga = TRUE,
 44 |   Novartis_PDX_ann_name = "pdx-data-3d29",
 45 |   Novartis_PDX_ann_file = "Novartis_PDX_ann",
 46 |   Novartis_PDX_ann_version = NULL,
 47 |   Novartis_PDX_ann_taiga = TRUE,
 48 |   pediatric_PDX_data_name = "pdx-data-3d29",
 49 |   pediatric_PDX_data_file = "pediatric_PDX_TPM",
 50 |   pediatric_PDX_version = NULL,
 51 |   pediatric_PDX_taiga = TRUE,
 52 |   pediatric_PDX_ann_name = "pdx-data-3d29",
 53 |   pediatric_PDX_ann_file = "pediatric_PDX_ann",
 54 |   pediatric_PDX_ann_version = NULL,
 55 |   pediatric_PDX_ann_taiga = TRUE,
 56 |   save_output = NULL
 57 | )
 58 | }
 59 | \arguments{
 60 | \item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
 61 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data}
 62 | 
 63 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
 64 | if cell_line_taiga=FALSE, then the name of the file of cell line expression data}
 65 | 
 66 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL}
 67 | 
 68 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder}
 69 | 
 70 | \item{cell_line_ann_name:}{if cell_line_ann_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
 71 | if cell_line_ann_taiga=FALSE, then the file path to the local folder containing the cell line annotations}
 72 | 
 73 | \item{cell_line_ann_file:}{if cell_line_ann_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
 74 | if cell_line_ann_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
 75 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.}
 76 | 
 77 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL}
 78 | 
 79 | \item{cell_line_ann_taiga:}{if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder}
 80 | 
 81 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
 82 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.}
 83 | 
 84 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
 85 | if tumor_taiga=FALSE, then the name of the file of the tumor expression data}
 86 | 
 87 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL}
 88 | 
 89 | \item{tumor_taiga:}{if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder}
 90 | 
 91 | \item{tumor_ann_name:}{if tumor_ann_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
 92 | if tumor_ann_taiga=FALSE, then the file path to the local folder containing the tumor annotations}
 93 | 
 94 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
 95 | if tumor_ann_taiga=FALSE, then the name of the file of the tumor annotations. If pulling from taiga, assumes that the file is the already created Celligner info file used in the Celligner manuscript,
 96 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.}
 97 | 
 98 | \item{tumor_ann_version:}{parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL}
 99 | 
100 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder}
101 | 
102 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
103 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
104 | for the cell lines. If null, assumes there are no additional annotations.}
105 | 
106 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
107 | if additional_annotations_taiga=FALSE, then the name of the file of the additional annotations. If null, assumes there are
108 | no additional annotations.}
109 | 
110 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL}
111 | 
112 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder}
113 | 
114 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
115 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations}
116 | 
117 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
118 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations}
119 | 
120 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL}
121 | 
122 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder}
123 | 
124 | \item{met500_data_name:}{Met500 expression, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using met500_taiga=F}
125 | 
126 | \item{met500_data_file:}{default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using met500_taiga=F}
127 | 
128 | \item{met500_version:}{default NULL, used to specify version of taiga dataset}
129 | 
130 | \item{met500_taiga:}{if TRUE (default) pulls Met500 expression from taiga dataset, if FALSE reads from local}
131 | 
132 | \item{met500_ann_name:}{Met500 annotations, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using met500_ann_taiga=F}
133 | 
134 | \item{met500_ann_file:}{Met500 annotations, default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using met500_ann_taiga=F}
135 | 
136 | \item{met500_ann_version:}{default NULL, used to specify version of taiga dataset}
137 | 
138 | \item{met500_ann_taiga:}{if TRUE (default) pulls met500 annotations from taiga dataset, if FALSE reads from local}
139 | 
140 | \item{Novartis_PDX_data_name:}{Novartis PDX expression, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using Novartis_PDX_taiga=F}
141 | 
142 | \item{Novartis_PDX_data_file:}{default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using Novartis_PDX_taiga=F}
143 | 
144 | \item{Novartis_PDX_version:}{default NULL, used to specify version of taiga dataset}
145 | 
146 | \item{Novartis_PDX_taiga:}{if TRUE (default) pulls Novartis PDX expression from taiga dataset, if FALSE reads from local}
147 | 
148 | \item{Novartis_PDX_ann_name:}{Novartis PDX annotations, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using Novartis_PDX_ann_taiga=F}
149 | 
150 | \item{Novartis_PDX_ann_file:}{Novartis PDX annotations, default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using Novartis_PDX_ann_taiga=F}
151 | 
152 | \item{Novartis_PDX_ann_version:}{default NULL, used to specify version of taiga dataset}
153 | 
154 | \item{Novartis_PDX_ann_taiga:}{if TRUE (default) pulls Novartis PDX annotations from taiga dataset, if FALSE reads from local}
155 | 
156 | \item{pediatric_PDX_data_name:}{pediatric PDX expression, default pulls from taiga, this the data_name of the taiga dataset, or path to folder if using pediatric_PDX_taiga=F}
157 | 
158 | \item{pediatric_PDX_data_file:}{default pulls from taiga, this the data_file of the taiga dataset, or name of local file if using pediatric_PDX_taiga=F}
159 | 
160 | \item{pediatric_PDX_version:}{default NULL, used to specify version of taiga dataset}
161 | 
162 | \item{pediatric_PDX_taiga:}{if TRUE (default) pulls pediatric PDX expression from taiga dataset, if FALSE reads from local}
163 | 
164 | \item{pediatric_PDX_ann_name:}{Pediatric PDX annotations, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using pediatric_PDX_ann_taiga=F}
165 | 
166 | \item{pediatric_PDX_ann_file:}{Pediatric PDX annotations, default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using pediatric_PDX_ann_taiga=F}
167 | 
168 | \item{pediatric_PDX_ann_version:}{default NULL, used to specify version of taiga dataset}
169 | 
170 | \item{pediatric_PDX_ann_taiga:}{if TRUE (default) pulls pediatric PDX annotations from taiga dataset, if FALSE reads from local}
171 | 
172 | \item{save_output:}{by default is NULL and won't save output, to save output pass in a filepath of where to save the output}
173 | }
174 | \value{
175 | seurat object of the Celligner-aligned data
176 | }
177 | \description{
178 | run all parts of the Celligner pipeline, with alignment of additional datasets
179 | }
180 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: celligner
 2 | theme: readthedocs
 3 | nav:
 4 |   - "Celligner": celligner.md
 5 | plugins:
 6 | - mkdocstrings:
 7 |     default_handler: python
 8 |     handlers:
 9 |       python:
10 |         rendering:
11 |           show_source: false
12 |     custom_templates: templates
13 |     watch:
14 |       - celligner/
15 |           


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==1.23.3
 2 | pandas==1.4.4
 3 | scikit_learn==1.1.2
 4 | umap>=0.1
 5 | igraph==0.9.11
 6 | scanpy==1.9.1
 7 | anndata==0.8.0
 8 | scipy==1.9.1
 9 | louvain==0.7.1
10 | rpy2==3.5.4
11 | cython==0.29.32
12 | matplotlib<3.7


--------------------------------------------------------------------------------
/run_celligner.py:
--------------------------------------------------------------------------------
  1 | import celligner
  2 | import pandas as pd
  3 | import re
  4 | from taigapy import TaigaClient
  5 | 
  6 | tc = TaigaClient()
  7 | 
  8 | portal_in_dict = {}
  9 | 
 10 | portal_out_dict = {}
 11 | 
 12 | depmap_params = {'name': 'depmap_public_23q2',
 13 |                  'taiga_name': 'public-23q2-19de',
 14 |                  'taiga_file': 'OmicsExpressionProteinCodingGenesTPMLogp1',
 15 |                  'dset_type': 'model',
 16 |                  'mnn_params': None}
 17 | 
 18 | tcga_params = {'name': 'tcga',
 19 |                'taiga_name': 'celligner-input-9827',
 20 |                'taiga_file': 'tumor_expression',
 21 |                'dset_type': 'tumor',
 22 |                'mnn_params': None}
 23 | 
 24 | met500_params = {'name': 'met500',
 25 |                  'taiga_name': 'met500-fc3c',
 26 |                  'taiga_file': 'met500_TPM',
 27 |                  'dset_type': 'tumor',
 28 |                  'mnn_params': {'k1': 20, 'k2': 50}}
 29 | 
 30 | pdx_nv_params = {'name': 'pdx_novartis',
 31 |                  'taiga_name': 'pdx-data-3d29',
 32 |                  'taiga_file': 'Novartis_PDX_TPM',
 33 |                  'dset_type': 'model',
 34 |                  'mnn_params': {'k1': 10, 'k2': 50}}
 35 | 
 36 | pdx_ped_params = {'name': 'pdx_pediatric',
 37 |                   'taiga_name': 'pdx-data-3d29',
 38 |                   'taiga_file': 'pediatric_PDX_TPM',
 39 |                   'dset_type': 'model',
 40 |                   'mnn_params': {'k1': 10, 'k2': 50}}
 41 | 
 42 | celligner_default_extras = [met500_params, pdx_nv_params, pdx_ped_params]
 43 | 
 44 | 
 45 | def process_data(bg_df, contrast_df, extra_dfs):
 46 |     # Filter to columns with ensembl id
 47 |     # Need to transform from ensembl transcript ids to ensemble gene ids
 48 |     # insert below
 49 | 
 50 |     # Load HGNC gene set, filter to functional subset
 51 |     hgnc_complete_set = tc.get(name='hgnc-87ab', version=7, file='hgnc_complete_set')
 52 |     func_genes = hgnc_complete_set[~hgnc_complete_set.locus_group.isin(["non-coding RNA", "pseudogene"])]
 53 | 
 54 |     # background datafrrame should have genes labeled by gene symbol
 55 |     # need to reindex
 56 |     # print('bg_genes going in:', bg_df.keys())
 57 |     bg_genes = pd.Series(bg_df.keys()).apply(lambda s: re.search(r'^([\w.-]+) \(', s).group(1)).rename('symbol')
 58 |     bg_genes = bg_genes.set_axis(bg_df.keys()).to_frame()
 59 |     bg_genes = bg_genes.reset_index().merge(hgnc_complete_set[['symbol', 'ensembl_gene_id']],
 60 |                               left_on='symbol', right_on='symbol')[['index','ensembl_gene_id']].set_index('index')
 61 |     # print('bg_genes processed:', bg_genes)
 62 |     bg_df = bg_df.rename(columns=bg_genes.to_dict()['ensembl_gene_id'])
 63 |     # print(bg_df.head())
 64 | 
 65 |     gene_sets = [set(bg_df.columns), set(contrast_df.columns)] + [set(_df.columns) for _df in extra_dfs]
 66 |     gene_set = set(func_genes.ensembl_gene_id).intersection(*gene_sets)
 67 |     common_genes = [x for x in bg_df.columns if x in gene_set]
 68 |     print('Common genes:', len(common_genes))
 69 | 
 70 |     bg_df = bg_df[common_genes]
 71 |     contrast_df = contrast_df[common_genes]
 72 |     extra_dfs = [_df[common_genes] for _df in extra_dfs]
 73 | 
 74 |     return bg_df, contrast_df, extra_dfs
 75 | 
 76 | 
 77 | def run_celligner(bg=depmap_params, contrast=tcga_params, extra_dsets=celligner_default_extras):
 78 |     # load data to be used as the background and label the source
 79 |     bg_df = tc.get(name=bg['taiga_name'], file=bg['taiga_file'])
 80 |     bg_df = pd.concat({bg['name']: bg_df}, names=['source'])
 81 | 
 82 |     # load data to be contrasted with the background and label the source
 83 |     contrast_df = tc.get(name=contrast['taiga_name'], file=contrast['taiga_file'])
 84 |     contrast_df = pd.concat({contrast['name']: contrast_df}, names=['source'])
 85 | 
 86 |     # if there are additional datasets to be projected then collect and label them as well
 87 |     extra_dfs = []
 88 |     for dset in extra_dsets:
 89 |         _df = tc.get(name=dset['taiga_name'], file=dset['taiga_file'])
 90 |         if dset['name'] == 'pdx_novartis':
 91 |             _df = _df.T
 92 |         _df = pd.concat({dset['name']: _df}, names=['source'])
 93 |         extra_dfs.append(_df)
 94 | 
 95 |     '''print(bg_df.head())
 96 |     print(contrast_df.head())
 97 |     for _df in extra_dfs:
 98 |         print(_df.head())'''
 99 |     # make sure all datasets are using ensembl gene ids and are restricted to common sets of genes
100 |     bg_df, contrast_df, extra_dfs = process_data(bg_df, contrast_df, extra_dfs)
101 |     '''print(bg_df.head())
102 |     print(contrast_df.head())
103 |     for _df in extra_dfs:
104 |         print(_df.head())'''
105 |     # Create Celligner object and fit + transform the reference (depmap) and target (TCGA) expression datasets
106 |     my_celligner = celligner.Celligner()
107 |     my_celligner.fit(bg_df.droplevel(0,0))
108 |     my_celligner.transform(contrast_df.droplevel(0,0))
109 |     # add in additional datasets to be projected if they are given
110 |     for _df in extra_dfs:
111 |         df_name = _df.index.get_level_values(0).unique()[0]
112 |         for dset in extra_dsets:
113 |             if dset['name'] == df_name:
114 |                 break
115 | 
116 |         my_celligner.makeNewReference()
117 |         if dset['mnn_params']:
118 |             p = dset['mnn_params']
119 |             my_celligner.mnn_kwargs.update({'k1': p['k1'], 'k2': p['k2']})
120 |         my_celligner.transform(_df.droplevel(0,0), compute_cPCs=False)
121 | 
122 |     # Compute UMAP, clusters and tumor - model distance
123 |     model_ids = list(bg_df.index.get_level_values(1))
124 |     tumor_ids = list(contrast_df.index.get_level_values(1))
125 | 
126 |     for _df in extra_dfs:
127 |         df_name = _df.index.get_level_values(0).unique()[0]
128 |         for dset in extra_dsets:
129 |             if dset['name'] == df_name:
130 |                 break
131 | 
132 |         if dset['dset_type'] == 'model':
133 |             model_ids += list(_df.index.get_level_values(1))
134 |         elif dset['dset_type'] == 'tumor':
135 |             tumor_ids += list(_df.index.get_level_values(1))
136 | 
137 |     my_celligner.computeMetricsForOutput(model_ids=model_ids, tumor_ids=tumor_ids)
138 | 
139 |     outname = 'celligner_output_' \
140 |               + bg['name'] + '_' \
141 |               + contrast['name'] + '_' \
142 |               + '_'.join([d['name'] for d in extra_dsets]) + ".pkl"
143 |     my_celligner.save(outname)
144 |     print('Model saved to: ', outname)
145 |     return outname
146 | 
147 | 
148 | if __name__ == "__main__":
149 |     run_celligner()
150 | 


--------------------------------------------------------------------------------
/run_celligner_multi_dataset.py:
--------------------------------------------------------------------------------
 1 | import celligner
 2 | import pandas as pd
 3 | import numpy as np
 4 | import re
 5 | from taigapy import TaigaClient
 6 | tc = TaigaClient()
 7 | 
 8 | # Load data
 9 | CCLE_expression = tc.get(name='dmc-22q2-5e51', version=16, file='CCLE_expression_full')
10 | tumor_expression = tc.get(name='celligner-input-9827', version=2, file='tumor_expression')
11 | met500_TPM = tc.get(name='met500-fc3c', version=1, file='met500_TPM')
12 | Novartis_PDX_TPM = tc.get(name='pdx-data-3d29', version=2, file='Novartis_PDX_TPM').T
13 | pediatric_PDX_TPM = tc.get(name='pdx-data-3d29', version=2, file='pediatric_PDX_TPM')
14 | 
15 | # Filter to columns with ensembl id
16 | CCLE_expression = CCLE_expression.filter(like='ENSG')
17 | CCLE_expression.columns = pd.Series(CCLE_expression.columns).apply(lambda x: re.search('(ENSG\d+)', x).group(1))
18 | 
19 | ## Load HGNC gene set, filter to functional subset
20 | hgnc_complete_set = tc.get(name='hgnc-87ab', version=5, file='hgnc_complete_set')
21 | func_genes = hgnc_complete_set[~hgnc_complete_set.locus_group.isin(["non-coding RNA", "pseudogene"])]
22 | 
23 | # Identify common genes - maintaining order from CCLE expression matrix
24 | gene_sets = [set(tumor_expression.columns), set(met500_TPM.columns), set(Novartis_PDX_TPM.columns), set(pediatric_PDX_TPM.columns)]
25 | gene_set = set(func_genes.ensembl_gene_id).intersection(*gene_sets)
26 | common_genes = [x for x in CCLE_expression.columns if x in gene_set]
27 | print('Common genes:', len(common_genes))
28 | 
29 | # Subset all matrices to common genes
30 | CCLE_expression = CCLE_expression[common_genes]
31 | tumor_expression = tumor_expression[common_genes]
32 | met500_TPM = met500_TPM[common_genes]
33 | Novartis_PDX_TPM = Novartis_PDX_TPM[common_genes]
34 | pediatric_PDX_TPM = pediatric_PDX_TPM[common_genes]
35 | 
36 | # Create Celligner object and fit + transform the reference (CCLE) and target (TCGA) expression datasets
37 | my_celligner = celligner.Celligner()
38 | my_celligner.fit(CCLE_expression)
39 | my_celligner.transform(tumor_expression)
40 | 
41 | # Multi-dataset alignment - sequentially aligning additional expression datasets
42 | # Met500
43 | my_celligner.makeNewReference()
44 | my_celligner.mnn_kwargs.update({"k1":20, "k2":50})
45 | my_celligner.transform(met500_TPM, compute_cPCs=False)
46 | # Novartis PDX
47 | my_celligner.makeNewReference()
48 | my_celligner.mnn_kwargs.update({"k1":10, "k2":50})
49 | my_celligner.transform(Novartis_PDX_TPM, compute_cPCs=False)
50 | # Pediatric PDX
51 | my_celligner.makeNewReference()
52 | my_celligner.mnn_kwargs.update({"k1":10, "k2":50})
53 | my_celligner.transform(pediatric_PDX_TPM, compute_cPCs=False)
54 | 
55 | # Compute UMAP, clusters and tumor - model distance
56 | model_ids = list(CCLE_expression.index)+list(Novartis_PDX_TPM.index)+list(pediatric_PDX_TPM.index)
57 | tumor_ids = list(tumor_expression.index)+list(met500_TPM.index)
58 | my_celligner.computeMetricsForOutput(model_ids=model_ids, tumor_ids=tumor_ids)
59 | 
60 | my_celligner.save("model_22q2_dmc_multi_dataset.pkl")
61 | 


--------------------------------------------------------------------------------
/run_on_sparkles.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Submit run_celligner.py as a sparkles job named "celligner-run".
# NOTE(review): each -u presumably uploads a local path, optionally renamed
# after the ':' - confirm against the sparkles documentation.
# The token path is quoted so a $HOME containing spaces is not word-split.
sparkles sub -n celligner-run \
    -u celligner:celligner \
    -u mnnpy:mnnpy \
    -u install_submodules_and_run.sh \
    -u run_celligner.py \
    -u "$HOME/.taiga/token:.taiga-token" \
    sh install_submodules_and_run.sh "run_celligner.py"


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | import sys
 3 | import os
 4 | import io
 5 | import subprocess
 6 | 
 7 | if sys.version_info.major < 3 or sys.version_info.minor < 2:
 8 |   raise ValueError("celligner is only compatible with Python 3.3 and above")
 9 | if sys.version_info.minor < 5:
10 |   import warnings
11 |   warnings.warn("celligner may not function properly on Python < 3.5")
12 | 
13 | print("trying to install the required limma R package")
14 | try:
15 |   subprocess.run(
16 |     'R -e \'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install("limma");\'', shell=True, check=True, 
17 |     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
18 | except:
19 |   print('failed to install limma. \
20 |     please install R or check your R installation and then install limma with:\
21 |     R -e \"if(!requireNamespace(\"BiocManager\", quietly = TRUE)){\
22 |         install.packages(\"BiocManager\", repos=\"http://cran.us.r-project.org\")};\
23 |       BiocManager::install(c(\"limma\"));\"')
24 | 
25 | print("Finished!")
def read(*paths, **kwargs):
  """Return the stripped text of a file located relative to this script.

  The path components in ``paths`` are joined onto the directory containing
  this file. The optional ``encoding`` keyword selects the text encoding and
  defaults to ``"utf8"``.

  >>> read("celligner", "VERSION")
  '0.1.0'
  >>> read("README.md")
  ...
  """
  target = os.path.join(os.path.dirname(__file__), *paths)
  text_encoding = kwargs.get("encoding", "utf8")
  with io.open(target, encoding=text_encoding) as handle:
    return handle.read().strip()
41 | 
42 | 
def read_requirements(path):
  """Return the requirement specifiers listed in a requirements file.

  ``path`` is resolved by ``read``. Each line is stripped first; blank lines
  and lines starting with a double quote, ``#`` (comment), ``-`` (pip option)
  or ``git+`` (VCS URL) are skipped. Stripping before filtering means blank
  lines no longer become empty requirements and indented comments or options
  are skipped as well.
  """
  requirements = []
  for raw_line in read(path).split("\n"):
    line = raw_line.strip()
    if not line or line.startswith(('"', "#", "-", "git+")):
      continue
    requirements.append(line)
  return requirements
49 | 
50 | 
# Package metadata. The version comes from celligner/VERSION and the runtime
# dependencies from requirements.txt.
setup(
  name='celligner',
  version=read("celligner", "VERSION"),
  description='A useful module for alligning cell lines to tumors',
  long_description=read("README.md"),
  long_description_content_type="text/markdown",
  author="Broad Institute CDS",
  url="https://github.com/BroadInstitute/celligner",
  packages=find_packages(exclude=["tests", ".github"]),
  # NOTE(review): no celligner/data directory appears in the repository
  # listing - confirm this pattern is still needed.
  package_data={'celligner': ['data/*']},
  # The pins in requirements.txt (numpy 1.23, pandas 1.4, scipy 1.9,
  # scikit-learn 1.1) are only published for Python 3.8 and newer, so declare
  # that here and let pip reject older interpreters up front.
  python_requires='>=3.8',
  install_requires=read_requirements("requirements.txt"),
  # NOTE(review): celligner/__main__.py is not in the repository listing, so
  # this console script may fail to import - confirm.
  entry_points={
    "console_scripts": ["celligner = celligner.__main__:main"]
  },
  #extras_require={"test": read_requirements("requirements-test.txt")},
  classifiers=[
    "Programming Language :: Python :: 3",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
  ],
)
73 | 
74 | # try: 
75 | #   subprocess.run(
76 | #     "git submodule update --remote --init", shell=True, check=True, 
77 | #     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
78 | # except:
79 | #   print('failed to install the mnnpy and CPCA submodules. \
80 | #     please install Python or check your Python installation and then install mnnpy & cpca with:\
81 | #     cd PROJECTLOC && git submodule update --remote --init')
82 | 


--------------------------------------------------------------------------------