├── .github ├── FUNDING.yml ├── release_message.sh └── workflows │ ├── main.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── Dockerfile ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── R ├── Celligner_helpers.R ├── Celligner_methods.R ├── DESCRIPTION ├── Dockerfile ├── NAMESPACE ├── README.md ├── global_params.R ├── install_packages.R └── mutlidataset_alignment.R ├── README.md ├── build_docker.sh ├── celligner ├── VERSION ├── __init__.py ├── limma.py └── params.py ├── celligner_output.ipynb ├── docs ├── Screenshot 2021-10-29 at 10.51.53.png ├── Screenshot 2021-10-29 at 10.53.01.png ├── celligner.md ├── celligner_diagram.png ├── celligner_public22q2.png ├── example.html ├── example.pdf ├── index.html ├── index.md └── typical_celligner.webp ├── install_submodules_and_run.sh ├── man ├── calc_gene_stats.Rd ├── calc_tumor_CL_cor.Rd ├── check_NAs.Rd ├── cluster_data.Rd ├── create_Seurat_object.Rd ├── dot-average_correction.Rd ├── dot-center_along_batch_vector.Rd ├── dot-compute_tricube_average.Rd ├── dot-tricube_weighted_correction.Rd ├── find_differentially_expressed_genes.Rd ├── get_cluster_averages.Rd ├── load_additional_data.Rd ├── load_data.Rd ├── modified_mnnCorrect.Rd ├── run_Celligner.Rd ├── run_MNN.Rd ├── run_cPCA.Rd ├── run_cPCA_analysis.Rd ├── run_lm_stats_limma_group.Rd └── run_multidataset_alignment.Rd ├── mkdocs.yml ├── requirements.txt ├── run_celligner.py ├── run_celligner_multi_dataset.py ├── run_on_sparkles.sh ├── runs ├── 22Q1-newmerging.ipynb ├── 22Q1.ipynb ├── 22Q2.ipynb ├── CCLF-manuscript.ipynb ├── cclf_analysisCelligner_plot_scatter.html ├── cclf_color_analysisCelligner_plot_scatter.html └── testing.ipynb └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [broadinstitute] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open 
Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/release_message.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | previous_tag=$(git tag --sort=-creatordate | sed -n 2p) 3 | git shortlog "${previous_tag}.." | sed 's/^./ &/' 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the main branch 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | jobs: 17 | linter: 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.9] 22 | os: [ubuntu-latest] 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install project 30 | run: make install 31 | - name: Run linter 32 | run: make lint 33 | 34 | tests_linux: 35 | needs: linter 36 | strategy: 37 | fail-fast: false 38 | matrix: 39 | python-version: [3.9] 40 | os: [ubuntu-latest] 41 | 
runs-on: ${{ matrix.os }} 42 | steps: 43 | - uses: actions/checkout@v2 44 | - uses: actions/setup-python@v2 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install project 48 | run: make install 49 | - name: Run tests 50 | run: make test 51 | - name: "Upload coverage to Codecov" 52 | uses: codecov/codecov-action@v1 53 | # with: 54 | # fail_ci_if_error: true 55 | 56 | tests_mac: 57 | needs: linter 58 | strategy: 59 | fail-fast: false 60 | matrix: 61 | python-version: [3.9] 62 | os: [macos-latest] 63 | runs-on: ${{ matrix.os }} 64 | steps: 65 | - uses: actions/checkout@v2 66 | - uses: actions/setup-python@v2 67 | with: 68 | python-version: ${{ matrix.python-version }} 69 | - name: Install project 70 | run: make install 71 | - name: Run tests 72 | run: make test 73 | 74 | tests_win: 75 | needs: linter 76 | strategy: 77 | fail-fast: false 78 | matrix: 79 | python-version: [3.9] 80 | os: [windows-latest] 81 | runs-on: ${{ matrix.os }} 82 | steps: 83 | - uses: actions/checkout@v2 84 | - uses: actions/setup-python@v2 85 | with: 86 | python-version: ${{ matrix.python-version }} 87 | - name: Install Pip 88 | run: pip install --user --upgrade pip 89 | - name: Install project 90 | run: pip install -e .[test] 91 | - name: run tests 92 | run: pytest -s -vvvv -l --tb=long tests 93 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/tags 6 | tags: 7 | - '*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | release: 14 | name: Create Release 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | # by default, it uses a depth of 1 20 | # this fetches all history so that we can read each commit 21 | fetch-depth: 0 22 | - name: Generate Changelog 23 | run: .github/release_message.sh > release_message.md 24 | - name: Release 25 | uses: softprops/action-gh-release@v1 26 | with: 27 | body_path: release_message.md 28 | 29 | deploy: 30 | needs: release 31 | runs-on: ubuntu-latest 32 | steps: 33 | - uses: actions/checkout@v1 34 | - name: Set up Python 35 | uses: actions/setup-python@v1 36 | with: 37 | python-version: '3.x' 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install setuptools wheel twine 42 | - name: Build and publish 43 | env: 44 | TWINE_USERNAME: __token__ 45 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 46 | run: | 47 | python setup.py sdist bdist_wheel 48 | twine upload dist/* 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/r,macos,python,sublimetext 2 | 3 | .code-workspace.code-workspace 4 | # Edit at https://www.gitignore.io/?templates=r,macos,python,sublimetext 5 | ### PERSO ### 6 | data/* 7 | **.so 8 | *.code-workspace 9 | ### macOS ### 10 | # General 11 | .DS_Store 12 | .AppleDouble 13 | .LSOverride 14 | .vscode 15 | 16 | # output model 17 | *.pkl 18 | 19 | # sparkles 20 | .kubeque-cached-file-hashes 21 | .sparkles-cache 22 | 23 | # Icon must end with two \r 24 | Icon 25 | 26 | # Thumbnails 27 | ._* 28 | temp/* 29 | 30 | # Files that might appear in the root of a volume 31 | .DocumentRevisions-V100 32 | .fseventsd 33 | .Spotlight-V100 34 | .TemporaryItems 35 | 
.Trashes 36 | .VolumeIcon.icns 37 | .com.apple.timemachine.donotpresent 38 | 39 | # Directories potentially created on remote AFP share 40 | .AppleDB 41 | .AppleDesktop 42 | Network Trash Folder 43 | Temporary Items 44 | .apdisk 45 | 46 | ### Python ### 47 | # Byte-compiled / optimized / DLL files 48 | __pycache__/ 49 | *.py[cod] 50 | *$py.class 51 | 52 | # C extensions 53 | *.so 54 | 55 | # Distribution / packaging 56 | .Python 57 | build/ 58 | develop-eggs/ 59 | dist/ 60 | downloads/ 61 | eggs/ 62 | .eggs/ 63 | lib/ 64 | lib64/ 65 | parts/ 66 | sdist/ 67 | var/ 68 | wheels/ 69 | pip-wheel-metadata/ 70 | share/python-wheels/ 71 | *.egg-info/ 72 | .installed.cfg 73 | *.egg 74 | MANIFEST 75 | 76 | # PyInstaller 77 | # Usually these files are written by a python script from a template 78 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 79 | *.manifest 80 | *.spec 81 | 82 | # Installer logs 83 | pip-log.txt 84 | pip-delete-this-directory.txt 85 | 86 | # Unit test / coverage reports 87 | htmlcov/ 88 | .tox/ 89 | .nox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | .hypothesis/ 97 | .pytest_cache/ 98 | 99 | # Translations 100 | *.mo 101 | *.pot 102 | 103 | # Django stuff: 104 | *.log 105 | local_settings.py 106 | db.sqlite3 107 | 108 | # Flask stuff: 109 | instance/ 110 | .webassets-cache 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | .ipynb_checkpoints 123 | 124 | # IPython 125 | profile_default/ 126 | ipython_config.py 127 | 128 | # pyenv 129 | .python-version 130 | 131 | # pipenv 132 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
133 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 134 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 135 | # install all needed dependencies. 136 | #Pipfile.lock 137 | 138 | # celery beat schedule file 139 | celerybeat-schedule 140 | 141 | # SageMath parsed files 142 | *.sage.py 143 | 144 | # Environments 145 | .env 146 | .venv 147 | env/ 148 | venv/ 149 | ENV/ 150 | env.bak/ 151 | venv.bak/ 152 | 153 | # Spyder project settings 154 | .spyderproject 155 | .spyproject 156 | 157 | # Rope project settings 158 | .ropeproject 159 | 160 | # mkdocs documentation 161 | /site 162 | 163 | # mypy 164 | .mypy_cache/ 165 | .dmypy.json 166 | dmypy.json 167 | 168 | # Pyre type checker 169 | .pyre/ 170 | 171 | ### R ### 172 | # History files 173 | .Rhistory 174 | .Rapp.history 175 | 176 | # Session Data files 177 | .RData 178 | 179 | # User-specific files 180 | .Ruserdata 181 | 182 | # Example code in package build process 183 | *-Ex.R 184 | 185 | # Output files from R CMD build 186 | /*.tar.gz 187 | 188 | # Output files from R CMD check 189 | /*.Rcheck/ 190 | 191 | # RStudio files 192 | .Rproj.user/ 193 | 194 | # produced vignettes 195 | vignettes/*.html 196 | vignettes/*.pdf 197 | 198 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 199 | .httr-oauth 200 | 201 | # knitr and R markdown default cache directories 202 | /*_cache/ 203 | /cache/ 204 | 205 | # Temporary files created by R markdown 206 | *.utf8.md 207 | *.knit.md 208 | 209 | ### R.Bookdown Stack ### 210 | # R package: bookdown caching files 211 | /*_files/ 212 | 213 | ### SublimeText ### 214 | # Cache files for Sublime Text 215 | *.tmlanguage.cache 216 | *.tmPreferences.cache 217 | *.stTheme.cache 218 | 219 | # Workspace files are user-specific 220 | *.sublime-workspace 221 | 222 | # Project files should be checked into the repository, unless a significant 223 | # proportion of contributors will 
probably not be using Sublime Text 224 | # *.sublime-project 225 | 226 | # SFTP configuration file 227 | sftp-config.json 228 | 229 | # Package control specific files 230 | Package Control.last-run 231 | Package Control.ca-list 232 | Package Control.ca-bundle 233 | Package Control.system-ca-bundle 234 | Package Control.cache/ 235 | Package Control.ca-certs/ 236 | Package Control.merged-ca-bundle 237 | Package Control.user-ca-bundle 238 | oscrypto-ca-bundle.crt 239 | bh_unicode_properties.cache 240 | 241 | # Sublime-github package stores a github token in this file 242 | # https://packagecontrol.io/packages/sublime-github 243 | GitHub.sublime-settings 244 | 245 | # tmp files 246 | tmp.py 247 | 248 | # End of https://www.gitignore.io/api/r,macos,python,sublimetext 249 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "mnnpy"] 2 | path = mnnpy 3 | url = git@github.com:DeKegel/mnnpy.git 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to develop on this project 2 | 3 | celligner welcomes contributions from the community. 4 | 5 | **You need PYTHON3!** 6 | 7 | These instructions are for Unix-like systems (Linux, macOS, BSD, etc.). 8 | ## Setting up your own fork of this repo. 9 | 10 | - On the GitHub interface, click the `Fork` button. 11 | - Clone your fork of this repo. `git clone git@github.com:YOUR_GIT_USERNAME/celligner.git` 12 | - Enter the directory `cd celligner` 13 | - Add upstream repo `git remote add upstream https://github.com/broadinstitute/celligner` 14 | 15 | ## Setting up your own virtual environment 16 | 17 | Run `make virtualenv` to create a virtual environment. 18 | Then activate it with `source .venv/bin/activate`.
19 | 20 | ## Install the project in develop mode 21 | 22 | Run `make install` to install the project in develop mode. 23 | 24 | ## Run the tests to ensure everything is working 25 | 26 | Run `make test` to run the tests. 27 | 28 | ## Create a new branch to work on your contribution 29 | 30 | Run `git checkout -b my_contribution` 31 | 32 | ## Make your changes 33 | 34 | Edit the files using your preferred editor. (we recommend VIM or VSCode) 35 | 36 | ## Format the code 37 | 38 | Run `make fmt` to format the code. 39 | 40 | ## Run the linter 41 | 42 | Run `make lint` to run the linter. 43 | 44 | ## Test your changes 45 | 46 | Run `make test` to run the tests. 47 | 48 | Ensure the code coverage report shows `100%` coverage; add tests to your PR if it does not. 49 | 50 | ## Build the docs locally 51 | 52 | Run `make docs` to build the docs. 53 | 54 | Ensure your new changes are documented. 55 | 56 | ## Commit your changes 57 | 58 | This project uses [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/). 59 | 60 | Example: `fix(package): update setup.py arguments 🎉` (emojis are fine too) 61 | 62 | ## Push your changes to your fork 63 | 64 | Run `git push origin my_contribution` 65 | 66 | ## Submit a pull request 67 | 68 | On the GitHub interface, click the `Pull Request` button. 69 | 70 | Wait for CI to run; one of the developers will then review your PR. 71 | ## Makefile utilities 72 | 73 | This project comes with a `Makefile` that contains a number of useful utilities. 74 | 75 | ```bash 76 | ❯ make 77 | Usage: make 78 | 79 | Targets: 80 | help: ## Show the help. 81 | install: ## Install the project in dev mode. 82 | fmt: ## Format code using black & isort. 83 | lint: ## Run pep8, black, mypy linters. 84 | test: lint ## Run tests and generate coverage report. 85 | watch: ## Run tests on every change. 86 | clean: ## Clean unused files. 87 | virtualenv: ## Create a virtual environment. 88 | release: ## Create a new tag for release. 89 | docs: ## Build the documentation.
90 | switch-to-poetry: ## Switch to poetry package manager. 91 | init: ## Initialize the project based on an application template. 92 | ``` 93 | 94 | ## Making a new release 95 | 96 | This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z` 97 | Every time a new tag is created and pushed to the remote repo, github actions will 98 | automatically create a new release on github and trigger a release on PyPI. 99 | 100 | For this to work you need to set up a secret called `PYPI_API_TOKEN` on the project settings>secrets, 101 | this token can be generated on [pypi.org](https://pypi.org/account/). 102 | 103 | To trigger a new release, all you need to do is: 104 | 105 | 1. If you have changes to add to the repo 106 | * Make your changes following the steps described above. 107 | * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/). 108 | 2. Run the tests to ensure everything is working. 109 | 3. Run `make release` to create a new tag and push it to the remote repo. 110 | 111 | `make release` will ask you for the version number to use for the tag, e.g. type `0.1.1` when you are asked. 112 | 113 | > **CAUTION**: The make release will change local changelog files and commit all the unstaged changes you have. 114 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Dockerfile to create celligner image 4 | # 5 | # Run build_docker.sh 6 | 7 | FROM python:3.8 8 | 9 | #add R and CMAKE 10 | RUN apt-get update && apt-get install -y r-base cmake 11 | # install 12 | RUN R -e 'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install("limma");' 13 | 14 | #install requirements 15 | COPY requirements.txt .
16 | RUN pip install --upgrade pip &&\ 17 | pip install -r requirements.txt -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | 5 | (unreleased) 6 | ------------ 7 | - Release: version 🚀 [Jérémie Kalfon] 8 | - Update setup.py. [Jérémie Kalfon] 9 | - Update requirements.txt. [Jérémie Kalfon] 10 | - Update README.md. [Jérémie Kalfon] 11 | - Update README.md. [Jérémie Kalfon] 12 | - Release: version 1.1.0 🚀 [jkobject] 13 | 14 | 15 | 1.1.0 (2022-04-01) 16 | ------------------ 17 | - Update to the notebooks. [jkobject] 18 | - Quick debug and adding more datasets. [jkobject] 19 | - Merge branch 'master' of https://github.com/broadinstitute/celligner. 20 | [jkobject] 21 | - Update README.md. [Jérémie Kalfon] 22 | - Some trials with the release, making new release too. [jkobject] 23 | - Som debugs and reformating. [jkobject] 24 | - Again. [jkobject] 25 | - Better viz of new dataset. [jkobject] 26 | - CCLF analysis. [jkobject] 27 | - Updating the demo with the tests. [jkobject] 28 | - Some small improvements. [jkobject] 29 | - Adding a new release notebook. [jkobject] 30 | - Format. [jkobject] 31 | - Updating formatting. [jkobject] 32 | - Debugging limma when different version of rpy2. [jkobject] 33 | - Update README.md. [Jérémie Kalfon] 34 | - Merge pull request #3 from broadinstitute/dev. [Jérémie Kalfon] 35 | 36 | Update README.md 37 | - From javad's comment. [jkobject] 38 | - Update README.md. [Jérémie Kalfon] 39 | - Merge branch 'master' into dev. [Jérémie Kalfon] 40 | - Update README.md. [Jérémie Kalfon] 41 | - Adding a bit of doc and WIP on QC. [jkobject] 42 | - Release: version 1.0.1 🚀 [jkobject] 43 | 44 | 45 | 1.0.1 (2021-10-27) 46 | ------------------ 47 | - Cleanup. [jkobject] 48 | - Finishing examples. 
[jkobject] 49 | - Release: version 1.0.0 🚀 [jkobject] 50 | 51 | 52 | 1.0.0 (2021-10-27) 53 | ------------------ 54 | - Finishig multidataseet alignment and final debugs. [jkobject] 55 | - Release: version 0.9.3 🚀 [jkobject] 56 | 57 | 58 | 0.9.3 (2021-10-25) 59 | ------------------ 60 | - Release: version 0.9.2 🚀 [jkobject] 61 | 62 | 63 | 0.9.2 (2021-10-25) 64 | ------------------ 65 | - Merge pull request #2 from jkobject/master. [Jérémie Kalfon] 66 | 67 | tomerge 68 | - Release: version 0.9.1 🚀 [jkobject] 69 | - Merge pull request #1 from jkobject/dev. [Jérémie Kalfon] 70 | - Merge branch 'master' of https://github.com/broadinstitute/celligner. 71 | [Jérémie Kalfon] 72 | - Adding data and planning. [Jérémie Kalfon] 73 | 74 | 75 | 0.9.1 (2021-10-25) 76 | ------------------ 77 | - Nthi. [jkobject] 78 | - Nothing really. [jkobject] 79 | - Better readme. [jkobject] 80 | - Merge branch 'dev' of https://github.com/jkobject/celligner into dev. 81 | [jkobject] 82 | - Update .github/FUNDING.yml. [Jérémie Kalfon] 83 | - Delete rename_project.yml. [Jérémie Kalfon] 84 | - Delete rename_project.sh. [Jérémie Kalfon] 85 | - Delete init.sh. [Jérémie Kalfon] 86 | - Adding doc and remving snn. [jkobject] 87 | - Release: version 0.9.0 🚀 [jkobject] 88 | 89 | 90 | 0.9.0 (2021-10-24) 91 | ------------------ 92 | - Updating mnn too. [jkobject] 93 | - More on celligner. [jkobject] 94 | - Continuing celligner debug. [jkobject] 95 | - Debuging marioni 1/n. [jkobject] 96 | - Updating contrastive and mnnpy. [jkobject] 97 | - Making it more prod ready. [jkobject] 98 | - Making a prod version. [jkobject] 99 | - Finishing debugging. [jkobject] 100 | - Cont. [jkobject] 101 | - Coontinuing. [jkobject] 102 | - Make it more productionalized. [jkobject] 103 | - Improving. [jkobject] 104 | - Continuing adding new mnn version (trying rgular mnn) [jkobject] 105 | - Making more changes. [jkobject] 106 | - Adding too giti. [jkobject] 107 | - Finish debug yay!! [jkobject] 108 | - Finish debug yay!! 
[jkobject] 109 | - Continuing debugs. [jkobject] 110 | - Making demo. [jkobject] 111 | - Coninuing work. [jkobject] 112 | - Redoing. [jkobject] 113 | - Cleanup. [jkobject] 114 | - WIP on celligner python. [jkobject] 115 | - WIP on python's celigner. [jkobject] 116 | - Update README.md. [Jérémie Kalfon] 117 | - Better doc. [Jérémie Kalfon] 118 | - Update to README on multidataset. [Allie Warren] 119 | - Adding code to run Celligner with additional datasets. [Allie Warren] 120 | - Adding statements to clear unused objects to reduce memory. [Allie 121 | Warren] 122 | - Adding code to install required packages and adding imports to 123 | description. [Allie Warren] 124 | - Merge branch 'master' of https://github.com/broadinstitute/celligner. 125 | [Allie Warren] 126 | - Create README. [acwarren] 127 | - Adding plots to output. [Allie Warren] 128 | - Add documentation to helper methods and add test for NAs in data. 129 | [Allie Warren] 130 | - Add manual for methods. [Allie Warren] 131 | - Added error tests and updated documentation. [Allie Warren] 132 | - Creating repository for celligner package. [Allie Warren] 133 | 134 | 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | This is free and unencumbered software released into the public domain. 3 | 4 | Anyone is free to copy, modify, publish, use, compile, sell, or 5 | distribute this software, either in source code form or as a compiled 6 | binary, for any purpose, commercial or non-commercial, and by any 7 | means. 8 | 9 | In jurisdictions that recognize copyright laws, the author or authors 10 | of this software dedicate any and all copyright interest in the 11 | software to the public domain. We make this dedication for the benefit 12 | of the public at large and to the detriment of our heirs and 13 | successors. 
We intend this dedication to be an overt act of 14 | relinquishment in perpetuity of all present and future rights to this 15 | software under copyright law. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 20 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 21 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | For more information, please refer to 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include HISTORY.md 3 | include Containerfile 4 | graft tests 5 | graft celligner 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')") 3 | USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes") 4 | 5 | .PHONY: help 6 | help: ## Show the help. 7 | @echo "Usage: make " 8 | @echo "" 9 | @echo "Targets:" 10 | @fgrep "##" Makefile | fgrep -v fgrep 11 | 12 | 13 | .PHONY: show 14 | show: ## Show the current environment. 15 | @echo "Current environment:" 16 | @if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi 17 | @echo "Running using $(ENV_PREFIX)" 18 | @$(ENV_PREFIX)python -V 19 | @$(ENV_PREFIX)python -m site 20 | 21 | .PHONY: install 22 | install: ## Install the project in dev mode. 23 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 24 | @echo "Don't forget to run 'make virtualenv' if you got errors." 
25 | $(ENV_PREFIX)pip install -e .[test] 26 | 27 | .PHONY: fmt 28 | fmt: ## Format code using black & isort. 29 | $(ENV_PREFIX)isort celligner/ 30 | $(ENV_PREFIX)black -l 79 celligner/ 31 | $(ENV_PREFIX)black -l 79 tests/ 32 | 33 | .PHONY: lint 34 | lint: ## Run pep8, black, mypy linters. 35 | $(ENV_PREFIX)flake8 celligner/ 36 | $(ENV_PREFIX)black -l 79 --check celligner/ 37 | $(ENV_PREFIX)black -l 79 --check tests/ 38 | $(ENV_PREFIX)mypy --ignore-missing-imports celligner/ 39 | 40 | .PHONY: test 41 | test: lint ## Run tests and generate coverage report. 42 | $(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=celligner -l --tb=short --maxfail=1 tests/ 43 | $(ENV_PREFIX)coverage xml 44 | $(ENV_PREFIX)coverage html 45 | 46 | .PHONY: watch 47 | watch: ## Run tests on every change. 48 | ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/ 49 | 50 | .PHONY: clean 51 | clean: ## Clean unused files. 52 | @find ./ -name '*.pyc' -exec rm -f {} \; 53 | @find ./ -name '__pycache__' -exec rm -rf {} \; 54 | @find ./ -name 'Thumbs.db' -exec rm -f {} \; 55 | @find ./ -name '*~' -exec rm -f {} \; 56 | @rm -rf .cache 57 | @rm -rf .pytest_cache 58 | @rm -rf .mypy_cache 59 | @rm -rf build 60 | @rm -rf dist 61 | @rm -rf *.egg-info 62 | @rm -rf htmlcov 63 | @rm -rf .tox/ 64 | @rm -rf docs/_build 65 | 66 | .PHONY: virtualenv 67 | virtualenv: ## Create a virtual environment. 68 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 69 | @echo "creating virtualenv ..." 70 | @rm -rf .venv 71 | @python3 -m venv .venv 72 | @./.venv/bin/pip install -U pip 73 | @./.venv/bin/pip install -e .[test] 74 | @echo 75 | @echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!" 76 | 77 | .PHONY: release 78 | release: ## Create a new tag for release. 79 | @echo "WARNING: This operation will create s version tag and push to github" 80 | @read -p "Version? 
(provide the next x.y.z semver) : " TAG 81 | @echo "creating git tag : $${TAG}" 82 | @git tag $${TAG} 83 | @echo "$${TAG}" > celligner/VERSION 84 | @$(ENV_PREFIX)gitchangelog > HISTORY.md 85 | @git add celligner/VERSION HISTORY.md 86 | @git commit -m "release: version $${TAG} 🚀" 87 | @git push -u origin HEAD --tags 88 | @echo "Github Actions will detect the new tag and release the new version." 89 | 90 | .PHONY: docs 91 | docs: ## Build the documentation. 92 | @echo "building documentation ..." 93 | @jupyter nbconvert --to html Celligner_demo.ipynb --output docs/index.html 94 | @$(ENV_PREFIX)mkdocs gh-deploy 95 | URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL 96 | 97 | .PHONY: switch-to-poetry 98 | switch-to-poetry: ## Switch to poetry package manager. 99 | @echo "Switching to poetry ..." 100 | @if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi 101 | @rm -rf .venv 102 | @poetry init --no-interaction --name=a_flask_test --author=rochacbruno 103 | @echo "" >> pyproject.toml 104 | @echo "[tool.poetry.scripts]" >> pyproject.toml 105 | @echo "celligner = 'celligner.__main__:main'" >> pyproject.toml 106 | @cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done 107 | @cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done 108 | @poetry install --no-interaction 109 | @mkdir -p .github/backup 110 | @mv requirements* .github/backup 111 | @mv setup.py .github/backup 112 | @echo "You have switched to https://python-poetry.org/ package manager." 113 | @echo "Please run 'poetry shell' or 'poetry run celligner'" 114 | 115 | .PHONY: init 116 | init: ## Initialize the project based on an application template. 
117 | @./.github/init.sh 118 | 119 | 120 | # This project has been generated from rochacbruno/python-project-template 121 | # __author__ = 'rochacbruno' 122 | # __repo__ = https://github.com/rochacbruno/python-project-template 123 | # __sponsor__ = https://github.com/sponsors/rochacbruno/ 124 | -------------------------------------------------------------------------------- /R/Celligner_helpers.R: -------------------------------------------------------------------------------- 1 | library(magrittr) 2 | library(tidyverse) 3 | 4 | #' check for NAs in the expression data and remove samples with NAs 5 | #' @name check_NAs 6 | #' 7 | #' @param mat: matrix of gene expression data that is samples by genes 8 | #' @return matrix of gene expression data, removing samples that have NAs 9 | #' @export 10 | #' 11 | check_NAs <- function(mat) { 12 | if(length(which(is.na(rowSums(mat))==T))>0) { 13 | warning("Removing sample(s) due to NAs in the data") 14 | mat <- mat[!is.na(rowSums(mat)),] 15 | } 16 | 17 | return(mat) 18 | } 19 | 20 | #' 21 | #' Differentially expressed genes 22 | #' @name run_lm_stats_limma_group 23 | #' 24 | #' @param mat: Nxp data matrix of N cell lines and p genes 25 | #' @param phenos: N vector of independent variables. 
Can be two-group labels as factors, bools, or can be numeric 26 | #' @param covars: optional Nxk matrix of sample covariates 27 | #' @param weights: optional N vector of precision weights for each data point 28 | #' @param target_type: name of the column variable in the data (default 'Gene') 29 | #' @return table of gene level statistics 30 | #' @description Estimate linear-model stats for a matrix of data with respect to a group of phenotype variables 31 | #' using limma with empirical Bayes moderated F-stats for p-values 32 | #' @export 33 | #' 34 | run_lm_stats_limma_group <- function (mat, phenos, covars = NULL, weights = NULL, target_type = "Gene", 35 | limma_trend = FALSE) 36 | { 37 | require(limma) 38 | require(magrittr) 39 | require(tibble) 40 | require(plyr) 41 | require(dplyr) 42 | udata <- rownames(mat) %>% intersect(rownames(phenos)) 43 | if (!is.null(covars)) { 44 | udata %<>% intersect(rownames(covars)) 45 | } 46 | form <- as.formula(paste("~", paste0(colnames(phenos), collapse = " + "))) 47 | design <- model.matrix(form, data = phenos[udata, , drop = F]) 48 | if (!is.null(covars)) { 49 | covars <- data.frame(covars) 50 | form <- as.formula(paste("~", paste0(colnames(covars), 51 | collapse = " + "))) 52 | Cdesign <- model.matrix(form, data = covars[udata, , 53 | drop = F]) 54 | Cdesign <- Cdesign[, setdiff(colnames(Cdesign), "(Intercept)"), 55 | drop = FALSE] 56 | stopifnot(length(intersect(colnames(Cdesign), colnames(design))) == 57 | 0) 58 | design %<>% cbind(Cdesign) 59 | } 60 | if (!is.null(weights)) { 61 | if (is.matrix(weights)) { 62 | weights <- t(weights[udata, ]) 63 | } 64 | else { 65 | weights <- weights[udata] 66 | } 67 | } 68 | design <- design[, colSums(design) > 2, drop = FALSE] 69 | targ_coefs <- setdiff(colnames(design), "(Intercept)") 70 | fit <- limma::lmFit(t(mat[udata, ]), design, weights = weights) 71 | fit <- limma::eBayes(fit, trend = limma_trend) 72 | targ_coef <- which(colnames(design) %in% targ_coefs) 73 | results <-
limma::topTable(fit, coef = targ_coef, number = Inf, 74 | sort.by = "F", genelist = colnames(mat)) 75 | results %<>% tibble::rownames_to_column(var = target_type) 76 | results %<>% magrittr::set_colnames(revalue(colnames(.), c(AveExpr = "Avg", 77 | F = "F_stat", P.Value = "p.value", adj.P.Val = "q.value"))) %>% 78 | na.omit() %>% dplyr::select(-ProbeID) 79 | return(results) 80 | } 81 | 82 | #' 83 | #' cPCA 84 | #' @name run_cPCA_analysis 85 | #' 86 | #' @param TCGA_dat: sample by genes matrix of scaled expression data 87 | #' @param CCLE_dat: sample by genes matrix of scaled expression data 88 | #' @param tumor_cluster_df: table of sample metadata that includes a column 'seurat_clusters', 89 | #' containing transcriptional clusters in the TCGA data 90 | #' @param CL_cluster_df: table of sample metadata that includes a column 'seurat_clusters', 91 | #' containing transcriptional clusters in the CCLE data 92 | #' @param pc_dims: numbers of cPCs calculated. If set to NULL (default) all cPCs will be calculated, if set to a value 93 | #' then that number of cPCs will be approximated. Values input should be >= 4. 94 | #' @return contrastive principal component object containing cPC vectors and values 95 | #' @description Run contrastive principal components analysis, first removing average cluster expression, to 96 | # estimate the average intra-cluster covariance. If pc_dims = NULL, all cPCs are calculated. Faster cPCA can be run by setting pc_dims to a 97 | # value >=4 and approximating just those cPCs. 
#' @export
#'
run_cPCA_analysis <- function(TCGA_dat, CCLE_dat, tumor_cluster_df, CL_cluster_df, pc_dims=NULL) {
  # per-cluster mean expression (clusters x genes); rows follow the factor level order
  tumor_clust_avgs <- get_cluster_averages(TCGA_dat, tumor_cluster_df)
  CL_clust_avgs <- get_cluster_averages(CCLE_dat, CL_cluster_df)

  # subtract each sample's own cluster mean. Indexing with the factor uses its
  # integer codes, which line up with the level-ordered rows built above.
  TCGA_subtype_ms <- TCGA_dat - tumor_clust_avgs[tumor_cluster_df$seurat_clusters, , drop = FALSE]
  CCLE_subtype_ms <- CCLE_dat - CL_clust_avgs[CL_cluster_df$seurat_clusters, , drop = FALSE]

  # gene x gene covariance of the cluster-centered data
  TCGA_cov <- cov(TCGA_subtype_ms)
  CCLE_cov <- cov(CCLE_subtype_ms)

  # decompose the covariance difference: approximate when pc_dims is given, full otherwise
  if(!is.null(pc_dims)) {
    cov_diff_eig <- irlba::prcomp_irlba(TCGA_cov - CCLE_cov, n = pc_dims)
  } else {
    cov_diff_eig <- eigen(TCGA_cov - CCLE_cov)
  }
  return(cov_diff_eig)
}

#'
#' calculate the average expression per cluster
#' @name get_cluster_averages
#'
#' @param mat: sample by genes matrix of expression data
#' @param cluster_df: table of sample metadata that includes a column 'seurat_clusters',
#' containing transcriptional clusters
#' @return average cluster expression: a matrix with one row per level of
#' 'seurat_clusters' (row names are the levels) and one column per gene
#' @description calculate the average expression per cluster. Clusters holding a
#' single sample are supported.
#' @export
#'
get_cluster_averages <- function(mat, cluster_df) {
  clusters <- cluster_df$seurat_clusters
  cluster_levels <- levels(clusters)
  clust_avgs <- matrix(NA, nrow = nlevels(clusters), ncol = ncol(mat),
                       dimnames = list(cluster_levels, colnames(mat)))
  for (ii in cluster_levels) {
    # drop = FALSE keeps a one-sample cluster as a 1 x genes matrix so colMeans() works
    clust_avgs[ii,] <- colMeans(mat[clusters == ii, , drop = FALSE], na.rm=T)
  }
  return(clust_avgs)
}

# MNN --------------------------------------------------------------------

#'
#' MNN
#' @name modified_mnnCorrect
#'
#' @param ref_mat: matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the standard Celligner pipeline this the cell line data.
#' @param targ_mat: matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
#' In the standard Celligner pipeline this the tumor data.
#' @param k1: the number of neighbors within the data being corrected (in standard pipeline the tumor data). By default this is 20.
#' @param k2: the number of neighbors within the reference data (in standard pipeline the cell line data). By default this is 20.
#' @param ndist: A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
#' By default is 3.
#' @param subset_genes: the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially
#' expressed genes is usually passed here. By default is NULL, meaning all genes are used
#' @return MNN object, containing the targ_mat corrected data and the mutual nearest neighbor pairs.
#' @description Mutual nearest neighbors correction. Modification of the scran::fastMNN (https://github.com/MarioniLab/scran).
#' Allows for separate k values per dataset, and simplifies some of the IO and doesn't use PCA reduction
#' @export
#'
modified_mnnCorrect <- function(ref_mat, targ_mat, k1 = 20, k2 = 20,
                                ndist = 3, subset_genes = NULL) {
  # default to every gene (column) of the reference matrix
  if (is.null(subset_genes)) {
    subset_genes <- colnames(ref_mat)
  }

  # the reference matrix is passed first, so it receives k2 and the target receives k1
  sets <- batchelor::findMutualNN(ref_mat[, subset_genes],
                                  targ_mat[, subset_genes],
                                  k1 = k2, k2 = k1,
                                  BPPARAM = BiocParallel::SerialParam())

  # table of MNN pairs expressed as sample IDs instead of row indices
  mnn_pairs <- as.data.frame(sets)
  mnn_pairs <- dplyr::mutate(mnn_pairs,
                             ref_ID = rownames(ref_mat)[first],
                             targ_ID = rownames(targ_mat)[second],
                             pair = seq(nrow(mnn_pairs)))
  mnn_pairs <- dplyr::select(mnn_pairs, -first, -second)

  # overall batch vector: mean of the per-sample averaged correction vectors
  initial_corr <- .average_correction(ref_mat, sets$first, targ_mat, sets$second)
  overall.batch <- colMeans(initial_corr$averaged)

  # remove variation along the overall batch vector within each dataset
  ref_mat <- .center_along_batch_vector(ref_mat, overall.batch)
  targ_mat <- .center_along_batch_vector(targ_mat, overall.batch)

  # recompute the correction vectors on the centered data and apply them
  final_corr <- .average_correction(ref_mat, sets$first, targ_mat, sets$second)
  targ_mat <- .tricube_weighted_correction(targ_mat, final_corr$averaged, final_corr$second,
                                           k = k2, ndist = ndist, subset_genes = subset_genes,
                                           BPPARAM = BiocParallel::SerialParam())

  return(list(corrected = targ_mat,
              pairs = mnn_pairs))
}

#'
#' calculate the average correction vector
#' @name .average_correction
#'
#' @param refdata: matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the standard Celligner pipeline this the cell line data.
#' @param mnn1: mnn1 pairs (row indices into refdata)
#' @param curdata: matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
#' In the standard Celligner pipeline this the tumor data.
#' @param mnn2: mnn2 pairs (row indices into curdata)
#' @return list with 'averaged' (one averaged correction vector per distinct mnn2 sample) and
#' 'second' (the corresponding curdata row indices)
#' @description Computes correction vectors for each MNN pair, and then averages them for each MNN-involved cell in the second batch.
#' Copied from dev version of scran (2018-10-28), with slight modifications as noted https://github.com/MarioniLab/scran
#' @export
#'
.average_correction <- function(refdata, mnn1, curdata, mnn2)
{
  # one difference vector (reference minus target) per MNN pair
  pair_diffs <- refdata[mnn1, , drop = FALSE] - curdata[mnn2, , drop = FALSE]
  # sum the differences per target sample, then divide by that sample's pair count
  summed <- rowsum(pair_diffs, mnn2)
  npairs <- table(mnn2)
  stopifnot(identical(names(npairs), rownames(summed)))
  averaged <- unname(summed) / as.vector(npairs)
  list(averaged = averaged, second = as.integer(names(npairs)))
}


#'
#' centers samples within each batch
#' @name .center_along_batch_vector
#'
#' @param mat: matrix of samples by genes
#' @param batch.vec: batch vector
#' @return the input matrix with every sample shifted to the batch center along the batch vector
#' @description Projecting along the batch vector, and shifting all samples to the center within each batch.
#' This removes any variation along the overall batch vector within each matrix.
#' @export
#'
.center_along_batch_vector <- function(mat, batch.vec)
{
  # unit-length direction of the batch effect
  unit_vec <- batch.vec / sqrt(sum(batch.vec^2))
  # position of every sample along that direction
  positions <- as.vector(mat %*% unit_vec)
  # move each sample so its position equals the mean position
  shift <- mean(positions) - positions
  mat <- mat + outer(shift, unit_vec, FUN = "*")
  return(mat)
}


#' tricube-weighted correction
#' @name .tricube_weighted_correction
#'
#' @param curdata: target matrix of samples by genes
#' @param correction: corrected vector
#' @param in.mnn: mnn pairs
#' @param k: k values, default 20
#' @param ndist: A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
#' By default is 3.
#' @param subset_genes: genes used to identify mutual nearest neighbors
#' @param BNPARAM: default NULL
#' @param BPPARAM: default BiocParallel::SerialParam()
#' @return MNN corrected data
#' @description Computing tricube-weighted correction vectors for individual samples,
#' using the nearest neighbouring samples involved in MNN pairs.
#' Modified to use FNN rather than queryKNN for nearest neighbor finding
#' @export
#' @importFrom BiocNeighbors queryKNN
#' @importFrom BiocParallel SerialParam
#'
.tricube_weighted_correction <- function(curdata, correction, in.mnn, k=20, ndist=3, subset_genes, BNPARAM=NULL, BPPARAM=BiocParallel::SerialParam())
{
  # samples of curdata that take part in at least one MNN pair
  mnn_samples <- curdata[in.mnn, , drop = FALSE]
  # never ask for more neighbours than there are MNN samples
  n_neighbors <- min(k, nrow(mnn_samples))

  # nearest MNN samples of every sample, searched on the selected genes only
  # (FNN::get.knnx replaces the queryKNN call used upstream)
  neighbors <- FNN::get.knnx(mnn_samples[, subset_genes],
                             query = curdata[, subset_genes],
                             k = n_neighbors)

  # distance-weighted average of the neighbours' correction vectors
  smoothed <- .compute_tricube_average(correction, neighbors$nn.index, neighbors$nn.dist, ndist = ndist)
  curdata + smoothed
}

#'
#' compute tricube averages
#' @name .compute_tricube_average
#'
#' @param vals: correction vector
#' @param indices: nxk matrix for the nearest neighbor indices
#' @param distances: nxk matrix for the nearest neighbor Euclidean distances
#' @param bandwidth: Is set at 'ndist' times the median distance, if not specified.
#' @param ndist: By default is 3.
#' @description Centralized function to compute tricube averages.
#' @export
#'
.compute_tricube_average <- function(vals, indices, distances, bandwidth=NULL, ndist=3)
{
  n_neighbors <- ncol(indices)

  # default bandwidth: 'ndist' times the distance held in the middle neighbour column
  if (is.null(bandwidth)) {
    bandwidth <- distances[, ceiling(n_neighbors/2L)] * ndist
  }
  # floor the bandwidth so the division below never hits zero
  bandwidth <- pmax(1e-8, bandwidth)

  # tricube kernel on distances scaled by the bandwidth, capped at 1
  scaled <- distances/bandwidth
  scaled[scaled > 1] <- 1 # don't use pmin(), as this destroys dimensions.
  kernel <- (1 - scaled^3)^3
  weights <- kernel/rowSums(kernel)

  # weighted sum of the neighbours' values, one neighbour column at a time
  total <- 0
  for (j in seq_len(n_neighbors)) {
    total <- total + vals[indices[, j], , drop = FALSE] * weights[, j]
  }

  # with no neighbour columns the accumulator is still the scalar 0
  if (!is.null(dim(total))) {
    return(total)
  }
  matrix(0, nrow(vals), ncol(vals))
}
--------------------------------------------------------------------------------
/R/Celligner_methods.R:
--------------------------------------------------------------------------------
library(magrittr)
library(tidyverse)


#' method to load in tumor and cell line expression data and annotations
#' @name load_data
#'
#' @param cell_line_data_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
#' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data
#' @param cell_line_data_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
#' if cell_line_taiga=FALSE, then the name of the file of cell line expression data
#' @param cell_line_version: parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
#' @param cell_line_taiga: if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder.
#' A local file is read as a CSV whose sample IDs are in the column 'X1' (or, failing that, the first column).
#' @param cell_line_ann_name: if cell_line_ann_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
#' if cell_line_ann_taiga=FALSE, then the file path to the local folder containing the cell line annotations
#' @param cell_line_ann_file: if cell_line_ann_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
#' if cell_line_ann_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
#' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
#' @param cell_line_ann_version: parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL
#' @param cell_line_ann_taiga: if TRUE then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder
#' @param tumor_data_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data,
#' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.
#' @param tumor_data_file: if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data,
#' if tumor_taiga=FALSE, then the name of the file the tumor expression data. A local file is read as a TSV with a 'Gene' column
#' and is transposed to samples by genes.
#' @param tumor_version: parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL
#' @param tumor_taiga: if TRUE then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder
#' @param tumor_ann_name: if tumor_ann_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations,
#' if tumor_ann_taiga=FALSE, then the file path to the local folder containing the tumor annotations
#' @param tumor_ann_file: if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations,
#' if tumor_ann_taiga=FALSE, then the name of the file the tumor annotations.
#' If pulling from taiga, assumes that the file is the already create Celligner info file used in the Celligner manuscript,
#' if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.
#' @param tumor_ann_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL
#' @param tumor_ann_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder
#' @param additional_annotations_name: if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations,
#' if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations
#' for the cell lines. If null, assumes there are no additional annotations.
#' @param additional_annotations_file: if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations,
#' if additional_annotations_taiga=FALSE, then the name of the file the additional annotations. If null, assumes there are
#' no additional annotations.
#' @param additional_annotations_version: parameter to specify the version to pull from taiga for the additional annotations, default set to NULL
#' @param additional_annotations_taiga: if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder
#' @param hgnc_data_name: if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file: if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the name of the file the HGNC gene annotations
#' @param hgnc_version: parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
#' @param hgnc_taiga: if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
#'
#' @importFrom magrittr "%>%"
#'
#' @description load expression and annotation files for cell lines and tumors. Stops when the HGNC table or either
#' expression matrix cannot be found; falls back to default (all-NA) annotations, with a warning, when an annotation
#' table is missing or lacks the expected columns.
#' @return dat object with cell line and tumor expression data and annotations
#' (list with TCGA_mat, TCGA_ann, CCLE_mat and CCLE_ann)
#' @export
load_data <- function(cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
                      cell_line_ann_name, cell_line_ann_file, cell_line_ann_version, cell_line_ann_taiga,
                      tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
                      tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
                      additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
                      hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga) {
  # ---- HGNC gene annotations -------------------------------------------------
  if (hgnc_taiga) {
    hgnc.complete.set <- taigr::load.from.taiga(data.name = hgnc_data_name, data.version = hgnc_version, data.file = hgnc_data_file)
    if (is.null(hgnc.complete.set)) {
      stop("HGNC gene file input does not exist on taiga")
    }
  } else {
    hgnc_path <- file.path(hgnc_data_name, hgnc_data_file)
    if (!file.exists(hgnc_path)) {
      stop("HGNC gene file input does not exist")
    }
    hgnc.complete.set <- as.data.frame(data.table::fread(hgnc_path))
  }

  if (!all(c("symbol", "ensembl_gene_id", "locus_group") %in% colnames(hgnc.complete.set))) {
    stop("HGNC gene file does not contain expected columns (symbol, ensembl_gene_id, & locus_group)")
  }

  # ---- tumor expression (samples x genes) -------------------------------------
  if (tumor_taiga) {
    TCGA_mat <- taigr::load.from.taiga(data.name = tumor_data_name, data.version = tumor_version, data.file = tumor_data_file)
    if (is.null(TCGA_mat)) {
      stop("tumor expression data file input does not exist on taiga")
    }
  } else {
    tumor_path <- file.path(tumor_data_name, tumor_data_file)
    if (!file.exists(tumor_path)) {
      stop("tumor expression data file input does not exist")
    }
    # the local file is genes x samples with a 'Gene' column; transpose to samples x genes
    tumor_df <- as.data.frame(readr::read_tsv(tumor_path))
    TCGA_mat <- t(as.matrix(tibble::column_to_rownames(tumor_df, "Gene")))
  }

  # ---- cell line expression (samples x genes) ---------------------------------
  if (cell_line_taiga) {
    CCLE_mat <- taigr::load.from.taiga(data.name = cell_line_data_name, data.version = cell_line_version, data.file = cell_line_data_file)
    if (is.null(CCLE_mat)) {
      stop("cell line expression data file input does not exist on taiga")
    }
  } else {
    cl_path <- file.path(cell_line_data_name, cell_line_data_file)
    if (!file.exists(cl_path)) {
      stop("cell line data file input does not exist")
    }
    cl_df <- as.data.frame(readr::read_csv(cl_path))
    # the unnamed sample-ID column is called 'X1' by older readr releases; fall back to
    # the first column when that name is absent (e.g. newer releases name it differently)
    id_col <- if ("X1" %in% colnames(cl_df)) "X1" else colnames(cl_df)[1]
    CCLE_mat <- as.matrix(tibble::column_to_rownames(cl_df, id_col))
  }

  # subset gene names to just ensembl IDs: keeps the text between parentheses in each
  # column name (names without parentheses become NA)
  # add test for this
  colnames(CCLE_mat) <- stringr::str_match(colnames(CCLE_mat), "\\((.+)\\)")[, 2]

  # convert tumor gene names to ensembl IDs, if needed
  if (length(grep("ENS", colnames(TCGA_mat))) != ncol(TCGA_mat)) {
    print("converting TCGA column names from HGNC ids to ensembl ids")
    common_genes <- dplyr::intersect(colnames(TCGA_mat), hgnc.complete.set$symbol)
    if (length(common_genes) < 10000) {
      message(sprintf("only %s genes overlap between the columns of the tumor data and the hgnc dataset", length(common_genes)))
      warning("low overlap of genes in tumor data and gene symbol, either tumor data or gene file may not be in correct format")
    }
    TCGA_mat <- TCGA_mat[, common_genes, drop = FALSE]
    # one HGNC row per symbol, ordered like the tumor columns, so IDs can be swapped in place
    hgnc.complete.set <- dplyr::filter(hgnc.complete.set, symbol %in% common_genes)
    hgnc.complete.set <- hgnc.complete.set[!duplicated(hgnc.complete.set$symbol), ]
    rownames(hgnc.complete.set) <- hgnc.complete.set$symbol
    hgnc.complete.set <- hgnc.complete.set[common_genes, ]
    colnames(TCGA_mat) <- hgnc.complete.set$ensembl_gene_id
  }

  # ---- cell line annotations ---------------------------------------------------
  if (cell_line_ann_taiga) {
    CCLE_ann <- taigr::load.from.taiga(data.name = cell_line_ann_name, data.version = cell_line_ann_version, data.file = cell_line_ann_file)
    column_names <- c("arxspan_id", "lineage", "lineage_subtype")
    if ("DepMap_ID" %in% colnames(CCLE_ann)) {
      column_names[1] <- "DepMap_ID"
    }
    if (is.null(CCLE_ann)) {
      warning("cell line annotation file does not exist on taiga, creating default annotations")
      CCLE_ann <- .load_data_default_annotations(rownames(CCLE_mat), "CL")
    } else if (!all(column_names %in% colnames(CCLE_ann))) {
      warning("cell line annotation file does not contain expected columns (arxspan_id or DepMap_ID, lineage, & lineage_subtype), creating default annotation file")
      CCLE_ann <- .load_data_default_annotations(rownames(CCLE_mat), "CL")
    } else {
      CCLE_ann <- CCLE_ann[, column_names]
      colnames(CCLE_ann) <- c("sampleID", "lineage", "subtype")
      CCLE_ann$type <- "CL"
    }
  } else {
    cl_ann_path <- file.path(cell_line_ann_name, cell_line_ann_file)
    if (file.exists(cl_ann_path)) {
      CCLE_ann <- as.data.frame(data.table::fread(cl_ann_path))
    } else {
      warning("cell line annotation file does not exist, creating default annotations")
      CCLE_ann <- .load_data_default_annotations(rownames(CCLE_mat), "CL")
    }
  }

  if (!all(c("sampleID", "lineage", "subtype", "type") %in% colnames(CCLE_ann))) {
    warning("cell line annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations")
    CCLE_ann <- .load_data_default_annotations(rownames(CCLE_mat), "CL")
  }

  # ---- tumor annotations ---------------------------------------------------------
  if (tumor_ann_taiga) {
    TCGA_ann <- taigr::load.from.taiga(data.name = tumor_ann_name, data.version = tumor_ann_version, data.file = tumor_ann_file)
    tumor_column_names <- c("sampleID", "lineage", "subtype")
    if (is.null(TCGA_ann)) {
      warning("tumor annotation file does not exist on taiga, creating default annotations")
      TCGA_ann <- .load_data_default_annotations(rownames(TCGA_mat), "tumor")
    } else if (!all(tumor_column_names %in% colnames(TCGA_ann))) {
      warning("tumor annotation file does not contain expected columns (sampleID, lineage, & subtype), creating default tumor annotations")
      TCGA_ann <- .load_data_default_annotations(rownames(TCGA_mat), "tumor")
    } else {
      TCGA_ann <- TCGA_ann[, tumor_column_names]
      TCGA_ann$type <- "tumor"
    }
  } else {
    tumor_ann_path <- file.path(tumor_ann_name, tumor_ann_file)
    if (file.exists(tumor_ann_path)) {
      TCGA_ann <- as.data.frame(data.table::fread(tumor_ann_path))
    } else {
      warning("tumor annotation file does not exist, creating default annotations")
      TCGA_ann <- .load_data_default_annotations(rownames(TCGA_mat), "tumor")
    }
    if (!all(c("sampleID", "lineage", "subtype", "type") %in% colnames(TCGA_ann))) {
      warning("tumor annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations")
      TCGA_ann <- .load_data_default_annotations(rownames(TCGA_mat), "tumor")
    }
  }

  # ---- optional finer-grained cell line subtypes -----------------------------------
  if (!(is.null(additional_annotations_name) || is.null(additional_annotations_file))) {
    add_ann <- NULL
    if (additional_annotations_taiga) {
      add_ann <- taigr::load.from.taiga(data.name = additional_annotations_name, data.version = additional_annotations_version, data.file = additional_annotations_file)
      if (is.null(add_ann)) {
        warning("additional annotation file does not exist on taiga, no additional annotations used")
      }
    } else {
      add_ann_path <- file.path(additional_annotations_name, additional_annotations_file)
      if (file.exists(add_ann_path)) {
        add_ann <- as.data.frame(data.table::fread(add_ann_path))
      } else {
        warning("additional annotation file does not exist, no additional annotations used")
      }
    }

    if (!is.null(add_ann)) {
      if (!all(c("sampleID", "subtype") %in% colnames(add_ann))) {
        warning("additional annotation file does not contain expected columns (sampleID & subtype), no additional annotations used")
      } else {
        # overwrite the subtype of every cell line that also appears in the additional table
        shared_samples <- intersect(CCLE_ann$sampleID, add_ann$sampleID)
        CCLE_ann[match(shared_samples, CCLE_ann$sampleID), "subtype"] <- add_ann[match(shared_samples, add_ann$sampleID), "subtype"]
      }
    }
  }

  # check for NAs
  TCGA_mat <- check_NAs(TCGA_mat)
  CCLE_mat <- check_NAs(CCLE_mat)

  # subset to samples in both the annotation and gene expression matrices, and match ordering between them
  common_cls <- intersect(rownames(CCLE_mat), CCLE_ann$sampleID)
  missing_cls <- setdiff(rownames(CCLE_mat), CCLE_ann$sampleID)
  if (length(missing_cls) > 0) {
    message(sprintf("Missing annotations for these cell lines: %s", paste(missing_cls, collapse = ", ")))
  }
  CCLE_mat <- CCLE_mat[common_cls, , drop = FALSE]
  CCLE_ann <- CCLE_ann[match(common_cls, CCLE_ann$sampleID), ]

  common_tumors <- intersect(rownames(TCGA_mat), TCGA_ann$sampleID)
  missing_tumors <- setdiff(rownames(TCGA_mat), common_tumors)
  if (length(missing_tumors) > 0) {
    message(sprintf("Missing annotations for these tumors: %s", paste(missing_tumors, collapse = ", ")))
  }
  TCGA_mat <- TCGA_mat[common_tumors, , drop = FALSE]
  TCGA_ann <- TCGA_ann[match(common_tumors, TCGA_ann$sampleID), ]

  # subset genes to functional genes shared by both matrices
  func_genes <- dplyr::filter(hgnc.complete.set, !locus_group %in% c("non-coding RNA", "pseudogene"))$ensembl_gene_id
  genes_used <- intersect(colnames(TCGA_mat), colnames(CCLE_mat))
  genes_used <- intersect(genes_used, func_genes)

  TCGA_mat <- TCGA_mat[, genes_used, drop = FALSE]
  CCLE_mat <- CCLE_mat[, genes_used, drop = FALSE]

  return(list(TCGA_mat = TCGA_mat, TCGA_ann = TCGA_ann, CCLE_mat = CCLE_mat, CCLE_ann = CCLE_ann))
}

# Private helper of load_data: placeholder annotation table (lineage and subtype
# set to NA) for the given sample IDs, tagged with the given type ("CL" or "tumor").
.load_data_default_annotations <- function(sample_ids, type) {
  data.frame(
    sampleID = sample_ids,
    lineage = NA,
    subtype = NA,
    type = type
  )
}

#' Method to calculate gene average expression and variance for an expression matrix
@name calc_gene_stats
#'
#' @param dat: data object containing tumor and cell line expression data and annotations produced by running load_data
#' @param hgnc_data_name: if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file: if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the name of the file containing the HGNC gene annotations
#' @param hgnc_version: parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
#' @param hgnc_taiga: if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
#'
#' @description calculate the per-gene mean and standard deviation of expression in the tumor and
#' cell line matrices, restricted to the genes present in both matrices
#' @return data.frame with one row per shared gene (in the order the genes appear in dat$TCGA_mat) and the
#' columns Gene, Symbol, Tumor_SD, CCLE_SD, Tumor_mean, CCLE_mean and max_SD. Symbol is NA for genes that
#' are not found in the HGNC table.
#' @export
calc_gene_stats <- function(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga) {
  common_genes <- intersect(colnames(dat$TCGA_mat), colnames(dat$CCLE_mat))

  # load the HGNC table, either from taiga or from a local file
  if (hgnc_taiga) {
    hgnc.complete.set <- taigr::load.from.taiga(data.name = hgnc_data_name, data.version = hgnc_version, data.file = hgnc_data_file)
    if (is.null(hgnc.complete.set)) {
      stop("HGNC gene file input does not exist on taiga")
    }
  } else {
    hgnc_path <- file.path(hgnc_data_name, hgnc_data_file)
    if (!file.exists(hgnc_path)) {
      stop("HGNC gene file input does not exist")
    }
    hgnc.complete.set <- as.data.frame(data.table::fread(hgnc_path))
  }

  if (!all(c("symbol", "ensembl_gene_id", "locus_group") %in% colnames(hgnc.complete.set))) {
    stop("HGNC gene file does not contain expected columns (symbol, ensembl_gene_id, & locus_group)")
  }

  # restrict both matrices to the shared genes so every statistic below lines up with `common_genes`,
  # even if the caller did not subset the matrices beforehand
  tumor_mat <- dat$TCGA_mat[, common_genes, drop = FALSE]
  cl_mat <- dat$CCLE_mat[, common_genes, drop = FALSE]

  # column-wise SD; vapply keeps the result a numeric vector even when there are no shared genes
  col_sd <- function(mat) {
    vapply(seq_len(ncol(mat)), function(j) stats::sd(mat[, j], na.rm = TRUE), numeric(1))
  }
  tumor_sd <- col_sd(tumor_mat)
  cl_sd <- col_sd(cl_mat)

  # match() returns the first hit, i.e. the first HGNC row for a duplicated ensembl ID;
  # genes absent from HGNC keep their statistics and simply get an NA symbol
  gene_symbols <- hgnc.complete.set$symbol[match(common_genes, hgnc.complete.set$ensembl_gene_id)]

  gene_stats <- data.frame(
    Gene = common_genes,
    Symbol = gene_symbols,
    Tumor_SD = tumor_sd,
    CCLE_SD = cl_sd,
    Tumor_mean = unname(colMeans(tumor_mat, na.rm = TRUE)),
    CCLE_mean = unname(colMeans(cl_mat, na.rm = TRUE)),
    max_SD = pmax(tumor_sd, cl_sd, na.rm = TRUE), # larger of the two SDs per gene
    stringsAsFactors = FALSE
  )
  rownames(gene_stats) <- NULL

  return(gene_stats)
}


#' Method to create seurat objects given an expression matrix and annotation table
#' @name create_Seurat_object
#'
#' @param exp_mat: matrix of samples by genes, where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.
#' @param ann: matrix of sample annotations. Expects column 'sampleID' which matches the rownames of exp_mat.
#' @param type: optional parameter, string specifying the data type of the current data (ex. 'tumor'), which is added to the annotation matrix.
#' @description create Seurat object of expression data and annotations and run dimensionality reduction.
#' Dimensionality reductions will be run with the parameters (n_PC_dims, umap_n_neighbors, umap_min_dist, distance_metric) specified in celligner_global.
#' @return Seurat object with scaled expression data and annotations stored in meta.data
#' @export
#'
create_Seurat_object <- function(exp_mat, ann, type = NULL) {
  # Seurat wants features (genes) in rows, hence the transpose; the annotation rows are keyed by sampleID
  sample_meta <- magrittr::set_rownames(ann, ann$sampleID)
  seu_obj <- Seurat::CreateSeuratObject(t(exp_mat),
    min.cells = 0,
    min.features = 0,
    meta.data = sample_meta
  )
  if (!is.null(type)) {
    seu_obj@meta.data$type <- type
  }

  all_genes <- rownames(Seurat::GetAssayData(seu_obj))

  # mean center the data, important for PCA
  seu_obj <- Seurat::ScaleData(seu_obj, features = all_genes, do.scale = FALSE)

  # plain assignment instead of magrittr's `%<>%`, which the package namespace does not import
  seu_obj <- Seurat::RunPCA(seu_obj,
    assay = "RNA",
    features = all_genes,
    npcs = celligner_global$n_PC_dims, verbose = FALSE
  )

  seu_obj <- Seurat::RunUMAP(seu_obj,
    assay = "RNA", dims = 1:celligner_global$n_PC_dims,
    reduction = "pca",
    n.neighbors = celligner_global$umap_n_neighbors,
    min.dist = celligner_global$umap_min_dist,
    metric = celligner_global$distance_metric, verbose = FALSE
  )

  return(seu_obj)
}

#' Method to take in a Seurat object and run default Seurat clustering algorithm
#' @name cluster_data
#'
#' @param seu_obj: seurat object containing expression data and sample annotations.
#' Expects PCA for the seurat object has already been calculated.
#' @description cluster data in seurat object, using default Seurat clustering method.
Clusters data
#' within PCA space using the number of dimensions provided in celligner_global$n_PC_dims (default is 70)
#'
#' @return Seurat object with cluster annotations
#' @export
#'
cluster_data <- function(seu_obj) {
  # build the neighbor graph on the PCA embedding
  seu_obj <- Seurat::FindNeighbors(seu_obj,
    reduction = "pca",
    dims = 1:celligner_global$n_PC_dims,
    k.param = 20,
    force.recalc = TRUE,
    verbose = FALSE
  )

  # plain assignment instead of magrittr's `%<>%`, which the package namespace does not import
  seu_obj <- Seurat::FindClusters(seu_obj,
    reduction = "pca",
    resolution = celligner_global$mod_clust_res
  )

  # keep a copy of the Seurat cluster assignment under the column name 'cluster'
  seu_obj@meta.data$cluster <- seu_obj@meta.data$seurat_clusters

  return(seu_obj)
}

#' Method to find genes that are differentially expressed between clusters within the expression data
#' @name find_differentially_expressed_genes
#'
#' @param seu_obj: seurat object containing expression data and sample annotations. Expects data in the Seurat object
#' slot scale.data and a column 'seurat_clusters' within the meta.data of the Seurat object.
#' @description find genes that are differentially expressed between clusters within the expression data
#'
#' @return table with gene level stats: a column 'Gene' and a column 'gene_stat' (the limma F statistic when there
#' are more than two clusters, the absolute t statistic when there are exactly two, NA otherwise)
#' @export
#'
find_differentially_expressed_genes <- function(seu_obj) {
  # fetch the scaled matrix (genes x samples) once and reuse it below
  scaled_data <- Seurat::GetAssayData(seu_obj, assay = "RNA", slot = "scale.data")
  if (nrow(scaled_data) == 0) {
    stop("Seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }
  if (!"seurat_clusters" %in% colnames(seu_obj@meta.data)) {
    stop("Seurat object doesn't contain the column 'seurat_clusters', run 'cluster_data' first")
  }

  # always read the column that was validated above ('cluster' is only a copy made by cluster_data)
  clusters <- seu_obj@meta.data$seurat_clusters
  n_clusts <- nlevels(clusters)

  if (n_clusts > 2) {
    cur_DE_genes <- run_lm_stats_limma_group(
      t(scaled_data),
      seu_obj@meta.data %>% dplyr::select(seurat_clusters),
      limma_trend = TRUE
    ) %>%
      dplyr::select(Gene, gene_stat = F_stat)
  } else if (n_clusts == 2) {
    # NOTE(review): run_lm_stats_limma is not defined in this part of the file - presumably a package helper, confirm
    cur_DE_genes <- run_lm_stats_limma(t(scaled_data),
      clusters,
      limma_trend = TRUE
    ) %>%
      dplyr::mutate(gene_stat = abs(t_stat)) %>%
      dplyr::select(Gene, gene_stat)
  } else {
    # fewer than two clusters: nothing to compare. Genes are the rows of the scaled matrix
    # (the columns of a Seurat object are the samples).
    cur_DE_genes <- data.frame(Gene = rownames(scaled_data), gene_stat = NA_real_, stringsAsFactors = FALSE)
  }

  return(cur_DE_genes)
}

#' Method to run contrastive principal components analysis
#' @name run_cPCA
#'
#' @param TCGA_obj: seurat object containing expression data and sample annotations, usually the tumor data
#' @param CCLE_obj: seurat object containing expression data and sample annotations, usually the cell line data
#' @param pc_dims: the number of cPCs calculated.
If set to null then all cPCs will be calculated (this is quite slow), but if set to
#' some value >=4 then an approximate cPCA will be calculated, which just calculates the input number of contrastive principal components,
#' which is quicker.
#' @description run contrastive principal components analysis.
#' Set pc_dims to a value >= 4 to run fast cPCA by just calculating the top contrastive principal components
#'
#' @return object containing cPC vectors and values
#' @export
#'
run_cPCA <- function(TCGA_obj, CCLE_obj, pc_dims = NULL) {
  # both objects must already carry a scaled expression matrix (genes x samples)
  tumor_scaled <- Seurat::GetAssayData(TCGA_obj, assay = "RNA", slot = "scale.data")
  if (nrow(tumor_scaled) == 0) {
    stop("TCGA seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }

  cl_scaled <- Seurat::GetAssayData(CCLE_obj, assay = "RNA", slot = "scale.data")
  if (nrow(cl_scaled) == 0) {
    stop("CCLE seurat object doesn't have expression data at scale.data, run 'create_Seurat_object' first")
  }

  # the analysis is handed samples x genes matrices, hence the transposes
  run_cPCA_analysis(
    t(tumor_scaled),
    t(cl_scaled),
    TCGA_obj@meta.data,
    CCLE_obj@meta.data,
    pc_dims = pc_dims
  )
}

#' Method to run mutual nearest neighbors batch correction
#' @name run_MNN
#'
#' @param CCLE_cor: matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment.
#' In the default Celligner pipeline this is the cell line data.
#' @param TCGA_cor: matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data.
#' In the default Celligner pipeline this is the tumor data.
#' @param k1: the number of neighbors within the data being corrected (by default the tumor data).
By default this
#' pulls from the celligner_global parameter mnn_k_tumor, which by default is 50.
#' @param k2: the number of neighbors within the reference data (by default the cell line data). By default this
#' pulls from the celligner_global parameter mnn_k_CL, which by default is 5.
#' @param ndist: A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors.
#' By default this pulls from the celligner_global parameter mnn_ndist, which by default is 3.
#' @param subset_genes: the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially
#' expressed genes is usually passed here.
#' @description run MNN batch correction to align data to a reference dataset
#'
#' @return mutual nearest neighbors object with corrected data for the second dataset provided as input and the mutual nearest neighbors
#' @export
#'
run_MNN <- function(CCLE_cor, TCGA_cor, k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
                    subset_genes) {
  # thin wrapper: the reference data goes first, the data to be corrected second
  modified_mnnCorrect(
    CCLE_cor,
    TCGA_cor,
    k1 = k1,
    k2 = k2,
    ndist = ndist,
    subset_genes = subset_genes
  )
}

#' Method to calculate the correlation between cell lines and tumor in the Celligner aligned data
#' @name calc_tumor_CL_cor
#'
#' @param Celligner_aligned_data: Celligner aligned data matrix of samples (cell lines and tumors) by genes
#' @param Celligner_info: annotation file of cell line and tumor samples with a column 'type' marking samples as either
#' cell lines or tumors and a column 'sampleID' that matches the row names of Celligner_aligned_data
#' @description calculate the correlation between cell line and tumor samples in the Celligner aligned data
#'
#' @return matrix of
correlations that is tumors by cell lines
#' @export
#'
calc_tumor_CL_cor <- function(Celligner_aligned_data, Celligner_info) {
  # samples are split by the 'type' column: "tumor" for tumors, "CL" for cell lines;
  # which() drops rows whose type is NA
  sample_ids <- as.character(Celligner_info$sampleID)
  tumor_samples <- sample_ids[which(Celligner_info$type == "tumor")]
  cl_samples <- sample_ids[which(Celligner_info$type == "CL")]

  # drop = FALSE keeps a single tumor / cell line as a one-row matrix, so after the transpose
  # the samples are always the columns that cor() compares
  tumor_CL_cor <- stats::cor(
    t(Celligner_aligned_data[tumor_samples, , drop = FALSE]),
    t(Celligner_aligned_data[cl_samples, , drop = FALSE]),
    use = "pairwise.complete.obs"
  )

  return(tumor_CL_cor)
}


#' All methods to run Celligner and save the output, if desired
#' @name run_Celligner
#'
#' @param cell_line_data_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data,
#' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data
#' @param cell_line_data_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data,
#' if cell_line_taiga=FALSE, then the name of the file of cell line expression data
#' @param cell_line_version: parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL
#' @param cell_line_taiga: if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder
#' @param cell_line_ann_name: if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations,
#' if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations
#' @param cell_line_ann_file: if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations,
#' if cell_line_taiga=FALSE, then the name of the file of cell line annotations.
If pulling from taiga, assumes that the file is the arxspan 559 | #' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'. 560 | #' @param cell_line_ann_version: parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL 561 | #' @param cell_line_ann_taiga: if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder 562 | #' @param tumor_data_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data, 563 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data. 564 | #' @param tumor_data_file: if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data, 565 | #' if tumor_taiga=FALSE, then the name of the file the tumor expression data 566 | #' @param tumor_version: parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL 567 | #' @param tumor_taiga: if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder 568 | #' @param tumor_ann_name: if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations, 569 | #' if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations 570 | #' @param tumor_ann_file: if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations, 571 | #' if tumor_ann_taiga=FALSE, then the name of the file the tumor annotations. If pulling from taiga, assumes that the file is the already create Celligner info file used in the Celligner manuscript, 572 | #' if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'. 
573 | #' @param tumor_ann_version: parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL 574 | #' @param tumor_ann_taiga: if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder 575 | #' @param additional_annotations_name: if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations, 576 | #' if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations 577 | #' for the cell lines. If null, assumes there are no additional annotations. 578 | #' @param additional_annotations_file: if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations, 579 | #' if additional_annotations_taiga=FALSE, then the name of the file the additional annotations. If null, assumes there are 580 | #' no additional annotations. 
#' @param additional_annotations_version: parameter to specify the version to pull from taiga for the additional annotations, default set to NULL
#' @param additional_annotations_taiga: if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder
#' @param hgnc_data_name: if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file: if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations,
#' if hgnc_taiga=FALSE, then the name of the file containing the HGNC gene annotations
#' @param hgnc_version: parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL
#' @param hgnc_taiga: if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder
#' @param save_output: by default is NULL and won't save output, to save output pass in the path of an existing folder
#' in which the output files are written
#'
#' @importFrom magrittr "%>%"
#'
#' @description run all parts of the Celligner pipeline
#'
#' @return seurat object of the Celligner-aligned data
#' @export
#'
run_Celligner <- function(cell_line_data_name = "public-20q4-a4b3", cell_line_data_file = "CCLE_expression_full", cell_line_version = NULL, cell_line_taiga = TRUE,
                          cell_line_ann_name = "arxspan-cell-line-export-f808", cell_line_ann_file = "ACH", cell_line_ann_version = NULL, cell_line_ann_taiga = TRUE,
                          tumor_data_name = "celligner-input-9827", tumor_data_file = "tumor_expression", tumor_version = NULL, tumor_taiga = TRUE,
                          tumor_ann_name = "celligner-input-9827", tumor_ann_file = "tumor_annotations", tumor_ann_version = NULL, tumor_ann_taiga = TRUE,
                          additional_annotations_name = "celligner-input-9827", additional_annotations_file = "CCLE_annotations", additional_annotations_version = NULL, additional_annotations_taiga = TRUE,
                          hgnc_data_name = "hgnc-87ab", hgnc_data_file = "hgnc_complete_set", hgnc_version = NULL, hgnc_taiga = TRUE,
                          save_output = NULL) {
  # attached because other functions of the pipeline call dplyr/magrittr verbs unqualified
  require(magrittr)
  require(tidyverse)

  # load expression matrices and annotations
  dat <- load_data(
    cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
    cell_line_ann_name, cell_line_ann_file, cell_line_ann_version, cell_line_ann_taiga,
    tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
    tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
    additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
    hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga
  )

  gene_stats <- calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)

  # combined annotation table, tumors first (same order as the combined matrix built below)
  comb_ann <- rbind(
    dat$TCGA_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = "tumor"),
    dat$CCLE_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = "CL")
  )

  # per-dataset Seurat objects, clustering and differential expression between clusters
  TCGA_obj <- create_Seurat_object(dat$TCGA_mat, dat$TCGA_ann, type = "tumor")
  CCLE_obj <- create_Seurat_object(dat$CCLE_mat, dat$CCLE_ann, type = "CL")

  TCGA_obj <- cluster_data(TCGA_obj)
  CCLE_obj <- cluster_data(CCLE_obj)

  tumor_DE_genes <- find_differentially_expressed_genes(TCGA_obj)
  CL_DE_genes <- find_differentially_expressed_genes(CCLE_obj)

  # rank genes within each dataset (rank 1 = largest statistic) and keep the better of the two ranks
  DE_genes <- dplyr::full_join(tumor_DE_genes, CL_DE_genes, by = "Gene", suffix = c("_tumor", "_CL")) %>%
    dplyr::mutate(
      tumor_rank = dplyr::dense_rank(-gene_stat_tumor),
      CL_rank = dplyr::dense_rank(-gene_stat_CL),
      best_rank = pmin(tumor_rank, CL_rank, na.rm = TRUE)
    ) %>%
    dplyr::left_join(gene_stats, by = "Gene")

  # take genes whose best rank in either dataset is below celligner_global$top_DE_genes_per,
  # used for finding mutual nearest neighbors
  DE_gene_set <- DE_genes %>%
    dplyr::filter(best_rank < celligner_global$top_DE_genes_per) %>%
    .[["Gene"]]

  cov_diff_eig <- run_cPCA(TCGA_obj, CCLE_obj, celligner_global$fast_cPCA)

  # the result of run_cPCA is read from `vectors` when fast_cPCA is NULL and from `rotation` otherwise
  if (is.null(celligner_global$fast_cPCA)) {
    cur_vecs <- cov_diff_eig$vectors[, celligner_global$remove_cPCA_dims, drop = FALSE]
  } else {
    cur_vecs <- cov_diff_eig$rotation[, celligner_global$remove_cPCA_dims, drop = FALSE]
  }

  # clear unused objects
  rm(TCGA_obj)
  rm(CCLE_obj)
  gc()

  # regress the selected cPCs out of each dataset (no intercept) and keep the residuals
  rownames(cur_vecs) <- colnames(dat$TCGA_mat)
  TCGA_cor <- resid(lm(t(dat$TCGA_mat) ~ 0 + cur_vecs)) %>% t()
  CCLE_cor <- resid(lm(t(dat$CCLE_mat) ~ 0 + cur_vecs)) %>% t()

  # clear unused objects
  rm(dat)
  gc()

  mnn_res <- run_MNN(CCLE_cor, TCGA_cor,
    k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
    subset_genes = DE_gene_set
  )

  # corrected tumors on top of the cell lines
  combined_mat <- rbind(mnn_res$corrected, CCLE_cor)

  comb_obj <- create_Seurat_object(combined_mat, comb_ann)
  comb_obj <- cluster_data(comb_obj)

  # UMAP coordinates joined with the sample annotations
  Celligner_res <- Seurat::Embeddings(comb_obj, reduction = "umap") %>%
    as.data.frame() %>%
    magrittr::set_colnames(c("UMAP_1", "UMAP_2")) %>%
    tibble::rownames_to_column(var = "sampleID") %>%
    dplyr::left_join(comb_obj@meta.data, by = "sampleID")

  # median UMAP position per lineage, used to place the lineage labels; the listed lineages get no label
  lineage_averages <- Celligner_res %>%
    dplyr::filter(!lineage %in% c(
      "adrenal_cortex", "embryo", "endocrine", "engineered", "engineered_blood",
      "engineered_breast", "engineered_central_nervous_system", "engineered_kidney",
      "engineered_lung", "engineered_ovary", "engineered_prostate", "epidermoid_carcinoma",
      "nasopharynx", "nerve", "pineal", "teratoma", "unknown"
    )) %>%
    dplyr::group_by(lineage) %>%
    dplyr::summarise(
      UMAP_1 = median(UMAP_1, na.rm = TRUE),
      UMAP_2 = median(UMAP_2, na.rm = TRUE)
    )
  lineage_averages$lineage <- gsub("_", " ", lineage_averages$lineage)
  lineage_lab_aes <- ggplot2::geom_text(
    data = lineage_averages,
    mapping = ggplot2::aes(x = UMAP_1, y = UMAP_2, label = lineage),
    size = 3, color = "#000000"
  )

  has_both_types <- "type" %in% colnames(Celligner_res) &&
    "tumor" %in% Celligner_res$type &&
    "CL" %in% Celligner_res$type

  if (has_both_types) {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha = 0.7, pch = 21, ggplot2::aes(color = type, fill = lineage, size = type)) +
      ggplot2::scale_color_manual(values = c(tumor = "white", CL = "black")) +
      ggplot2::scale_size_manual(values = c(tumor = 0.75, CL = 1.5)) +
      ggplot2::xlab("UMAP 1") +
      ggplot2::ylab("UMAP 2") +
      ggplot2::guides(
        fill = "none", # `fill = FALSE` is deprecated in ggplot2
        color = ggplot2::guide_legend(override.aes = list(color = c("black", "white"), fill = c("white", "black")))
      ) +
      ggplot2::theme_classic()
  } else {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha = 0.7, pch = 21, size = 1, ggplot2::aes(fill = lineage)) +
      ggplot2::xlab("UMAP 1") +
      ggplot2::ylab("UMAP 2") +
      ggplot2::theme_classic() +
      ggplot2::theme(legend.position = "none")
  }

  print(celligner_plot)
  print(celligner_plot + lineage_lab_aes)

  if (!is.null(save_output)) {
    # every output is written with file.path(save_output, ...), so save_output has to be a folder
    if (dir.exists(save_output)) {
      print("calculating tumor/cell line correlation")
      tumor_CL_cor <- calc_tumor_CL_cor(combined_mat, comb_obj@meta.data)

      print("saving files")
      write.csv(tumor_CL_cor, file.path(save_output, "tumor_CL_cor.csv"))
      write.csv(combined_mat, file.path(save_output, "Celligner_aligned_data.csv"))
      readr::write_csv(Celligner_res, file.path(save_output, "Celligner_info.csv"))
      write.csv(cur_vecs, file.path(save_output, "cPCs.csv"))
      readr::write_csv(DE_genes, file.path(save_output, "DE_genes.csv"))
      ggplot2::ggsave(file.path(save_output, "Celligner_plot.png"), celligner_plot, device = "png", width = 8, height = 6)
      ggplot2::ggsave(file.path(save_output, "labeled_Celligner_plot.png"), celligner_plot + lineage_lab_aes, device = "png", width = 8, height = 6)
    } else {
      warning("can't save output, folder does not exist")
    }
  }

  return(comb_obj)
}
 -------------------------------------------------------------------------------- /R/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: celligner 2 | Title: Celligner pipeline 3 | Version: 0.0.0.9000 4 | Authors@R: 5 | person(given = "Allie", 6 | family = "Warren", 7 | role = c("aut", "cre"), 8 | email = "alliecwarren@gmail.com", 9 | comment = c(ORCID = "")) 10 | Description: R package to run the Celligner method using datasets from taiga. 
11 | Imports: 12 | tidyverse, 13 | reshape2, 14 | plyr, 15 | data.table, 16 | Seurat, 17 | pdist, 18 | FNN, 19 | irlba, 20 | limma, 21 | batchelor, 22 | BiocParallel, 23 | BiocManager, 24 | taigr 25 | License: `use_mit_license()`, `use_gpl3_license()` or friends to 26 | pick a license 27 | Encoding: UTF-8 28 | LazyData: true 29 | Roxygen: list(markdown = TRUE) 30 | RoxygenNote: 7.1.1 31 | -------------------------------------------------------------------------------- /R/Dockerfile: -------------------------------------------------------------------------------- 1 | ## celligner 2 | FROM debian:latest 3 | MAINTAINER Jeremie Kalfon 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends apt-utils && 6 | apt-get install -y sudo && 7 | sudo apt-get install -y wget libterm-readline-gnu-perl && 8 | 9 | # all nice packages 10 | ## install the [tools](https://www.datacamp.com/community/tutorials/google-cloud-data-science) sudo apt-get -y install htop parallel curl tar vim nano bzip2 unzip libssl-dev make cmake libcurl4-openssl-dev default-jre && sudo apt-get -y install dirmngr apt-transport-https ca-certificates gnupg2 software-properties-common zlib1g-dev libbz2-dev liblzma-dev openssh-server default-libmysqlclient-dev acl g++ 11 | ## sudo apt install git libmagickwand-dev libtool libexpat1-dev ghostscript graphviz libgraphviz-dev pkg-config libxml-simple-perl zlib1g-dev 12 | sudo apt-get -y install \ 13 | htop \ 14 | parallel \ 15 | curl \ 16 | tar \ 17 | vim \ 18 | nano \ 19 | bc \ 20 | bzip2 \ 21 | unzip \ 22 | libssl-dev \ 23 | make \ 24 | cmake \ 25 | libcurl4-openssl-dev \ 26 | default-jre && 27 | sudo apt-get -y install dirmngr apt-transport-https \ 28 | ca-certificates \ 29 | gnupg2 \ 30 | software-properties-common \ 31 | zlib1g-dev \ 32 | libbz2-dev \ 33 | liblzma-dev \ 34 | libxml2-dev \ 35 | openssh-server \ 36 | default-libmysqlclient-dev \ 37 | acl \ 38 | g++ \ 39 | autoconf \ 40 | automake \ 41 | git libmagickwand-dev libtool \ 42 | 
libexpat1-dev \ 43 | ghostscript \ 44 | graphviz \ 45 | libgraphviz-dev \ 46 | pkg-config \ 47 | libxml-simple-perl \ 48 | zlib1g-dev && 49 | 50 | # install R sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' && echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list && echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list && sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' && sudo apt update && sudo apt -y install r-base && sudo apt -y install python3 python3-pip && 51 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-key 'E19F5F87128899B192B1A2C2AD5F960A256A04AF' && 52 | echo "deb http://http.debian.net/debian sid main" | sudo tee -a /etc/apt/sources.list && 53 | echo "deb http://ftp.de.debian.org/debian testing main" | sudo tee -a /etc/apt/sources.list && 54 | sudo add-apt-repository 'deb http://cran.rstudio.com/bin/linux/debian buster-cran35/' && 55 | sudo apt update && 56 | sudo apt -y install r-base && 57 | sudo apt -y install python3 python3-pip && 58 | # all python config pip3 install numpy pandas && pip3 install MACS2 && pip3 install dxpy jupytext scikit-learn google-api-core igv igv-jupyter firecloud-dalmatian awscli seaborn pipreqs pysradb nbstripout bokeh matplotlib deeptools tensorflow cutadapt ipykernel jupyter_contrib_nbextensions && jupyter contrib nbextension install && nbstripout --install --global && ipykernel install && nbstripout --install --global 59 | pip3 install numpy pandas && 60 | 61 | # search history 62 | touch ~/.inputrc && 63 | echo "$include /etc/inputrc" >~/.inputrc && 64 | echo ""\e[A":history-search-backward" >~/.inputrc && 65 | echo ""\e[B":history-search-forward" >~/.inputrc && 66 | 67 | # all R config 68 | export R_LIBS="~/R/x86_64-pc-linux-gnu-library/3.6" && 69 | # install all nice R packages R -e 
"install.packages(c('plyr','dplyr','tidyverse','magrittr','reshape2','useful','ggplot2','ggthemes','ggrepel','gridExtra','ggridges','GGally','plotly','VennDiagram','RColorBrewer','extrafont','cowplot', 'network','data.table','DT','readr','readxl','clues','mclust','pheatmap','Rtsne','NMF','hash', 'stringr', 'irr', 'zoo', 'devtools', 'scales', 'rlang', 'rmarkdown','lsa','BiocManager'), dependencies=TRUE, repos='http://cran.rstudio.com/'); font_import(); loadfonts(); BiocManager::install(c('GSEABase','limma','org.Hs.eg.db','GenomicRanges','DESeq2')); print('if can't use broad intranet, install from source with [R CMD INSTALL .] for 'taigr', 'cdsr','svacd', 'cell_line_mapping/celllinemapr')" 70 | R -e "install.packages(c('plyr','dplyr','tidyverse','magrittr','reshape2','useful','ggplot2','ggthemes',\ 71 | 'ggrepel','gridExtra','ggridges','GGally','plotly','VennDiagram','RColorBrewer','extrafont','cowplot',\ 72 | 'network','data.table','DT','readr','readxl','clues','mclust','pheatmap','Rtsne','NMF','hash',\ 73 | 'stringr', 'irr', 'zoo', 'devtools', 'scales', 'rlang', 'rmarkdown','lsa','BiocManager'), \ 74 | dependencies=TRUE, repos='http://cran.rstudio.com/'); \ 75 | font_import();\ 76 | loadfonts();\ 77 | BiocManager::install(c('GSEABase','Seurat','batchelor','limma','org.Hs.eg.db','GenomicRanges','DESeq2'))\ 78 | devtools::install_github('broadinstitute/taigr');\ 79 | devtools::install_github('broadinstitute/celligner');\ 80 | " && 81 | R -e "library(devtools);\ 82 | devtools::install_github('broadinstitute/celligner')" 83 | -------------------------------------------------------------------------------- /R/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(.average_correction) 4 | export(.center_along_batch_vector) 5 | export(.compute_tricube_average) 6 | export(.tricube_weighted_correction) 7 | export(calc_gene_stats) 8 | export(calc_tumor_CL_cor) 9 | 
export(check_NAs) 10 | export(cluster_data) 11 | export(create_Seurat_object) 12 | export(find_differentially_expressed_genes) 13 | export(get_cluster_averages) 14 | export(load_additional_data) 15 | export(load_data) 16 | export(modified_mnnCorrect) 17 | export(run_Celligner) 18 | export(run_MNN) 19 | export(run_cPCA) 20 | export(run_cPCA_analysis) 21 | export(run_lm_stats_limma_group) 22 | export(run_multidataset_alignment) 23 | importFrom(BiocNeighbors,queryKNN) 24 | importFrom(BiocParallel,SerialParam) 25 | importFrom(magrittr,"%>%") 26 | -------------------------------------------------------------------------------- /R/README.md: -------------------------------------------------------------------------------- 1 | # Celligner 2 | 3 | ![](docs/typical_celligner.webp) 4 | 5 | celligner is a computational project to align multiple cancer datasets across sequencing modalities, tissue conditions (media, perturbations..) and format (CL/tumor/organoids/spheroids) 6 | 7 | see our latest paper on aligning CCLE cell lines with TCGA tumors: 8 | [2020 paper](https://www.nature.com/articles/s41467-020-20294-x) 9 | 10 | 11 | ## Install 12 | 13 | ### Local 14 | 15 | ``` r 16 | library(devtools) 17 | devtools::install_github("broadinstitute/celligner/R") 18 | ``` 19 | 20 | if you could not install taigr: 21 | ```r 22 | devtools::install_github("broadinstitute/taigr") 23 | ``` 24 | 25 | ### Docker 26 | 27 | a docker image is available at: [jkobject/celligner](https://hub.docker.com/r/jkobject/celligner) 28 | 29 | the Dockerfile is listed in this repo. 30 | 31 | for now you will need to copy your taiga secret file to the docker first 32 | 33 | ## run Celligner 34 | 35 | The package can be loaded by calling 36 | ``` r 37 | library(celligner) 38 | ``` 39 | 40 | please note that celligner might use a significant amount of RAM (around 50GBs) 41 | 42 | The entire pipeline can be run by calling **run_Celligner()**. 
43 | 44 | ### parameters 45 | - *cell_line_data_name* : if *cell_line_taiga* = TRUE, then the data.name of the taiga file containing the cell line expression data, 46 | if *cell_line_taiga*=FALSE, then the file path to the local folder containing the cell line expression data. To run the pipeline on 47 | new DepMap data this is the only parameter that should need to be updated (to refer to the new virtual dataset for the relevant release). 48 | - *cell_line_data_file* : if *cell_line_taiga* = TRUE, then the data.file of the taiga file containing the cell line expression data, 49 | if *cell_line_taiga*=FALSE, then the name of the file of cell line expression data. By default uses the virtual dataset data file 'CCLE_expression_full'. 50 | - *cell_line_version* : (optional) parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL 51 | - *cell_line_taiga*: if TRUE (default) then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder 52 | - *cell_line_ann_name* : if *cell_line_taiga* = TRUE, then the data.name of the taiga file containing the cell line annotations, 53 | if *cell_line_taiga*=FALSE, then the file path to the local folder containing the cell line annotations. By default pulls the arxspan data from taiga. 54 | - *cell_line_ann_file* : if *cell_line_taiga* = TRUE, then the data.file of the taiga file containing the cell line annotations, 55 | if *cell_line_taiga*=FALSE, then the name of the file of cell line annotations. If pulling from taiga (default), assumes that the file is the arxspan 56 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'. 
57 | - *cell_line_ann_version* : (optional) parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL 58 | - *cell_line_ann_taiga* : if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder 59 | - *tumor_data_name* : if *tumor_taiga* = TRUE (default), then the data.name of the taiga file containing the tumor expression data, 60 | if *tumor_taiga*=FALSE, then the file path to the local folder containing the tumor expression data. By default, pulls the TCGA+ (TCGA, TARGET, & Treehouse data 61 | downloaded from xena browser) 62 | - *tumor_data_file* : if *tumor_taiga* = TRUE (default), then the data.file of the taiga file containing the tumor expression data, 63 | if *tumor_taiga*=FALSE, then the name of the file the tumor expression data 64 | - *tumor_version* : (optional) parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL 65 | - *tumor_taiga* : if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder 66 | - *tumor_ann_name* : if *tumor_taiga* = TRUE (default), then the data.name of the taiga file containing the tumor annotations, 67 | if *tumor_taiga*=FALSE, then the file path to the local folder containing the tumor annotations 68 | - *tumor_ann_file* : if *tumor_ann_taiga* = TRUE (default), then the data.file of the taiga file containing the tumor annotations, 69 | if *tumor_ann_taiga*=FALSE, then the name of the file the tumor annotations. If pulling from taiga, assumes that the file is the already 70 | created Celligner info file used in the Celligner manuscript, if not then assumes it is a local file containing the columns 71 | sampleID, lineage, subtype, and type=='tumor'. 
72 | - *tumor_ann_version* : (optional) parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL 73 | - *tumor_ann_taiga* : if TRUE (default) then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder 74 | - *additional_annotations_name* : if *additional_annotations_taiga* = TRUE (default), then the data.name of the taiga file containing the additional annotations, 75 | if *additional_annotations_taiga*=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations 76 | for the cell lines. If NULL, assumes there are no additional annotations. 77 | - *additional_annotations_file* : if *additional_annotations_taiga* = TRUE (default), then the data.file of the taiga file containing the additional annotations, 78 | if *additional_annotations_taiga*=FALSE, then the name of the file the additional annotations. If null, assumes there are 79 | no additional annotations. By default pulls the Celligner_info file, used in the Celligner manuscript. 
- *additional_annotations_version* : (optional) parameter to specify the version to pull from taiga for the additional annotations, default set to NULL 81 | - *additional_annotations_taiga* : if TRUE (default) then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder 82 | - *hgnc_data_name* : if *hgnc_taiga* = TRUE (default), then the data.name of the taiga file containing the HGNC gene annotations, 83 | if *hgnc_taiga*=FALSE, then the file path to the local folder containing the HGNC gene annotations 84 | - *hgnc_data_file* : if *hgnc_taiga* = TRUE (default), then the data.file of the taiga file containing the HGNC gene annotations, 85 | if *hgnc_taiga*=FALSE, then the name of the file of the HGNC gene annotations 86 | - *hgnc_version* : (optional) parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL 87 | - *hgnc_taiga* : if TRUE (default) then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder 88 | - *save_output* : by default is NULL and won't save output, to save output pass in a filepath of where to save the output 89 | 90 | ### Returns 91 | - a seurat object containing the Celligner-aligned data, the UMAP dimensionality reduction of the Celligner-aligned data, clustering, sample metadata 92 | - some plots 93 | - if *save_output* is given a filepath then output files will be saved to the folder specified 94 | 95 | ## Multidataset alignment 96 | 97 | **run_multidataset_alignment()** is similar to **run_Celligner()**, but also aligns the _Met500_ dataset and two _PDX_ datasets, by default pulling them from taiga.
See more notes on multidataset alignment in [this google document](https://docs.google.com/document/d/11FvwosKXieYT0sRuyOkjCG1ZiyxYDQohx5-dyrY6LVg)

Follow our slack discussion on BroadInsitute's [#tumor-to-cl](#)

Follow the project on our [Asana page](https://app.asana.com/0/482696339531494/list)

Please use _github issues_ for any problem related to the tool.

Maintainers:

Jérémie Kalfon @jkobject
James McFarland
Javad Noorbak @jnoorbak

Created by:

Allie Warren @awarren

--------------------------------------------------------------------------------
/R/global_params.R:
--------------------------------------------------------------------------------
# Global tuning parameters read by the Celligner pipeline functions.
celligner_global <- list(
  # number of genes to use; 'all' keeps every protein coding gene found in both datasets
  n_genes = 'all',
  # UMAP: number of nearest neighbors and minimum distance used to create the plot
  umap_n_neighbors = 10,
  umap_min_dist = 0.5,
  # MNN: number of nearest neighbors of tumors in the cell line data
  mnn_k_CL = 5,
  # MNN: number of nearest neighbors of cell lines in the tumor data
  mnn_k_tumor = 50,
  # differentially expressed genes ranked better than this in either the cell line
  # or the tumor data are used to identify mutual nearest neighbors
  top_DE_genes_per = 1000,
  # cPCA dimensions that are regressed out of the data
  remove_cPCA_dims = c(1, 2, 3, 4),
  # distance metric used for the UMAP projection
  distance_metric = 'euclidean',
  # resolution parameter used for clustering the data
  mod_clust_res = 5,
  # ndist parameter used for MNN
  mnn_ndist = 3,
  # number of PCs to use for dimensionality reduction
  n_PC_dims = 70,
  # 2D projection used for plotting
  reduction.use = 'umap',
  # set to a value >= 4 to run fast cPCA (approximate the cPCA eigenvectors
  # instead of calculating all of them)
  fast_cPCA = 10
)

# Plot colour (hex) for each lineage; engineered_* and closely related
# lineages deliberately share the colour of their parent tissue.
tissue_colors <- c(
  central_nervous_system            = "#f5899e",
  engineered_central_nervous_system = "#f5899e",
  teratoma                          = "#f5899e",
  bone                              = "#9f55bb",
  pancreas                          = "#b644dc",
  soft_tissue                       = "#5fdb69",
  skin                              = "#6c55e2",
  liver                             = "#9c5e2b",
  blood                             = "#da45bb",
  lymphocyte                        = "#abd23f",
  peripheral_nervous_system         = "#73e03d",
  ovary                             = "#56e79d",
  engineered_ovary                  = "#56e79d",
  adrenal                           = "#e13978",
  adrenal_cortex                    = "#e13978",
  upper_aerodigestive               = "#5da134",
  kidney                            = "#1f8fff",
  engineered_kidney                 = "#1f8fff",
  gastric                           = "#dfbc3a",
  eye                               = "#349077",
  nasopharynx                       = "#a9e082",
  nerve                             = "#c44c90",
  unknown                           = "#999999",
  cervix                            = "#5ab172",
  thyroid                           = "#d74829",
  lung                              = "#51d5e0",
  engineered_lung                   = "#51d5e0",
  rhabdoid                          = "#d04850",
  germ_cell                         = "#75dfbb",
  embryo                            = "#75dfbb",
  colorectal                        = "#96568e",
  endocrine                         = "#d1d684",
  bile_duct                         = "#c091e3",
  pineal                            = "#949031",
  thymus                            = "#659fd9",
  mesothelioma                      = "#dc882d",
  prostate                          = "#3870c9",
  engineered_prostate               = "#3870c9",
  uterus                            = "#e491c1",
  breast                            = "#45a132",
  engineered_breast                 = "#45a132",
  urinary_tract                     = "#e08571",
  esophagus                         = "#6a6c2c",
  fibroblast                        = "#d8ab6a",
  plasma_cell                       = "#e6c241"
)

--------------------------------------------------------------------------------
/R/install_packages.R:
--------------------------------------------------------------------------------
# Repository configuration. NOTE: the second options() call replaces the
# first, so only the Broad internal repo and the RStudio CRAN mirror are used.
options(repos = c("https://cran.cnr.berkeley.edu"))
options(repos = c(
  "https://iwww.broadinstitute.org/~datasci/R-packages",
  "https://cran.rstudio.com/"))

# Install only the CRAN packages that are not already present.
cran_packages <- c('tidyverse', 'reshape2', 'plyr', 'data.table', 'Seurat', 'pdist', 'FNN', 'irlba')
new_cran_packages <- setdiff(cran_packages, installed.packages()[, "Package"])
if (length(new_cran_packages)) {
  install.packages(new_cran_packages)
}

# Install only the Bioconductor packages that are not already present,
# bootstrapping BiocManager first when it is missing.
bioconductor_packages <- c('limma', 'batchelor', 'BiocParallel')
new_bioconductor_packages <- bioconductor_packages[!(bioconductor_packages %in% installed.packages()[,"Package"])]
if(length(new_bioconductor_packages)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install(new_bioconductor_packages)
}

--------------------------------------------------------------------------------
/R/mutlidataset_alignment.R:
--------------------------------------------------------------------------------

#' Load additional expression and annotation data
#' @name load_additional_data
#'
#' @param data_name if data_taiga = TRUE, the data.name of the taiga file containing the expression data;
#' if data_taiga = FALSE, the path to the local folder containing the expression data. Assumes that genes
#' are labeled using ensembl IDs and that there are fewer samples than genes in the matrix: a matrix with
#' more rows than columns is transposed so that rows are samples and columns are genes.
#' @param data_file if data_taiga = TRUE, the data.file of the taiga file containing the expression data;
#' if data_taiga = FALSE, the name of the local csv file of expression data. The sample/gene identifiers are
#' taken from the unnamed leading column (named 'X1' or '...1' by readr), falling back to the first column.
#' @param data_version (optional) version to pull from taiga for the expression data, default NULL
#' @param data_taiga if TRUE (default) pulls the expression data from taiga, if FALSE reads it from a local folder
#' @param ann_name if ann_taiga = TRUE, the data.name of the taiga file containing the annotations;
#' if ann_taiga = FALSE, the path to the local folder containing the annotations
#' @param ann_file if ann_taiga = TRUE, the data.file of the taiga file containing the annotations;
#' if ann_taiga = FALSE, the name of the local annotation file
#' @param ann_version (optional) version to pull from taiga for the annotations, default NULL
#' @param ann_taiga if TRUE (default) pulls the annotations from taiga, if FALSE reads them from a local folder
#' @param data_type string written to the `type` column of annotations that are pulled from taiga or
#' created by default; default is ""
#' @description load additional expression and annotation files. When the annotations are missing or lack
#' the expected columns, a warning is raised and a default table (sampleID from the expression matrix,
#' lineage and subtype set to NA, type set to data_type) is used instead.
#'
#' @return list with `mat` (samples x genes expression matrix) and `ann` (annotation data.frame)
#' @export
#'
load_additional_data <- function(data_name, data_file, data_version = NULL, data_taiga = TRUE,
                                 ann_name, ann_file, ann_version = NULL, ann_taiga = TRUE, data_type = "") {

  # ---- expression matrix ----
  if(data_taiga) {
    data_mat <- taigr::load.from.taiga(data.name = data_name, data.version = data_version, data.file = data_file)
    if(is.null(data_mat)) {
      stop("expression data file input does not exist on taiga")
    }
  } else {
    data_path <- file.path(data_name, data_file)
    if(!file.exists(data_path)) {
      stop('expression data file input does not exist')
    }
    data_df <- as.data.frame(readr::read_csv(data_path))
    # readr < 2.0 names an unnamed leading column 'X1', readr >= 2.0 names it '...1';
    # accept either and otherwise fall back to the first column of the file
    id_col <- intersect(c('X1', '...1'), colnames(data_df))
    id_col <- if(length(id_col) > 0) id_col[1] else colnames(data_df)[1]
    data_mat <- as.matrix(tibble::column_to_rownames(data_df, id_col))
  }

  # transpose matrix, if needed, so rownames are samples and column names are genes
  if(nrow(data_mat) > ncol(data_mat)) {
    warning('more rows than columns, taking transpose of expression matrix')
    data_mat <- t(data_mat)
  }

  # fallback annotation table, one row per sample of the expression matrix
  default_annotations <- function() {
    data.frame(sampleID = rownames(data_mat),
               lineage = NA,
               subtype = NA,
               type = data_type)
  }

  # ---- annotations ----
  if(ann_taiga) {
    ann <- taigr::load.from.taiga(data.name = ann_name, data.version = ann_version, data.file = ann_file)
    column_names <- c('sampleID', 'lineage', 'subtype')
    if(is.null(ann)) {
      warning('annotation file does not exist on taiga, creating default annotations')
      ann <- default_annotations()
    }
    if(!all(column_names %in% colnames(ann))) {
      warning('annotation file does not contain expected columns (sampleID, lineage, & subtype), creating default annotations')
      ann <- default_annotations()
    } else {
      # keep only the expected columns; the type of taiga annotations is always data_type
      ann <- ann[,column_names]
      ann$type <- data_type
    }
  } else {
    ann_path <- file.path(ann_name, ann_file)
    if(file.exists(ann_path)) {
      ann <- as.data.frame(data.table::fread(ann_path))
    } else {
      warning('annotation file does not exist, creating default annotations')
      ann <- default_annotations()
    }
    # local annotation files must already carry their own `type` column
    if(!all(c('sampleID', 'lineage', 'subtype', 'type') %in% colnames(ann))) {
      warning('annotation file does not contain expected columns (sampleID, lineage, subtype & type), creating default annotations')
      ann <- default_annotations()
    }
  }

  return(list(mat = data_mat, ann = ann))

}



#' All methods to run Celligner, with additional alignment of Met500 and PDX data, and save the output, if desired
#' @name run_multidataset_alignment
#'
#' @param cell_line_data_name if cell_line_taiga = TRUE, the data.name of the taiga file containing the cell line expression data;
#' if cell_line_taiga = FALSE, the path to the local folder containing the cell line expression data
#' @param cell_line_data_file if cell_line_taiga = TRUE, the data.file of the taiga file containing the cell line expression data;
#' if cell_line_taiga = FALSE, the name of the file of cell line expression data
#' @param cell_line_version version to pull from taiga for the cell line expression data, default NULL
#' @param cell_line_taiga if TRUE (default) pulls the cell line expression data from taiga, if FALSE reads it from a local folder
#' @param cell_line_ann_name if cell_line_ann_taiga = TRUE, the data.name of the taiga file containing the cell line annotations;
#' if cell_line_ann_taiga = FALSE, the path to the local folder containing the cell line annotations
#' @param cell_line_ann_file if cell_line_ann_taiga = TRUE, the data.file of the taiga file containing the cell line annotations;
#' if cell_line_ann_taiga = FALSE, the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan
#' file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.
#' @param cell_line_ann_version version to pull from taiga for the cell line annotations, default NULL
#' @param cell_line_ann_taiga if TRUE (default) pulls the cell line annotations from taiga, if FALSE reads them from a local folder
#' @param tumor_data_name if tumor_taiga = TRUE, the data.name of the taiga file containing the tumor expression data;
#' if tumor_taiga = FALSE, the path to the local folder containing the tumor expression data
#' @param tumor_data_file if tumor_taiga = TRUE, the data.file of the taiga file containing the tumor expression data;
#' if tumor_taiga = FALSE, the name of the file of tumor expression data
#' @param tumor_version version to pull from taiga for the tumor expression data, default NULL
#' @param tumor_taiga if TRUE (default) pulls the tumor expression data from taiga, if FALSE reads it from a local folder
#' @param tumor_ann_name if tumor_ann_taiga = TRUE, the data.name of the taiga file containing the tumor annotations;
#' if tumor_ann_taiga = FALSE, the path to the local folder containing the tumor annotations
#' @param tumor_ann_file if tumor_ann_taiga = TRUE, the data.file of the taiga file containing the tumor annotations;
#' if tumor_ann_taiga = FALSE, the name of the file of tumor annotations. If pulling from taiga, assumes that the file is the already
#' created Celligner info file used in the Celligner manuscript, if not then assumes it is a local file containing the columns
#' sampleID, lineage, subtype, and type=='tumor'.
#' @param tumor_ann_version version to pull from taiga for the tumor annotations, default NULL
#' @param tumor_ann_taiga if TRUE (default) pulls the tumor annotations from taiga, if FALSE reads them from a local folder
#' @param additional_annotations_name if additional_annotations_taiga = TRUE, the data.name of the taiga file containing the additional annotations;
#' if additional_annotations_taiga = FALSE, the path to the local folder containing the additional annotations. Used to add more fine-grain
#' subtype annotations for the cell lines. If NULL, assumes there are no additional annotations.
#' @param additional_annotations_file if additional_annotations_taiga = TRUE, the data.file of the taiga file containing the additional annotations;
#' if additional_annotations_taiga = FALSE, the name of the file of additional annotations. If NULL, assumes there are no additional annotations.
#' @param additional_annotations_version version to pull from taiga for the additional annotations, default NULL
#' @param additional_annotations_taiga if TRUE (default) pulls the additional annotations from taiga, if FALSE reads them from a local folder
#' @param hgnc_data_name if hgnc_taiga = TRUE, the data.name of the taiga file containing the HGNC gene annotations;
#' if hgnc_taiga = FALSE, the path to the local folder containing the HGNC gene annotations
#' @param hgnc_data_file if hgnc_taiga = TRUE, the data.file of the taiga file containing the HGNC gene annotations;
#' if hgnc_taiga = FALSE, the name of the file of HGNC gene annotations
#' @param hgnc_version version to pull from taiga for the HGNC gene annotations, default NULL
#' @param hgnc_taiga if TRUE (default) pulls the HGNC gene annotations from taiga, if FALSE reads them from a local folder
#' @param met500_data_name Met500 expression: data.name of the taiga dataset, or path to the local folder if met500_taiga = FALSE
#' @param met500_data_file Met500 expression: data.file of the taiga dataset, or name of the local file if met500_taiga = FALSE
#' @param met500_version default NULL, used to specify the version of the taiga dataset
#' @param met500_taiga if TRUE (default) pulls Met500 expression from taiga, if FALSE reads from local
#' @param met500_ann_name Met500 annotations: data.name of the taiga dataset, or path to the local folder if met500_ann_taiga = FALSE
#' @param met500_ann_file Met500 annotations: data.file of the taiga dataset, or name of the local file if met500_ann_taiga = FALSE
#' @param met500_ann_version default NULL, used to specify the version of the taiga dataset
#' @param met500_ann_taiga if TRUE (default) pulls Met500 annotations from taiga, if FALSE reads from local
#' @param Novartis_PDX_data_name Novartis PDX expression: data.name of the taiga dataset, or path to the local folder if Novartis_PDX_taiga = FALSE
#' @param Novartis_PDX_data_file Novartis PDX expression: data.file of the taiga dataset, or name of the local file if Novartis_PDX_taiga = FALSE
#' @param Novartis_PDX_version default NULL, used to specify the version of the taiga dataset
#' @param Novartis_PDX_taiga if TRUE (default) pulls Novartis PDX expression from taiga, if FALSE reads from local
#' @param Novartis_PDX_ann_name Novartis PDX annotations: data.name of the taiga dataset, or path to the local folder if Novartis_PDX_ann_taiga = FALSE
#' @param Novartis_PDX_ann_file Novartis PDX annotations: data.file of the taiga dataset, or name of the local file if Novartis_PDX_ann_taiga = FALSE
#' @param Novartis_PDX_ann_version default NULL, used to specify the version of the taiga dataset
#' @param Novartis_PDX_ann_taiga if TRUE (default) pulls Novartis PDX annotations from taiga, if FALSE reads from local
#' @param pediatric_PDX_data_name pediatric PDX expression: data.name of the taiga dataset, or path to the local folder if pediatric_PDX_taiga = FALSE
#' @param pediatric_PDX_data_file pediatric PDX expression: data.file of the taiga dataset, or name of the local file if pediatric_PDX_taiga = FALSE
#' @param pediatric_PDX_version default NULL, used to specify the version of the taiga dataset
#' @param pediatric_PDX_taiga if TRUE (default) pulls pediatric PDX expression from taiga, if FALSE reads from local
#' @param pediatric_PDX_ann_name pediatric PDX annotations: data.name of the taiga dataset, or path to the local folder if pediatric_PDX_ann_taiga = FALSE
#' @param pediatric_PDX_ann_file pediatric PDX annotations: data.file of the taiga dataset, or name of the local file if pediatric_PDX_ann_taiga = FALSE
#' @param pediatric_PDX_ann_version default NULL, used to specify the version of the taiga dataset
#' @param pediatric_PDX_ann_taiga if TRUE (default) pulls pediatric PDX annotations from taiga, if FALSE reads from local
#' @param save_output by default is NULL and won't save output; to save output pass the path of an existing folder
#'
#' @importFrom magrittr "%>%"
#'
#' @description run all parts of the Celligner pipeline, with alignment of additional datasets.
#' The tumor data is aligned to the cell lines first; Met500, the Novartis PDX and the pediatric PDX
#' datasets are then aligned one after the other to the growing combined matrix. Two UMAP plots
#' (with and without lineage labels) are printed, and written to save_output when it is given.
#'
#' @return seurat object of the Celligner-aligned data
#' @export
#'
run_multidataset_alignment <- function(cell_line_data_name='public-20q4-a4b3', cell_line_data_file = 'CCLE_expression_full', cell_line_version = NULL, cell_line_taiga=TRUE,
                                       cell_line_ann_name='arxspan-cell-line-export-f808', cell_line_ann_file = 'ACH',cell_line_ann_version = NULL, cell_line_ann_taiga=TRUE,
                                       tumor_data_name = 'celligner-input-9827', tumor_data_file = 'tumor_expression', tumor_version = NULL, tumor_taiga = TRUE,
                                       tumor_ann_name = 'celligner-input-9827', tumor_ann_file = 'tumor_annotations', tumor_ann_version = NULL, tumor_ann_taiga = TRUE,
                                       additional_annotations_name = 'celligner-input-9827', additional_annotations_file = 'CCLE_annotations', additional_annotations_version = NULL, additional_annotations_taiga = TRUE,
                                       hgnc_data_name = 'hgnc-87ab', hgnc_data_file='hgnc_complete_set', hgnc_version= NULL, hgnc_taiga = TRUE,
                                       met500_data_name = 'met500-fc3c', met500_data_file = 'met500_TPM', met500_version = NULL, met500_taiga = TRUE,
                                       met500_ann_name = 'met500-fc3c', met500_ann_file = 'met500_ann', met500_ann_version = NULL, met500_ann_taiga = TRUE,
                                       Novartis_PDX_data_name = 'pdx-data-3d29', Novartis_PDX_data_file = 'Novartis_PDX_TPM', Novartis_PDX_version = NULL, Novartis_PDX_taiga = TRUE,
                                       Novartis_PDX_ann_name = 'pdx-data-3d29', Novartis_PDX_ann_file = 'Novartis_PDX_ann', Novartis_PDX_ann_version = NULL, Novartis_PDX_ann_taiga = TRUE,
                                       pediatric_PDX_data_name = 'pdx-data-3d29', pediatric_PDX_data_file = 'pediatric_PDX_TPM', pediatric_PDX_version = NULL, pediatric_PDX_taiga = TRUE,
                                       pediatric_PDX_ann_name = 'pdx-data-3d29', pediatric_PDX_ann_file = 'pediatric_PDX_ann', pediatric_PDX_ann_version = NULL, pediatric_PDX_ann_taiga = TRUE,
                                       save_output = NULL) {

  require(magrittr)
  require(tidyverse)

  # ---- load all datasets ----
  dat <- load_data(cell_line_data_name, cell_line_data_file, cell_line_version, cell_line_taiga,
                   cell_line_ann_name, cell_line_ann_file,cell_line_ann_version, cell_line_ann_taiga,
                   tumor_data_name, tumor_data_file, tumor_version, tumor_taiga,
                   tumor_ann_name, tumor_ann_file, tumor_ann_version, tumor_ann_taiga,
                   additional_annotations_name, additional_annotations_file, additional_annotations_version, additional_annotations_taiga,
                   hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)

  # the expression version/taiga arguments are named <dataset>_version and <dataset>_taiga
  # in this function's signature (there is no <dataset>_data_version / <dataset>_data_taiga)
  met500 <- load_additional_data(met500_data_name, met500_data_file, met500_version, met500_taiga,
                                 met500_ann_name, met500_ann_file, met500_ann_version, met500_ann_taiga)
  Novartis_PDX <- load_additional_data(Novartis_PDX_data_name, Novartis_PDX_data_file, Novartis_PDX_version, Novartis_PDX_taiga,
                                       Novartis_PDX_ann_name, Novartis_PDX_ann_file, Novartis_PDX_ann_version, Novartis_PDX_ann_taiga)

  pediatric_PDX <- load_additional_data(pediatric_PDX_data_name, pediatric_PDX_data_file, pediatric_PDX_version, pediatric_PDX_taiga,
                                        pediatric_PDX_ann_name, pediatric_PDX_ann_file, pediatric_PDX_ann_version, pediatric_PDX_ann_taiga)

  # ---- restrict every matrix to the genes shared by all five datasets ----
  shared_genes <- intersect(colnames(dat$TCGA_mat), colnames(dat$CCLE_mat)) %>%
    intersect(colnames(met500$mat)) %>%
    intersect(colnames(Novartis_PDX$mat)) %>%
    intersect(colnames(pediatric_PDX$mat))

  dat$TCGA_mat <- dat$TCGA_mat[,shared_genes]
  dat$CCLE_mat <- dat$CCLE_mat[,shared_genes]
  met500$mat <- met500$mat[,shared_genes]
  Novartis_PDX$mat <- Novartis_PDX$mat[,shared_genes]
  pediatric_PDX$mat <- pediatric_PDX$mat[,shared_genes]

  gene_stats <- calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga)

  comb_ann <- rbind(
    dat$TCGA_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = 'tumor'),
    dat$CCLE_ann %>% dplyr::select(sampleID, lineage, subtype) %>%
      dplyr::mutate(type = 'CL')
  )

  # ---- cluster each dataset and find differentially expressed genes ----
  TCGA_obj <- create_Seurat_object(dat$TCGA_mat, dat$TCGA_ann, type='tumor')
  CCLE_obj <- create_Seurat_object(dat$CCLE_mat, dat$CCLE_ann, type='CL')

  TCGA_obj <- cluster_data(TCGA_obj)
  CCLE_obj <- cluster_data(CCLE_obj)

  tumor_DE_genes <- find_differentially_expressed_genes(TCGA_obj)
  CL_DE_genes <- find_differentially_expressed_genes(CCLE_obj)

  DE_genes <- dplyr::full_join(tumor_DE_genes, CL_DE_genes, by = 'Gene', suffix = c('_tumor', '_CL')) %>%
    dplyr::mutate(
      tumor_rank = dplyr::dense_rank(-gene_stat_tumor),
      CL_rank = dplyr::dense_rank(-gene_stat_CL),
      best_rank = pmin(tumor_rank, CL_rank, na.rm=T)) %>%
    dplyr::left_join(gene_stats, by = 'Gene')

  # take genes that are ranked in the top 1000 from either dataset, used for finding mutual nearest neighbors
  DE_gene_set <- DE_genes %>%
    dplyr::filter(best_rank < celligner_global$top_DE_genes_per) %>%
    .[['Gene']]

  # ---- cPCA: regress the selected contrastive components out of every dataset ----
  cov_diff_eig <- run_cPCA(TCGA_obj, CCLE_obj, celligner_global$fast_cPCA)

  # the exact and the fast (approximate) cPCA results store their eigenvectors under different names
  if(is.null(celligner_global$fast_cPCA)) {
    cur_vecs <- cov_diff_eig$vectors[, celligner_global$remove_cPCA_dims, drop = FALSE]
  } else {
    cur_vecs <- cov_diff_eig$rotation[, celligner_global$remove_cPCA_dims, drop = FALSE]
  }

  rownames(cur_vecs) <- colnames(dat$TCGA_mat)
  TCGA_cor <- resid(lm(t(dat$TCGA_mat) ~ 0 + cur_vecs)) %>% t()
  CCLE_cor <- resid(lm(t(dat$CCLE_mat) ~ 0 + cur_vecs)) %>% t()

  # ---- MNN: align tumors to cell lines ----
  mnn_res <- run_MNN(CCLE_cor, TCGA_cor, k1 = celligner_global$mnn_k_tumor, k2 = celligner_global$mnn_k_CL, ndist = celligner_global$mnn_ndist,
                     subset_genes = DE_gene_set)

  combined_mat <- rbind(mnn_res$corrected, CCLE_cor)

  # clear unused objects
  rm(TCGA_obj); rm(CCLE_obj); rm(cov_diff_eig); rm(TCGA_cor); rm(CCLE_cor); gc()

  # Met500 alignment
  met500_cor <- resid(lm(t(met500$mat) ~ 0 + cur_vecs)) %>% t()

  mnn_res <- run_MNN(combined_mat, met500_cor, k1 = 20, k2 = 50, ndist = celligner_global$mnn_ndist,
                     subset_genes = DE_gene_set)
  combined_mat <- rbind(combined_mat, mnn_res$corrected)

  ## align PDX datasets

  ### PDX - Novartis
  Novartis_PDX_cor <- resid(lm(t(Novartis_PDX$mat) ~ 0 + cur_vecs)) %>% t()

  mnn_res_Novartis_PDX <- run_MNN(combined_mat, Novartis_PDX_cor, k1 = 10, k2 = 50, ndist = 3,
                                  subset_genes = DE_gene_set)

  combined_mat <- rbind(combined_mat, mnn_res_Novartis_PDX$corrected)

  ### PDX - pediatric
  pediatric_PDX_cor <- resid(lm(t(pediatric_PDX$mat) ~ 0 + cur_vecs)) %>% t()

  # use as reference only the samples that are not themselves pediatric PDX samples;
  # a logical mask is used because x[-which(cond), ] drops every row when nothing matches
  reference_rows <- !(rownames(combined_mat) %in% rownames(pediatric_PDX_cor))
  mnn_res_pediatric_PDX <- run_MNN(combined_mat[reference_rows, , drop = FALSE],
                                   pediatric_PDX_cor, k1 = 10, k2 = 50, ndist = 3,
                                   subset_genes = DE_gene_set)

  # transposed here: from this point on columns are samples and rows are genes
  combined_mat <- t(rbind(combined_mat, mnn_res_pediatric_PDX$corrected))

  # combine all output
  ann_columns <- c('sampleID', 'lineage', 'subtype', 'type')
  comb_ann <- rbind.data.frame(comb_ann[,ann_columns],
                               met500$ann[,ann_columns],
                               Novartis_PDX$ann[,ann_columns],
                               pediatric_PDX$ann[,ann_columns])
  rownames(comb_ann) <- comb_ann$sampleID
  comb_ann <- comb_ann[colnames(combined_mat),]

  # clear unused object
  rm(met500); rm(Novartis_PDX); rm(pediatric_PDX); rm(met500_cor); rm(Novartis_PDX_cor); rm(pediatric_PDX_cor); gc()

  # create seurat object
  comb_obj <- create_Seurat_object(combined_mat, comb_ann)
  comb_obj <- cluster_data(comb_obj)

  Celligner_res <- Seurat::Embeddings(comb_obj, reduction = 'umap') %>%
    as.data.frame() %>%
    magrittr::set_colnames(c('UMAP_1', 'UMAP_2')) %>%
    tibble::rownames_to_column(var = 'sampleID') %>%
    dplyr::left_join(comb_obj@meta.data, by = 'sampleID')

  # ---- plots ----
  # label position of each lineage = median UMAP coordinate; the listed lineages are not labeled
  lineage_averages <- Celligner_res %>%
    dplyr::filter(!lineage %in% c('adrenal_cortex', 'embryo', 'endocrine', 'engineered', 'engineered_blood',
                                  'engineered_breast', 'engineered_central_nervous_system', 'engineered_kidney',
                                  'engineered_lung', 'engineered_ovary', 'engineered_prostate', 'epidermoid_carcinoma',
                                  'nasopharynx', 'nerve','pineal', 'teratoma', 'unknown')) %>%
    dplyr::group_by(lineage) %>%
    dplyr::summarise(UMAP_1 = median(UMAP_1, na.rm=T),
                     UMAP_2 = median(UMAP_2, na.rm=T))
  lineage_averages$lineage <- gsub("_", " ", lineage_averages$lineage)
  lineage_lab_aes <- ggplot2::geom_text(data = lineage_averages, mapping = ggplot2::aes(x = UMAP_1, y = UMAP_2, label = lineage), size = 3, color="#000000")


  if('type' %in% colnames(Celligner_res) & 'tumor' %in% Celligner_res$type & 'CL' %in% Celligner_res$type) {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha=0.7, pch=21, ggplot2::aes(color = type, fill = lineage, size = type)) +
      ggplot2::scale_color_manual(values = c(tumor = 'white', CL = 'black')) +
      ggplot2::scale_size_manual(values=c(tumor=0.75, CL=1.5)) +
      ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
      # "none" hides the fill legend; passing FALSE is deprecated in ggplot2 >= 3.3.4
      ggplot2::guides(fill = 'none',
                      color = ggplot2::guide_legend(override.aes = list(color=c('black', 'white'), fill = c('white','black')))) +
      ggplot2::theme_classic()
  } else {
    celligner_plot <- ggplot2::ggplot(Celligner_res, ggplot2::aes(UMAP_1, UMAP_2)) +
      ggplot2::geom_point(alpha=0.7, pch=21, size = 1, ggplot2::aes(fill = lineage)) +
      ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
      ggplot2::theme_classic() + ggplot2::theme(legend.position = 'none')
  }

  print(celligner_plot)
  print(celligner_plot + lineage_lab_aes)


  # ---- optional output ----
  if(!is.null(save_output)) {
    # save_output is used as a folder below, so it has to be an existing directory
    if(dir.exists(save_output)) {
      print('saving files')
      write.csv(combined_mat, file.path(save_output, 'Celligner_multidataset_aligned_data.csv'))
      readr::write_csv(Celligner_res, file.path(save_output, 'Celligner_multidataset_info.csv'))
      ggplot2::ggsave(file.path(save_output, 'Celligner_multidataset_plot.png'), celligner_plot, device='png', width = 8, height = 6)
      ggplot2::ggsave(file.path(save_output, 'labeled_Celligner_multidataset_plot.png'), celligner_plot + lineage_lab_aes, device='png', width = 8, height = 6)

    } else{
      warning("can't save output, folder does not exist")
    }
  }

  return(comb_obj)
}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Celligner

![](docs/celligner_public22q2.png)

__Celligner__ is a computational approach for aligning tumor and cell line transcriptional profiles.

To learn more, see the [paper](https://www.nature.com/articles/s41467-020-20294-x)

## Remark

__Celligner__ is initially an R project that you can find in the `R/` folder.
12 | 13 | A Python version was made that performs the same computations as the R version, but the results may differ slightly due to small implementation differences in the Louvain clustering and contrastive PCA steps. 14 | 15 | ## Overview 16 | 17 | A **reference** expression dataset (e.g. CCLE cell lines) should be fit using the `fit()` function, and a **target** expression dataset (e.g. TCGA+ tumor samples) can then be aligned to this reference using the `transform()` function. See the `run_celligner.py` script for example usage. Celligner is unsupervised and does not require annotations to be run; as such they are not used in this version of the model but can be added post-hoc to aid in interpretation of the output. See the `celligner_output.ipynb` notebook for an example of how to draw an output UMAP. 18 | 19 | The Celligner output can be explored at: [https://depmap.org/portal/celligner/](https://depmap.org/portal/celligner/) 20 | 21 | ## Install 22 | 23 | > To see the old R package installation instructions, see the `R/` folder. 24 | 25 | Before running pip, make sure that you have R installed. 26 | 27 | To install the latest version of Celligner in dev mode, run the following (note that Celligner requires the specific version of mnnpy that is associated with the repository as a submodule): 28 | 29 | ```bash 30 | git clone https://github.com/broadinstitute/celligner.git 31 | cd celligner 32 | git checkout new_dev 33 | pip install -e . 34 | cd mnnpy 35 | pip install . 36 | ``` 37 | 38 | A Dockerfile and build script are also provided. 39 | 40 | 41 | ## Using Celligner 42 | 43 | Celligner has `fit()` and `transform()` functions in the style of scikit-learn models. 44 | 45 | A reference expression dataset (e.g. CCLE cell lines TPM expression) should first be fit: 46 | 47 | ```python 48 | from celligner import Celligner 49 | 50 | my_celligner = Celligner() 51 | my_celligner.fit(CCLE_expression) 52 | ``` 53 | 54 | A target expression dataset (e.g. 
TCGA+ tumor samples) can then be aligned to this reference using the transform function: 55 | 56 | ```python 57 | my_celligner.transform(TCGA_expression) 58 | ``` 59 | 60 | The combined transformed expression matrix can then be accessed via `my_celligner.combined_output`. Clusters, UMAP coordinates and tumor-model distances for all samples can be computed with `my_celligner.computeMetricsForOutput()`. There are also functions to save/load a fitted Celligner model as a .pkl file. 61 | 62 | ### Aligning the target dataset to a new reference dataset 63 | This use case is for the scenario where you want to align the same target dataset to a new reference dataset (which might be the same reference dataset as before with some new samples). In this case you can call transform without the target dataset to re-use the previous target dataset and skip re-doing some computation (see diagram below). 64 | 65 | ```python 66 | my_celligner.fit(new_reference_expression) 67 | my_celligner.transform() 68 | ``` 69 | 70 | ### Aligning a third dataset to the previous combined output 71 | This use case is for the scenario where you have a third dataset (e.g. Met500 tumor samples) that you want to align to the previously aligned (e.g. CCLE+TCGA) dataset. This is the current approach for multi-dataset alignment taken by the Celligner app. 72 | 73 | ```python 74 | my_celligner.makeNewReference() 75 | # The value of k1 should be selected based on the size of the new dataset. 76 | # We use k1=20 for Met500 (n=~850), and k1=10 for the PDX datasets (n=~250-450). 77 | my_celligner.mnn_kwargs.update({"k1":20, "k2":50}) 78 | my_celligner.transform(met500_TPM, compute_cPCs=False) 79 | ``` 80 | 81 | ### Diagram 82 | This diagram provides an overview of how Celligner works, including for the different use cases described above. 83 | 84 | ![](docs/celligner_diagram.png) 85 | 86 | ### Computational complexity 87 | 88 | Depending on the dataset, Celligner can be quite memory hungry.
89 | For TCGA, expect at least _50-60Gb_ of memory to be used. You might need a powerful computer, lots of _swap_ and to increase R's default _maximum allowed memory_. 90 | 91 | You can also use the `low_mem=True` option to reduce the memory used by Celligner in the memory intensive `PCA` & `cPCA` methods. 92 | 93 | 94 | # R Celligner 95 | 96 | For the original R version of celligner, please check the R/README.md file here: [https://github.com/broadinstitute/celligner/tree/master/R/README.md](https://github.com/broadinstitute/celligner/tree/master/R/README.md) 97 | 98 | --- 99 | 100 | __Initial project:__ 101 | 102 | Allie Warren @awarren 103 | 104 | __Initial python version:__ 105 | 106 | Jérémie Kalfon @jkobject 107 | 108 | __Current maintainer:__ 109 | 110 | Barbara De Kegel @bdekegel 111 | -------------------------------------------------------------------------------- /build_docker.sh: -------------------------------------------------------------------------------- 1 | # Builds celligner docker image 2 | # Note that this docker image does not have taigapy or mnnpy installed 3 | 4 | docker buildx build --platform linux/amd64 --push -t us.gcr.io/bdekegel/celligner:latest .
from celligner.params import *
from celligner import limma

from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
import umap.umap_ as umap

import scanpy as sc
from anndata import AnnData

import os
import pickle
import gc

import pandas as pd
import numpy as np

#from contrastive import CPCA
import mnnpy


class Celligner(object):
    """
    Align a target expression dataset (e.g. tumors) to a reference expression
    dataset (e.g. cell lines).

    The workflow mirrors scikit-learn: `fit()` on the reference, `transform()`
    on the target, then optionally `computeMetricsForOutput()`.
    """

    def __init__(
        self,
        topKGenes=TOP_K_GENES,
        pca_ncomp=PCA_NCOMP,
        cpca_ncomp=CPCA_NCOMP,
        louvain_kwargs=LOUVAIN_PARAMS,
        mnn_kwargs=MNN_PARAMS,
        umap_kwargs=UMAP_PARAMS,
        mnn_method="mnn_marioni",
        low_mem=False,
    ):
        """
        Initialize Celligner object

        Args:
            topKGenes (int, optional): see params.py. Defaults to 1000.
            pca_ncomp (int, optional): see params.py. Defaults to 70.
            cpca_ncomp (int, optional): see params.py. Defaults to 4.
            louvain_kwargs (dict, optional): see params.py
            mnn_kwargs (dict, optional): see params.py
            umap_kwargs (dict, optional): see params.py
            mnn_method (str, optional): Only default "mnn_marioni" supported right now.
            low_mem (bool, optional): advised if you have less than 32Gb of RAM. Defaults to False.
        """

        self.topKGenes = topKGenes
        self.pca_ncomp = pca_ncomp
        self.cpca_ncomp = cpca_ncomp
        # Store private copies: the defaults are module-level dicts shared by
        # every instance, and callers are expected to tweak them in place
        # (e.g. `model.mnn_kwargs.update({"k1": 20})`). Without the copy such
        # an update would silently change the defaults of all later models.
        self.louvain_kwargs = dict(louvain_kwargs)
        self.mnn_kwargs = dict(mnn_kwargs)
        self.umap_kwargs = dict(umap_kwargs)
        self.mnn_method = mnn_method
        self.low_mem = low_mem

        # Set by fit()
        self.common_genes = None
        self.ref_input = None
        self.ref_clusters = None
        self.ref_de_genes = None

        # Set by transform()
        self.target_input = None
        self.target_clusters = None
        self.target_de_genes = None

        self.de_genes = None
        self.cpca_loadings = None
        self.cpca_explained_var = None
        self.mnn_pairs = None
        self.combined_output = None

        # Set by computeMetricsForOutput()
        self.umap_reduced = None
        self.output_clusters = None
        self.tumor_CL_dist = None


    def __checkExpression(self, expression, is_reference):
        """
        Checks gene overlap with reference, checks for NaNs, then does mean-centering.

        Args:
            expression (pd.Dataframe): expression data as samples (rows) x genes (columns)
            is_reference (bool): whether the expression is a reference or target

        Raises:
            ValueError: if some common genes are missing from the expression dataset
            ValueError: if the expression matrix contains nan values

        Returns:
            (pd.Dataframe): the expression matrix, restricted to the common genes
                and with the per-gene mean subtracted
        """
        # Check gene overlap
        n_shared = expression.columns.isin(self.common_genes).sum()
        if n_shared < len(self.common_genes):
            if not is_reference:
                raise ValueError("Some genes from reference dataset not found in target dataset")
            else:
                raise ValueError("Some genes from previously fit target dataset not found in new reference dataset")

        # Subset to (and order by) the common genes
        expression = expression.loc[:, self.common_genes].astype(float)

        # Raise issue if there are any NaNs in the expression dataframe
        if expression.isnull().values.any():
            raise ValueError("Expression dataframe contains NaNs")

        # Mean center each gene (column) of the expression dataframe
        expression = expression.sub(expression.mean(0), 1)

        return expression


    def __cluster(self, expression):
        """
        Cluster expression in PCA space (`pca_ncomp` components) using Louvain
        clustering on a nearest-neighbor graph.

        Args:
            expression (pd.Dataframe): expression data as samples (rows) x genes (columns)

        Returns:
            (np.ndarray): integer cluster label for each sample
        """
        # Create anndata object
        adata = AnnData(expression, dtype='float64')

        # Find PCs
        print("Doing PCA..")
        sc.tl.pca(adata, n_comps=self.pca_ncomp, zero_center=True, svd_solver='arpack')

        # Find shared nearest neighbors (SNN) in PC space
        # Might produce different results from the R version as ScanPy and Seurat differ in their implementation.
        print("Computing neighbors..")
        sc.pp.neighbors(adata, knn=True, use_rep='X_pca', n_neighbors=20, n_pcs=self.pca_ncomp)

        print("Clustering..")
        sc.tl.louvain(adata, use_weights=True, **self.louvain_kwargs)
        fit_clusters = adata.obs["louvain"].values.astype(int)

        # The AnnData object holds a full copy of the data; free it eagerly
        del adata
        gc.collect()

        return fit_clusters


    def __runDiffExprOnClusters(self, expression, clusters):
        """
        Runs limma (R) on the clustered data.

        Args:
            expression (pd.Dataframe): expression data as samples (rows) x genes (columns)
            clusters (array-like): the cluster labels (per sample); -1 marks unassigned samples

        Returns:
            (pd.Dataframe): limmapy results, one row per gene, sorted by decreasing F statistic
        """
        # Element-wise comparisons below need an array, not a plain list
        clusters = np.asarray(clusters)

        # Samples labelled -1 are treated as unclustered and are left out
        clusts = sorted(set(clusters) - {-1})
        print("Running differential expression on " + str(len(clusts)) + " clusters..")

        # make a design matrix: one indicator column per cluster
        design_matrix = pd.DataFrame(
            index=expression.index,
            data=np.array([clusters == i for i in clusts]).T,
            columns=["C" + str(i) + "C" for i in clusts],
        )
        design_matrix.index = design_matrix.index.astype(str).str.replace("-", ".")
        design_matrix = design_matrix[design_matrix.sum(1) > 0]

        # creating the matrix: genes (rows) x clustered samples (columns)
        data = expression.T
        data = data[data.columns[clusters != -1].tolist()]

        # running limmapy
        print("Running limmapy..")
        res = (
            limma.limmapy()
            .lmFit(data, design_matrix)
            .eBayes(trend=False)
            .topTable(number=len(data))
            # drop the leading per-cluster coefficient columns
            .iloc[:, len(clusts):]
        )
        return res.sort_values(by="F", ascending=False)


    def __runCPCA(self, centered_ref_input, centered_target_input):
        """
        Perform contrastive PCA on the centered reference and target expression datasets

        Args:
            centered_ref_input (pd.DataFrame): reference expression matrix where the cluster mean has been subtracted
            centered_target_input (pd.DataFrame): target expression matrix where the cluster mean has been subtracted

        Returns:
            (ndarray, ncomponents x ngenes): principal axes in feature space
            (ndarray, ncomponents,): variance explained by each component

        """
        target_cov = centered_target_input.cov()
        ref_cov = centered_ref_input.cov()
        if not self.low_mem:
            pca = PCA(self.cpca_ncomp, svd_solver="randomized", copy=False)
        else:
            pca = IncrementalPCA(self.cpca_ncomp, copy=False, batch_size=1000)

        # PCA on the difference of the two gene-gene covariance matrices
        pca.fit(target_cov - ref_cov)
        return pca.components_, pca.explained_variance_


    def __regressOutCPCs(self, expression):
        """
        Return the residuals of regressing `expression` on the stored cPCA loadings.

        Args:
            expression (pd.DataFrame): samples (rows) x genes (columns), same gene order as the loadings

        Returns:
            (pd.DataFrame): `expression` minus its fit on `self.cpca_loadings`
        """
        fitted = (
            LinearRegression(fit_intercept=False)
            .fit(self.cpca_loadings.T, expression.T)
            .predict(self.cpca_loadings.T)
            .T
        )
        return expression - fitted


    @staticmethod
    def __subtractClusterMeans(expression, clusters):
        """
        Subtract from every sample the mean expression of the cluster it belongs to.

        Args:
            expression (pd.DataFrame): samples (rows) x genes (columns)
            clusters (np.ndarray): cluster label per sample, aligned with the rows

        Returns:
            (pd.DataFrame): cluster-centered expression, in the original row order
        """
        centered = pd.concat(
            [
                expression.loc[clusters == val] - expression.loc[clusters == val].mean(axis=0)
                for val in set(clusters)
            ]
        )
        return centered.loc[expression.index]


    def fit(self, ref_expr):
        """
        Fit the model to the reference expression dataset - cluster + find differentially expressed genes.

        Args:
            ref_expr (pd.Dataframe): reference expression matrix of samples (rows) by genes (columns),
                where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.
                In the standard Celligner pipeline this is the cell line data.

        Raises:
            ValueError: if only 1 cluster is found in the PCs of the expression
        """

        self.common_genes = list(ref_expr.columns)
        self.ref_input = self.__checkExpression(ref_expr, is_reference=True)

        # Cluster and find differential expression for reference data
        self.ref_clusters = self.__cluster(self.ref_input)
        if len(set(self.ref_clusters)) < 2:
            raise ValueError("Only one cluster found in reference data, no differential expression possible")
        self.ref_de_genes = self.__runDiffExprOnClusters(self.ref_input, self.ref_clusters)

        return self


    def transform(self, target_expr=None, compute_cPCs=True):
        """
        Align samples in the target dataset to samples in the reference dataset

        Args:
            target_expr (pd.Dataframe, optional): target expression matrix of samples (rows) by genes (columns),
                where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.
                In the standard Celligner pipeline this is the tumor data (TCGA).
                Set to None if re-running transform with new reference data.
            compute_cPCs (bool, optional): if True, compute cPCs from the fitted reference and target expression. Defaults to True.

        Raises:
            ValueError: if compute_cPCs is True but there is no reference input (fit has not been run)
            ValueError: if compute_cPCs is False but there are no previously computed cPCs available (transform has not been previously run)
            ValueError: if no target expression is provided and there is no previously provided target data
            ValueError: if no target expression is provided and compute_cPCs is False; there is no use case for this
            ValueError: if there are not enough clusters to compute DE genes for the target dataset
        """

        if self.ref_input is None and compute_cPCs:
            raise ValueError("Need fitted reference dataset to compute cPCs, run fit function first")

        if not compute_cPCs and self.cpca_loadings is None:
            raise ValueError("No cPCs found, transform needs to be run with compute_cPCs==True at least once")

        if target_expr is None and self.target_input is None:
            raise ValueError("No previous data found for target, transform needs to be run with target expression at least once")

        if not compute_cPCs and target_expr is None:
            raise ValueError("No use case for running transform without new target data when compute_cPCs==False")

        if compute_cPCs:

            if target_expr is not None:

                self.target_input = self.__checkExpression(target_expr, is_reference=False)

                # Cluster and find differential expression for target data
                self.target_clusters = self.__cluster(self.target_input)
                if len(set(self.target_clusters)) < 2:
                    raise ValueError("Only one cluster found in target data, no differential expression possible")
                self.target_de_genes = self.__runDiffExprOnClusters(self.target_input, self.target_clusters)

            else:
                print("INFO: No new target expression provided, using previously provided target dataset")

            # Union of the top differentially expressed genes in each dataset.
            # Rebuilt on every call so that a re-fitted reference contributes
            # its own DE genes even when the previous target is re-used.
            self.de_genes = pd.Series(
                list(self.ref_de_genes[:self.topKGenes].index)
                + list(self.target_de_genes[:self.topKGenes].index)
            ).drop_duplicates().to_list()

            # Subtract cluster average from cluster samples
            centered_ref_input = self.__subtractClusterMeans(self.ref_input, self.ref_clusters)
            centered_target_input = self.__subtractClusterMeans(self.target_input, self.target_clusters)

            # Compute contrastive PCs
            print("Running cPCA..")
            self.cpca_loadings, self.cpca_explained_var = self.__runCPCA(centered_ref_input, centered_target_input)

            del centered_ref_input, centered_target_input
            gc.collect()

            print("Regressing top cPCs out of reference dataset..")
            # Take the residuals of the linear regression of ref_input with the cpca_loadings
            transformed_ref = self.__regressOutCPCs(self.ref_input)

        # Using previously computed cPCs - for multi-dataset alignment
        else:

            # Allow some genes to be missing in new target dataset
            keep_mask = self.ref_input.columns.isin(target_expr.columns)
            n_missing = int((~keep_mask).sum())
            if n_missing > 0:
                print('WARNING: %d genes from reference dataset not found in new target dataset, subsetting to overlap' % (n_missing))

                # Filter reference dataset
                self.ref_input = self.ref_input.loc[:, keep_mask]
                self.common_genes = list(self.ref_input.columns)

                # Drop cPCA loadings (columns = genes) for genes that were filtered out
                self.cpca_loadings = np.delete(self.cpca_loadings, np.flatnonzero(~keep_mask), axis=1)

                # Check if genes need to be dropped from DE list
                kept_genes = set(self.common_genes)
                remaining_de = [g for g in self.de_genes if g in kept_genes]
                if len(remaining_de) < len(self.de_genes):
                    print('WARNING: dropped genes include %d differentially expressed genes that may be important' % (len(self.de_genes) - len(remaining_de)))
                    self.de_genes = remaining_de

            self.target_input = self.__checkExpression(target_expr, is_reference=False)
            # The reference is already cPC-corrected output from a previous transform
            transformed_ref = self.ref_input

        # The target always needs the cPCs regressed out
        print("Regressing top cPCs out of target dataset..")
        transformed_target = self.__regressOutCPCs(self.target_input)

        # Do MNN
        print("Doing the MNN analysis using Marioni et al. method..")
        # Use top DE genes only (boolean mask over the gene columns)
        varsubset = self.target_input.columns.isin(self.de_genes)
        target_corrected, self.mnn_pairs = mnnpy.marioniCorrect(
            transformed_ref,
            transformed_target,
            var_index=list(range(len(self.ref_input.columns))),
            var_subset=varsubset,
            **self.mnn_kwargs,
        )

        if compute_cPCs:
            self.combined_output = pd.concat([target_corrected, transformed_ref])
        else: # Append at the end for multi-dataset alignment case
            self.combined_output = pd.concat([transformed_ref, target_corrected])

        del target_corrected
        gc.collect()

        print('Done')

        return self


    def computeMetricsForOutput(self, umap_rand_seed=14, UMAP_only=False, model_ids=None, tumor_ids=None):
        """
        Compute UMAP embedding and optionally clusters and tumor - model distance.

        Args:
            umap_rand_seed (int, optional): Set seed for UMAP, to try an alternative. Defaults to 14.
            UMAP_only (bool, optional): Only recompute the UMAP. Defaults to False.
            model_ids (list, optional): model IDs for computing tumor-CL distance. Defaults to None, in which case the reference index is used.
            tumor_ids (list, optional): tumor IDs for computing tumor-CL distance. Defaults to None, in which case the target index is used.

        Raises:
            ValueError: if there is no corrected expression matrix
            ValueError: if tumor_ids is not given and there is no target dataset to take the IDs from
        """
        if self.combined_output is None:
            raise ValueError("No corrected expression matrix found, run this function after transform()")

        print("Computing UMAP embedding...")
        # Compute UMAP embedding for results (on the top PCs of the combined output)
        pca = PCA(self.pca_ncomp)
        pcs = pca.fit_transform(self.combined_output)

        umap_reduced = umap.UMAP(**self.umap_kwargs, random_state=umap_rand_seed).fit_transform(pcs)
        self.umap_reduced = pd.DataFrame(umap_reduced, index=self.combined_output.index, columns=['umap1','umap2'])

        if not UMAP_only:

            print('Computing clusters..')
            self.output_clusters = self.__cluster(self.combined_output)

            print("Computing tumor-CL distance..")
            pcs = pd.DataFrame(pcs, index=self.combined_output.index)
            if model_ids is None:
                model_ids = self.ref_input.index
            if tumor_ids is None:
                # target_input is cleared by makeNewReference()
                if self.target_input is None:
                    raise ValueError("No target dataset found, provide tumor_ids or run transform() first")
                tumor_ids = self.target_input.index
            model_pcs = pcs[pcs.index.isin(model_ids)]
            tumor_pcs = pcs[pcs.index.isin(tumor_ids)]

            # Euclidean distance in PC space: tumors (rows) x models (columns)
            self.tumor_CL_dist = pd.DataFrame(metrics.pairwise_distances(tumor_pcs, model_pcs), index=tumor_pcs.index, columns=model_pcs.index)

        return self


    def makeNewReference(self):
        """
        Make a new reference dataset from the previously transformed reference+target datasets.
        Used for multi-dataset alignment with previously computed cPCs and DE genes.

        Raises:
            ValueError: if there is no combined output yet (transform has not been run)
        """
        if self.combined_output is None:
            raise ValueError("No corrected expression matrix found, run this function after transform()")
        self.ref_input = self.combined_output
        self.target_input = None
        return self


    def save(self, file_name):
        """
        Save the model as a pickle file

        Args:
            file_name (str): name of file in which to save the model
        """
        # save the model
        with open(os.path.normpath(file_name), "wb") as f:
            pickle.dump(self, f)


    def load(self, file_name):
        """
        Load the model from a pickle file into this instance

        Args:
            file_name (str): pickle file to load the model from

        Returns:
            (Celligner): this instance, with its attributes replaced by the loaded model's
        """
        # SECURITY: unpickling executes arbitrary code; only load files you trust.
        with open(os.path.normpath(file_name), "rb") as f:
            model = pickle.load(f)
        self.__dict__.update(model.__dict__)
        return self
class limmapy:
    '''
    limma object through rpy2

    Each step stores the R fit on the instance and returns `self`, so calls
    can be chained: `limmapy().lmFit(...).eBayes(...).topTable(...)`.

    input:
        count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id
            example:
            id      sampleA sampleB
            geneA   5       1
            geneB   4       5
            geneC   1       2
        design_matrix: an design matrix in the form of pandas dataframe, see limma manual, samplenames as rownames
                    treatment1, treatment2, ...
            sampleA    A    A    B
            sampleA    A    A    B
            sampleB    B    B    A
            sampleB    B    A    B
    '''

    def __init__(self):
        self.limma_result = None
        # R fit object; set by lmFit() and replaced by eBayes()
        self.fit = None

    def lmFit(self, count_matrix, design_matrix, **kwargs):
        """Fit limma's linear model; extra kwargs are forwarded to R `lmFit`."""
        with localconverter(ro.default_converter + pandas2ri.converter):
            # NOTE(review): both matrices are cast to int before conversion, so
            # any fractional values in `count_matrix` are truncated. Celligner
            # passes an expression matrix here -- confirm the cast is intended
            # for non-count input.
            count_matrix = pandas2ri.py2rpy(count_matrix.astype(int))
            design_matrix = pandas2ri.py2rpy(design_matrix.astype(int))
        self.fit = limma.lmFit(count_matrix, design_matrix, **kwargs)
        return self

    def eBayes(self, **kwargs):
        """Apply R `eBayes` to the stored fit; requires a prior lmFit()."""
        if self.fit is None:
            raise RuntimeError("No fit available, run lmFit() before eBayes()")
        self.fit = limma.eBayes(self.fit, **kwargs)
        return self

    def topTable(self, **kwargs):
        """Return R `topTable` for the stored fit, as a pandas dataframe when R returns a data.frame."""
        if self.fit is None:
            raise RuntimeError("No fit available, run lmFit() before topTable()")
        val = limma.topTable(self.fit, **kwargs)
        if isinstance(val, robjects.vectors.DataFrame):
            with robjects.conversion.localconverter(
                    robjects.default_converter + pandas2ri.converter):
                val = ro.conversion.rpy2py(val)
        return val
# Colour per Oncotree tissue name
TISSUE_COLOR_OT = {
    "Adrenal Gland": "#E13978",
    "Ampulla of Vater": "#F5899E",
    "Biliary Tract": "#C091E3",
    "Bladder/Urinary Tract": "#E08571",
    "Bone": "#9F55BB",
    "Breast": "#45A132",
    "Bowel": "#96568E",
    "CNS/Brain": "#F5899E",
    "Cervix": "#5AB172",
    "Esophagus/Stomach": "#DFBC3A",
    "Eye": "#349077",
    "Fibroblast": "#D8AB6A",
    "Embryonal": "#75DFBB",
    "Head and Neck": "#5DA134",
    "Kidney": "#1F8FFF",
    "Liver": "#9C5E2B",
    "Lung": "#51D5E0",
    "Lymphoid": "#ABD23F",
    "Myeloid": "#DA45BB",
    "Normal": "#555555",
    "Ovary/Fallopian Tube": "#56E79D",
    "Pancreas": "#B644DC",
    "Peripheral Nervous System": "#73E03D",
    "Pleura": "#F5899E",  # marked "###" in the original table
    "Prostate": "#3870C9",
    "Skin": "#6C55E2",
    "Soft Tissue": "#5FDB69",
    "Testis": "#F5899E",  # marked "###" in the original table
    "Thymus": "#659FD9",
    "Thyroid": "#D74829",
    "Other/Unknown": "#bdbdbd",
    "Uterus": "#E491C1",
    "Vulva/Vagina": "#E491C1",
}

# Colour per lineage name
TISSUE_COLOR = {
    "engineered": "#bcdfbd",
    "fibroblast": "#9eAeAe",
    "other": "#A3969d",
    "skin": "#969696",
    "soft_tissue": "#cedb9c",
    "sarcomatoid": "#cdcdbd",
    "unknown": "#bdbdbd",
    "NS": "#becdbd",
    "teratoma": "#252525",
    "germ_cell": "#c7c7c7",
    "embryo": "#7f7f7f",
    "bone": "#aec7e8",
    "lymphocyte": "#17becf",
    "plasma_cell": "#9edae5",
    "blood": "#1f77b4",
    "engineered_blood": "#2f87b4",
    "central_nervous_system": "#ff7f0e",
    "engineered_central_nervous_system": "#ff8f3f",
    "peripheral_nervous_system": "#ffbb78",
    "nerve": "#dbdb8d",
    "autonomic_ganglia": "#ebcb8d",
    "eye": "#bcbd22",
    "lung": "#d62728",
    "engineered_lung": "#ee2e3e",
    "upper_aerodigestive": "#ff9896",
    "esophagus": "#e7969c",
    "nasopharynx": "#f7b6d2",
    "oral": "#feceee",
    "parotid": "#fdbf6f",
    "stomach": "#e377c2",
    "gall_bladder": "#ff7f0e",
    "bile_duct": "#a55194",
    "engineered_bile_duct": "#a55194",
    "ampulla_of_vater": "#ad3184",
    "pancreas": "#e377c2",
    "liver": "#9467bd",
    "gastric": "#c49c94",
    "small_intestine": "#9e5e6e",
    "colon": "#8c564b",
    "ovary": "#2ca02c",
    "engineered_ovary": "#4eae4e",
    "uterus": "#98df8a",
    "cervix": "#5ab172",
    "breast": "#393b79",
    "engineered_breast": "#4e3e7e",
    "kidney": "#386cb0",
    "engineered_kidney": "#386cb0",
    "bladder": "#397cb9",
    "urinary_tract": "#b644dc",
    "prostate": "#637939",
    "engineered_prostate": "#6e7e3e",
    "testis": "#8c6d31",
    "thyroid": "#8f7e3e",
    "endocrine": "#bd9e39",
    "biliary_tract": "#e7ba52",
    "adrenal": "#8ca252",
    "thymus": "#659fd9",
}

# Alternative colour per lineage name ("_R" table)
TISSUE_COLOR_R = {
    "central_nervous_system": "#f5899e",
    "engineered_central_nervous_system": "#f5899e",
    "teratoma": "#f5899e",
    "bone": "#9f55bb",
    "pancreas": "#b644dc",
    "soft_tissue": "#5fdb69",
    "skin": "#6c55e2",
    "liver": "#9c5e2b",
    "blood": "#da45bb",
    "lymphocyte": "#abd23f",
    "peripheral_nervous_system": "#73e03d",
    "ovary": "#56e79d",
    "engineered_ovary": "#56e79d",
    "adrenal": "#e13978",
    "adrenal_cortex": "#e13978",
    "upper_aerodigestive": "#5da134",
    "kidney": "#1f8fff",
    "engineered_kidney": "#1f8fff",
    "gastric": "#dfbc3a",
    "eye": "#349077",
    "nasopharynx": "#a9e082",
    "nerve": "#c44c90",
    "unknown": "#999999",
    "cervix": "#5ab172",
    "thyroid": "#d74829",
    "lung": "#51d5e0",
    "engineered_lung": "#51d5e0",
    "rhabdoid": "#d04850",
    "germ_cell": "#75dfbb",
    "embryo": "#75dfbb",
    "colorectal": "#96568e",
    "endocrine": "#d1d684",
    "bile_duct": "#c091e3",
    "pineal": "#949031",
    "thymus": "#659fd9",
    "mesothelioma": "#dc882d",
    "prostate": "#3870c9",
    "engineered_prostate": "#3870c9",
    "uterus": "#e491c1",
    "breast": "#45a132",
    "engineered_breast": "#45a132",
    "urinary_tract": "#e08571",
    "esophagus": "#6a6c2c",
    "fibroblast": "#d8ab6a",
    "plasma_cell": "#e6c241",
}


# Genes whose differential-expression rank is better than this, in either the
# cell line or the tumor data, are used to identify mutual nearest neighbors
# in the MNN alignment step
TOP_K_GENES = 1000

# number of PCs to use for dimensionality reduction
PCA_NCOMP = 70

# number of cPCA dimensions to regress out of the data
CPCA_NCOMP = 4

# @see https://scanpy.readthedocs.io/en/latest/generated/scanpy.tl.louvain.html
LOUVAIN_PARAMS = dict(
    resolution=5,  # resolution parameter used for clustering the data
)

# For the Marioni et al. MNN method (default)
MNN_PARAMS = dict(
    k1=5,  # number of nearest neighbors of tumors in the cell line data
    k2=50,  # number of nearest neighbors of cell lines in the tumor data
    cosine_norm=False,
    fk=5,
)

UMAP_PARAMS = dict(
    n_neighbors=10,  # num nearest neighbors used to create UMAP plot
    n_components=2,
    metric="euclidean",  # distance metric used for the UMAP projection
    min_dist=0.5,  # min distance used to create UMAP plot
)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/celligner_public22q2.png -------------------------------------------------------------------------------- /docs/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/example.pdf -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](https://www.mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs -h` - Print help message and exit. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 18 | -------------------------------------------------------------------------------- /docs/typical_celligner.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/celligner/d9c9246f8a1b6885d07f2f28bbdca24253e57cf1/docs/typical_celligner.webp -------------------------------------------------------------------------------- /install_submodules_and_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #install submodules 4 | echo "INSTALLING SUBMODULES..." 5 | 6 | #upgrade pip 7 | pip install --upgrade pip 8 | 9 | #setup other dependencies 10 | pip install taigapy 11 | cd mnnpy; pip install .; cd .. 
12 | 13 | #run Celligner 14 | echo "RUNNING CELLIGNER..." 15 | python "$@" -------------------------------------------------------------------------------- /man/calc_gene_stats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{calc_gene_stats} 4 | \alias{calc_gene_stats} 5 | \title{Method to calculate gene average expression and variance for an expression matrix} 6 | \usage{ 7 | calc_gene_stats(dat, hgnc_data_name, hgnc_data_file, hgnc_version, hgnc_taiga) 8 | } 9 | \arguments{ 10 | \item{dat:}{data object containing tumor and cell line expression data and annotations produced by running load_data} 11 | 12 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations, 13 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations} 14 | 15 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations, 16 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations} 17 | 18 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL} 19 | 20 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder} 21 | } 22 | \value{ 23 | gene stats matrix 24 | } 25 | \description{ 26 | calculate the average gene expression and variance 27 | } 28 | -------------------------------------------------------------------------------- /man/calc_tumor_CL_cor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{calc_tumor_CL_cor} 4 | \alias{calc_tumor_CL_cor} 5 | \title{Method to calculate the 
correlation between cell lines and tumors in the Celligner aligned data} 6 | \usage{ 7 | calc_tumor_CL_cor(Celligner_aligned_data, Celligner_info) 8 | } 9 | \arguments{ 10 | \item{Celligner_aligned_data:}{Celligner aligned data matrix of samples (cell lines and tumors) by genes} 11 | 12 | \item{Celligner_info:}{annotation file of cell line and tumor samples with a column 'type' marking samples as either 13 | cell lines or tumors and a column 'sampleID' that matches the row names of Celligner_aligned_data} 14 | } 15 | \value{ 16 | matrix of correlations that is tumors by cell lines 17 | } 18 | \description{ 19 | calculate the correlation between cell line and tumor samples in the Celligner aligned data 20 | } 21 | -------------------------------------------------------------------------------- /man/check_NAs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{check_NAs} 4 | \alias{check_NAs} 5 | \title{check for NAs in the expression data and remove samples with NAs} 6 | \usage{ 7 | check_NAs(mat) 8 | } 9 | \arguments{ 10 | \item{mat:}{matrix of gene expression data that is samples by genes} 11 | } 12 | \value{ 13 | matrix of gene expression data, removing samples that have NAs 14 | } 15 | \description{ 16 | check for NAs in the expression data and remove samples with NAs 17 | } 18 | -------------------------------------------------------------------------------- /man/cluster_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{cluster_data} 4 | \alias{cluster_data} 5 | \title{Method to take in a Seurat object and run default Seurat clustering algorithm} 6 | \usage{ 7 | cluster_data(seu_obj) 8 | } 9 | \arguments{ 10 | \item{seu_obj:}{seurat object containing expression 
data and sample annotations. 11 | Expects that PCA for the seurat object has already been calculated.} 12 | } 13 | \value{ 14 | Seurat object with cluster annotations 15 | } 16 | \description{ 17 | cluster data in seurat object, using default Seurat clustering method. Clusters data 18 | within PCA space using the number of dimensions provided in celligner_global$n_PC_dims (default is 70) 19 | } 20 | -------------------------------------------------------------------------------- /man/create_Seurat_object.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{create_Seurat_object} 4 | \alias{create_Seurat_object} 5 | \title{Method to create seurat objects given an expression matrix and annotation table} 6 | \usage{ 7 | create_Seurat_object(exp_mat, ann, type = NULL) 8 | } 9 | \arguments{ 10 | \item{exp_mat:}{matrix of samples by genes, where genes are ensembl gene IDs. Data should be log2(X+1) TPM data.} 11 | 12 | \item{ann:}{matrix of sample annotations. Expects column 'sampleID' which matches the rownames of exp_mat.} 13 | 14 | \item{type:}{optional parameter, string specifying the data type of the current data (ex. 'tumor'), which is added to the annotation matrix.} 15 | } 16 | \value{ 17 | Seurat object with scaled expression data and annotations stored in meta.data 18 | } 19 | \description{ 20 | create Seurat object of expression data and annotations and run dimensionality reduction. 21 | Dimensionality reductions will be run with the parameters (n_PC_dims, umap_n_neighbors, umap_min_dist, distance_metric) specified in celligner_global. 
22 | } 23 | -------------------------------------------------------------------------------- /man/dot-average_correction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{.average_correction} 4 | \alias{.average_correction} 5 | \title{calculate the average correction vector} 6 | \usage{ 7 | .average_correction(refdata, mnn1, curdata, mnn2) 8 | } 9 | \arguments{ 10 | \item{refdata:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment. 11 | In the standard Celligner pipeline this is the cell line data.} 12 | 13 | \item{mnn1:}{mnn1 pairs} 14 | 15 | \item{curdata:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data. 16 | In the standard Celligner pipeline this is the tumor data.} 17 | 18 | \item{mnn2:}{mnn2 pairs} 19 | } 20 | \value{ 21 | correction vector and pairs 22 | } 23 | \description{ 24 | Computes correction vectors for each MNN pair, and then averages them for each MNN-involved cell in the second batch. 
25 | Copied from dev version of scran (2018-10-28), with slight modifications as noted https://github.com/MarioniLab/scran 26 | } 27 | -------------------------------------------------------------------------------- /man/dot-center_along_batch_vector.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{.center_along_batch_vector} 4 | \alias{.center_along_batch_vector} 5 | \title{centers samples within each batch} 6 | \usage{ 7 | .center_along_batch_vector(mat, batch.vec) 8 | } 9 | \arguments{ 10 | \item{mat:}{matrix of samples by genes} 11 | 12 | \item{batch.vec:}{batch vector} 13 | } 14 | \value{ 15 | correction vector and pairs 16 | } 17 | \description{ 18 | Projecting along the batch vector, and shifting all samples to the center within each batch. 19 | This removes any variation along the overall batch vector within each matrix. 20 | } 21 | -------------------------------------------------------------------------------- /man/dot-compute_tricube_average.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{.compute_tricube_average} 4 | \alias{.compute_tricube_average} 5 | \title{compute tricube averages} 6 | \usage{ 7 | .compute_tricube_average(vals, indices, distances, bandwidth = NULL, ndist = 3) 8 | } 9 | \arguments{ 10 | \item{vals:}{correction vector} 11 | 12 | \item{indices:}{nxk matrix for the nearest neighbor indices} 13 | 14 | \item{distances:}{nxk matrix for the nearest neighbor Euclidean distances} 15 | 16 | \item{bandwidth:}{Is set at 'ndist' times the median distance, if not specified.} 17 | 18 | \item{ndist:}{By default is 3.} 19 | } 20 | \description{ 21 | Centralized function to compute tricube averages. 
22 | } 23 | -------------------------------------------------------------------------------- /man/dot-tricube_weighted_correction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{.tricube_weighted_correction} 4 | \alias{.tricube_weighted_correction} 5 | \title{tricube-weighted correction} 6 | \usage{ 7 | .tricube_weighted_correction( 8 | curdata, 9 | correction, 10 | in.mnn, 11 | k = 20, 12 | ndist = 3, 13 | subset_genes, 14 | BNPARAM = NULL, 15 | BPPARAM = BiocParallel::SerialParam() 16 | ) 17 | } 18 | \arguments{ 19 | \item{curdata:}{target matrix of samples by genes} 20 | 21 | \item{correction:}{corrected vector} 22 | 23 | \item{in.mnn:}{mnn pairs} 24 | 25 | \item{k:}{k values, default 20} 26 | 27 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors. 28 | By default is 3.} 29 | 30 | \item{subset_genes:}{genes used to identify mutual nearest neighbors} 31 | 32 | \item{BNPARAM:}{default NULL} 33 | 34 | \item{BPPARAM:}{default BiocParallel::SerialParam()} 35 | } 36 | \value{ 37 | MNN corrected data 38 | } 39 | \description{ 40 | Computing tricube-weighted correction vectors for individual samples, 41 | using the nearest neighbouring samples involved in MNN pairs. 
42 | Modified to use FNN rather than queryKNN for nearest neighbor finding 43 | } 44 | -------------------------------------------------------------------------------- /man/find_differentially_expressed_genes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{find_differentially_expressed_genes} 4 | \alias{find_differentially_expressed_genes} 5 | \title{Method to find genes that are differentially expressed between clusters within the expression data} 6 | \usage{ 7 | find_differentially_expressed_genes(seu_obj) 8 | } 9 | \arguments{ 10 | \item{seu_obj:}{seurat object containing expression data and sample annotations. Expects data in the Seurat object 11 | slot scale.data and a column 'seurat_clusters' within the meta.data of the Seurat object.} 12 | } 13 | \value{ 14 | table with gene level stats 15 | } 16 | \description{ 17 | find genes that are differentially expressed between clusters within the expression data 18 | } 19 | -------------------------------------------------------------------------------- /man/get_cluster_averages.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{get_cluster_averages} 4 | \alias{get_cluster_averages} 5 | \title{calculate the average expression per cluster} 6 | \usage{ 7 | get_cluster_averages(mat, cluster_df) 8 | } 9 | \arguments{ 10 | \item{mat:}{sample by genes matrix of expression data} 11 | 12 | \item{cluster_df:}{table of sample metadata that includes a column 'seurat_clusters', 13 | containing transcriptional clusters} 14 | } 15 | \value{ 16 | average cluster expression 17 | } 18 | \description{ 19 | calculate the average expression per cluster 20 | } 21 | 
-------------------------------------------------------------------------------- /man/load_additional_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutlidataset_alignment.R 3 | \name{load_additional_data} 4 | \alias{load_additional_data} 5 | \title{Load additional expression and annotation data} 6 | \usage{ 7 | load_additional_data( 8 | data_name, 9 | data_file, 10 | data_version = NULL, 11 | data_taiga = TRUE, 12 | ann_name, 13 | ann_file, 14 | ann_version = NULL, 15 | ann_taiga = TRUE, 16 | data_type = "" 17 | ) 18 | } 19 | \arguments{ 20 | \item{data_name:}{if data_taiga = TRUE, then the data.name of the taiga file containing the expression data, 21 | if data_taiga=FALSE, then the file path to the local folder containing the expression data. Assumes that genes 22 | are labeled using ensembl IDs and that there are fewer samples than genes in the matrix, will transpose the matrix 23 | so that rows are samples and columns are genes.} 24 | 25 | \item{data_file:}{if data_taiga = TRUE, then the data.file of the taiga file containing the expression data, 26 | if data_taiga = FALSE, then the name of the file of expression data} 27 | 28 | \item{data_version:}{(optional) parameter to specify the version to pull from taiga for the expression data, default set to NULL} 29 | 30 | \item{data_taiga:}{if TRUE then pulls the expression data from taiga, if FALSE then finds expression data in local folder} 31 | 32 | \item{ann_name:}{if ann_taiga = TRUE, then the data.name of the taiga file containing the data annotations, 33 | if ann_taiga=FALSE, then the file path to the local folder containing the annotations} 34 | 35 | \item{ann_file:}{if ann_taiga = TRUE, then the data.file of the taiga file containing the data annotations, 36 | if ann_taiga=FALSE, then the name of the file of data annotations} 37 | 38 | \item{ann_version:}{(optional) parameter to 
specify the version to pull from taiga for the annotations, default set to NULL} 39 | 40 | \item{ann_taiga:}{if TRUE (default) then pulls the annotations from taiga, if FALSE then finds cell line annotations in local folder} 41 | 42 | \item{data_type:}{string added to the annotation file under the column type to specify the data, default is ""} 43 | } 44 | \value{ 45 | object containing expression matrix and annotations table 46 | } 47 | \description{ 48 | load additional expression and annotation files 49 | } 50 | -------------------------------------------------------------------------------- /man/load_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{load_data} 4 | \alias{load_data} 5 | \title{method to load in tumor and cell line expression data and annotations} 6 | \usage{ 7 | load_data( 8 | cell_line_data_name, 9 | cell_line_data_file, 10 | cell_line_version, 11 | cell_line_taiga, 12 | cell_line_ann_name, 13 | cell_line_ann_file, 14 | cell_line_ann_version, 15 | cell_line_ann_taiga, 16 | tumor_data_name, 17 | tumor_data_file, 18 | tumor_version, 19 | tumor_taiga, 20 | tumor_ann_name, 21 | tumor_ann_file, 22 | tumor_ann_version, 23 | tumor_ann_taiga, 24 | additional_annotations_name, 25 | additional_annotations_file, 26 | additional_annotations_version, 27 | additional_annotations_taiga, 28 | hgnc_data_name, 29 | hgnc_data_file, 30 | hgnc_version, 31 | hgnc_taiga 32 | ) 33 | } 34 | \arguments{ 35 | \item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data, 36 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data} 37 | 38 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data, 39 | if 
cell_line_taiga=FALSE, then the name of the file of cell line expression data} 40 | 41 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL} 42 | 43 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder} 44 | 45 | \item{cell_line_ann_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations, 46 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations} 47 | 48 | \item{cell_line_ann_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations, 49 | if cell_line_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan 50 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.} 51 | 52 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL} 53 | 54 | \item{cell_line_ann_taiga:}{if TRUE then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder} 55 | 56 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data, 57 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data. 
58 | If pulling from taiga, assumes that the file is the already created Celligner info file used in the Celligner manuscript, 59 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.} 60 | 61 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data, 62 | if tumor_taiga=FALSE, then the name of the file of the tumor expression data} 63 | 64 | \item{tumor_taiga:}{if TRUE then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder} 65 | 66 | \item{tumor_ann_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations, 67 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations} 68 | 69 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations, 70 | if tumor_ann_taiga=FALSE, then the name of the file of the tumor annotations} 71 | 72 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL} 73 | 74 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder} 75 | 76 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations, 77 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations 78 | for the cell lines. If null, assumes there are no additional annotations.} 79 | 80 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations, 81 | if additional_annotations_taiga=FALSE, then the name of the file of the additional annotations. 
If null, assumes there are 82 | no additional annotations.} 83 | 84 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL} 85 | 86 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder} 87 | 88 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations, 89 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations} 90 | 91 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations, 92 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations} 93 | 94 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL} 95 | 96 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder} 97 | } 98 | \value{ 99 | dat object with cell line and tumor expression data and annotations 100 | } 101 | \description{ 102 | load expression and annotation files for cell lines and tumors 103 | } 104 | -------------------------------------------------------------------------------- /man/modified_mnnCorrect.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{modified_mnnCorrect} 4 | \alias{modified_mnnCorrect} 5 | \title{MNN} 6 | \usage{ 7 | modified_mnnCorrect( 8 | ref_mat, 9 | targ_mat, 10 | k1 = 20, 11 | k2 = 20, 12 | ndist = 3, 13 | subset_genes = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{ref_mat:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment. 
18 | In the standard Celligner pipeline this is the cell line data.} 19 | 20 | \item{targ_mat:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data. 21 | In the standard Celligner pipeline this is the tumor data.} 22 | 23 | \item{k1:}{the number of neighbors within the data being corrected (in standard pipeline the tumor data). By default this is 20.} 24 | 25 | \item{k2:}{the number of neighbors within the reference data (in standard pipeline the cell line data). By default this is 20.} 26 | 27 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors. 28 | By default is 3.} 29 | 30 | \item{subset_genes:}{the subset of genes used for identifying mutual nearest neighbors within the datasets. The set of differentially 31 | expressed genes is usually passed here. By default is NULL, meaning all genes are used} 32 | } 33 | \value{ 34 | MNN object, containing the targ_mat corrected data and the mutual nearest neighbor pairs. 35 | } 36 | \description{ 37 | Mutual nearest neighbors correction. Modification of the scran::fastMNN (https://github.com/MarioniLab/scran). 
38 | Allows for separate k values per dataset, and simplifies some of the IO and doesn't use PCA reduction 39 | } 40 | -------------------------------------------------------------------------------- /man/run_Celligner.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{run_Celligner} 4 | \alias{run_Celligner} 5 | \title{All methods to run Celligner and save the output, if desired} 6 | \usage{ 7 | run_Celligner( 8 | cell_line_data_name = "public-20q4-a4b3", 9 | cell_line_data_file = "CCLE_expression_full", 10 | cell_line_version = NULL, 11 | cell_line_taiga = TRUE, 12 | cell_line_ann_name = "arxspan-cell-line-export-f808", 13 | cell_line_ann_file = "ACH", 14 | cell_line_ann_version = NULL, 15 | cell_line_ann_taiga = TRUE, 16 | tumor_data_name = "celligner-input-9827", 17 | tumor_data_file = "tumor_expression", 18 | tumor_version = NULL, 19 | tumor_taiga = TRUE, 20 | tumor_ann_name = "celligner-input-9827", 21 | tumor_ann_file = "tumor_annotations", 22 | tumor_ann_version = NULL, 23 | tumor_ann_taiga = TRUE, 24 | additional_annotations_name = "celligner-input-9827", 25 | additional_annotations_file = "CCLE_annotations", 26 | additional_annotations_version = NULL, 27 | additional_annotations_taiga = TRUE, 28 | hgnc_data_name = "hgnc-87ab", 29 | hgnc_data_file = "hgnc_complete_set", 30 | hgnc_version = NULL, 31 | hgnc_taiga = TRUE, 32 | save_output = NULL 33 | ) 34 | } 35 | \arguments{ 36 | \item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data, 37 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data} 38 | 39 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data, 40 | if cell_line_taiga=FALSE, then the name of 
the file of cell line expression data} 41 | 42 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL} 43 | 44 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder} 45 | 46 | \item{cell_line_ann_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations, 47 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations} 48 | 49 | \item{cell_line_ann_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations, 50 | if cell_line_taiga=FALSE, then the name of the file of cell line annotations. If pulling from taiga, assumes that the file is the arxspan 51 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.} 52 | 53 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL} 54 | 55 | \item{cell_line_ann_taiga:}{if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder} 56 | 57 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data, 58 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.} 59 | 60 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data, 61 | if tumor_taiga=FALSE, then the name of the file the tumor expression data} 62 | 63 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL} 64 | 65 | \item{tumor_taiga:}{if TRUE (default) then pulls the tumor 
expression data from taiga, if FALSE then finds tumor expression data in local folder} 66 | 67 | \item{tumor_ann_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations, 68 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations} 69 | 70 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations, 71 | if tumor_ann_taiga=FALSE, then the name of the file of the tumor annotations. If pulling from taiga, assumes that the file is the already created Celligner info file used in the Celligner manuscript, 72 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.} 73 | 74 | \item{tumor_ann_version:}{parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL} 75 | 76 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder} 77 | 78 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations, 79 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations 80 | for the cell lines. If null, assumes there are no additional annotations.} 81 | 82 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations, 83 | if additional_annotations_taiga=FALSE, then the name of the file of the additional annotations. 
If null, assumes there are 84 | no additional annotations.} 85 | 86 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL} 87 | 88 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder} 89 | 90 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations, 91 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations} 92 | 93 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations, 94 | if hgnc_taiga=FALSE, then the name of the file of the HGNC gene annotations} 95 | 96 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL} 97 | 98 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder} 99 | 100 | \item{save_output:}{by default is NULL and won't save output, to save output pass in a filepath of where to save the output} 101 | } 102 | \value{ 103 | seurat object of the Celligner-aligned data 104 | } 105 | \description{ 106 | run all parts of the Celligner pipeline 107 | } 108 | -------------------------------------------------------------------------------- /man/run_MNN.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{run_MNN} 4 | \alias{run_MNN} 5 | \title{Method to run mutual nearest neighbors batch correction} 6 | \usage{ 7 | run_MNN( 8 | CCLE_cor, 9 | TCGA_cor, 10 | k1 = celligner_global$mnn_k_tumor, 11 | k2 = celligner_global$mnn_k_CL, 12 | ndist = celligner_global$mnn_ndist, 13 | subset_genes 14 | ) 15 | } 16 | \arguments{ 17 | 
\item{CCLE_cor:}{matrix of samples by genes of cPC corrected data that serves as the reference data in the MNN alignment. 18 | In the default Celligner pipeline this is the cell line data.} 19 | 20 | \item{TCGA_cor:}{matrix of samples by genes of cPC corrected data that is corrected in the MNN alignment and projected onto the reference data. 21 | In the default Celligner pipeline this is the tumor data.} 22 | 23 | \item{k1:}{the number of neighbors within the data being corrected (by default the tumor data). By default this 24 | pulls from the celligner_global parameter mnn_k_tumor, which by default is 50.} 25 | 26 | \item{k2:}{the number of neighbors within the reference data (by default the cell line data). By default this 27 | pulls from the celligner_global parameter mnn_k_CL, which by default is 5.} 28 | 29 | \item{ndist:}{A numeric scalar specifying the threshold beyond which neighbors are to be ignored when computing correction vectors. 30 | By default this pulls from the celligner_global parameter mnn_ndist, which by default is 3.} 31 | 32 | \item{subset_genes:}{the subset of genes used for identifying mutual nearest neighbors within the datasets. 
The set of differentially 33 | expressed genes is usually passed here.} 34 | } 35 | \value{ 36 | mutual nearest neighbors object with corrected data for the second dataset provided as input and the mutual nearest neighbors 37 | } 38 | \description{ 39 | run MNN batch correction to align data to a reference dataset 40 | } 41 | -------------------------------------------------------------------------------- /man/run_cPCA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_methods.R 3 | \name{run_cPCA} 4 | \alias{run_cPCA} 5 | \title{Method to run contrastive principal components analysis} 6 | \usage{ 7 | run_cPCA(TCGA_obj, CCLE_obj, pc_dims = NULL) 8 | } 9 | \arguments{ 10 | \item{TCGA_obj:}{seurat object containing expression data and sample annotations, usually the tumor data} 11 | 12 | \item{CCLE_obj:}{seurat object containing expression data and sample annotations, usually the cell line data} 13 | 14 | \item{pc_dims:}{the number of cPCs calculated. If set to null then all cPCs will be calculated (this is quite slow), but if set to 15 | some value >=4 then an approximate cPCA will be calculated, which just calculates the input number of contrastive principal components, 16 | which is quicker.} 17 | } 18 | \value{ 19 | object containing cPC vectors and values 20 | } 21 | \description{ 22 | run contrastive principal components analysis. 
23 | Set pc_dims to a value >= 4 to run fast cPCA by just calculating the top contrastive principal components 24 | } 25 | -------------------------------------------------------------------------------- /man/run_cPCA_analysis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{run_cPCA_analysis} 4 | \alias{run_cPCA_analysis} 5 | \title{cPCA} 6 | \usage{ 7 | run_cPCA_analysis( 8 | TCGA_dat, 9 | CCLE_dat, 10 | tumor_cluster_df, 11 | CL_cluster_df, 12 | pc_dims = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{TCGA_dat:}{sample by genes matrix of scaled expression data} 17 | 18 | \item{CCLE_dat:}{sample by genes matrix of scaled expression data} 19 | 20 | \item{tumor_cluster_df:}{table of sample metadata that includes a column 'seurat_clusters', 21 | containing transcriptional clusters in the TCGA data} 22 | 23 | \item{CL_cluster_df:}{table of sample metadata that includes a column 'seurat_clusters', 24 | containing transcriptional clusters in the CCLE data} 25 | 26 | \item{pc_dims:}{number of cPCs calculated. If set to NULL (default) all cPCs will be calculated, if set to a value 27 | then that number of cPCs will be approximated. 
Values input should be >= 4.} 28 | } 29 | \value{ 30 | contrastive principal component object containing cPC vectors and values 31 | } 32 | \description{ 33 | Run contrastive principal components analysis, first removing average cluster expression. 34 | } 35 | -------------------------------------------------------------------------------- /man/run_lm_stats_limma_group.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Celligner_helpers.R 3 | \name{run_lm_stats_limma_group} 4 | \alias{run_lm_stats_limma_group} 5 | \title{Differentially expressed genes} 6 | \usage{ 7 | run_lm_stats_limma_group( 8 | mat, 9 | phenos, 10 | covars = NULL, 11 | weights = NULL, 12 | target_type = "Gene", 13 | limma_trend = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{mat:}{Nxp data matrix of N cell lines and p genes} 18 | 19 | \item{phenos:}{N vector of independent variables. Can be two-group labels as factors, bools, or can be numeric} 20 | 21 | \item{covars:}{optional Nxk matrix of sample covariates} 22 | 23 | \item{weights:}{optional N vector of precision weights for each data point} 24 | 25 | \item{target_type:}{name of the column variable in the data (default 'Gene')} 26 | } 27 | \value{ 28 | table of gene-level stats 29 | } 30 | \description{ 31 | Estimate linear-model stats for a matrix of data with respect to a group of phenotype variables 32 | } 33 | -------------------------------------------------------------------------------- /man/run_multidataset_alignment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutlidataset_alignment.R 3 | \name{run_multidataset_alignment} 4 | \alias{run_multidataset_alignment} 5 | \title{All methods to run Celligner, with additional alignment of Met500 and PDX data, and save the output, if desired} 6 | 
\usage{ 7 | run_multidataset_alignment( 8 | cell_line_data_name = "public-20q4-a4b3", 9 | cell_line_data_file = "CCLE_expression_full", 10 | cell_line_version = NULL, 11 | cell_line_taiga = TRUE, 12 | cell_line_ann_name = "arxspan-cell-line-export-f808", 13 | cell_line_ann_file = "ACH", 14 | cell_line_ann_version = NULL, 15 | cell_line_ann_taiga = TRUE, 16 | tumor_data_name = "celligner-input-9827", 17 | tumor_data_file = "tumor_expression", 18 | tumor_version = NULL, 19 | tumor_taiga = TRUE, 20 | tumor_ann_name = "celligner-input-9827", 21 | tumor_ann_file = "tumor_annotations", 22 | tumor_ann_version = NULL, 23 | tumor_ann_taiga = TRUE, 24 | additional_annotations_name = "celligner-input-9827", 25 | additional_annotations_file = "CCLE_annotations", 26 | additional_annotations_version = NULL, 27 | additional_annotations_taiga = TRUE, 28 | hgnc_data_name = "hgnc-87ab", 29 | hgnc_data_file = "hgnc_complete_set", 30 | hgnc_version = NULL, 31 | hgnc_taiga = TRUE, 32 | met500_data_name = "met500-fc3c", 33 | met500_data_file = "met500_TPM", 34 | met500_version = NULL, 35 | met500_taiga = TRUE, 36 | met500_ann_name = "met500-fc3c", 37 | met500_ann_file = "met500_ann", 38 | met500_ann_version = NULL, 39 | met500_ann_taiga = TRUE, 40 | Novartis_PDX_data_name = "pdx-data-3d29", 41 | Novartis_PDX_data_file = "Novartis_PDX_TPM", 42 | Novartis_PDX_version = NULL, 43 | Novartis_PDX_taiga = TRUE, 44 | Novartis_PDX_ann_name = "pdx-data-3d29", 45 | Novartis_PDX_ann_file = "Novartis_PDX_ann", 46 | Novartis_PDX_ann_version = NULL, 47 | Novartis_PDX_ann_taiga = TRUE, 48 | pediatric_PDX_data_name = "pdx-data-3d29", 49 | pediatric_PDX_data_file = "pediatric_PDX_TPM", 50 | pediatric_PDX_version = NULL, 51 | pediatric_PDX_taiga = TRUE, 52 | pediatric_PDX_ann_name = "pdx-data-3d29", 53 | pediatric_PDX_ann_file = "pediatric_PDX_ann", 54 | pediatric_PDX_ann_version = NULL, 55 | pediatric_PDX_ann_taiga = TRUE, 56 | save_output = NULL 57 | ) 58 | } 59 | \arguments{ 60 | 
\item{cell_line_data_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line expression data, 61 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line expression data} 62 | 63 | \item{cell_line_data_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line expression data, 64 | if cell_line_taiga=FALSE, then the name of the file of cell line expression data} 65 | 66 | \item{cell_line_version:}{parameter to specify the version to pull from taiga for the cell line expression data, default set to NULL} 67 | 68 | \item{cell_line_taiga:}{if TRUE then pulls the cell line expression data from taiga, if FALSE then finds cell line expression data in local folder} 69 | 70 | \item{cell_line_ann_name:}{if cell_line_taiga = TRUE, then the data.name of the taiga file containing the cell line annotations, 71 | if cell_line_taiga=FALSE, then the file path to the local folder containing the cell line annotations} 72 | 73 | \item{cell_line_ann_file:}{if cell_line_taiga = TRUE, then the data.file of the taiga file containing the cell line annotations, 74 | if cell_line_taiga=FALSE, then the name of the file of cell line annotations. 
If pulling from taiga, assumes that the file is the arxspan 75 | file (could also use virtual release sample_info file), if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='CL'.} 76 | 77 | \item{cell_line_ann_version:}{parameter to specify the version to pull from taiga for the cell line annotations, default set to NULL} 78 | 79 | \item{cell_line_ann_taiga:}{if TRUE (default) then pulls the cell line annotations from taiga, if FALSE then finds cell line annotations in local folder} 80 | 81 | \item{tumor_data_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor expression data, 82 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor expression data.} 83 | 84 | \item{tumor_data_file:}{if tumor_taiga = TRUE, then the data.file of the taiga file containing the tumor expression data, 85 | if tumor_taiga=FALSE, then the name of the file the tumor expression data} 86 | 87 | \item{tumor_version:}{parameter to specify the version to pull from taiga for the tumor expression data, default set to NULL} 88 | 89 | \item{tumor_taiga:}{if TRUE (default) then pulls the tumor expression data from taiga, if FALSE then finds tumor expression data in local folder} 90 | 91 | \item{tumor_ann_name:}{if tumor_taiga = TRUE, then the data.name of the taiga file containing the tumor annotations, 92 | if tumor_taiga=FALSE, then the file path to the local folder containing the tumor annotations} 93 | 94 | \item{tumor_ann_file:}{if tumor_ann_taiga = TRUE, then the data.file of the taiga file containing the tumor annotations, 95 | if tumor_ann_taiga=FALSE, then the name of the file the tumor annotations. 
If pulling from taiga, assumes that the file is the already create Celligner info file used in the Celligner manuscript, 96 | if not then assumes it is a local file containing the columns sampleID, lineage, subtype, and type=='tumor'.} 97 | 98 | \item{tumor_ann_version:}{parameter to specify the version to pull from taiga for the tumor annotations, default set to NULL} 99 | 100 | \item{tumor_ann_taiga:}{if TRUE then pulls the tumor annotations from taiga, if FALSE then finds tumor annotations in local folder} 101 | 102 | \item{additional_annotations_name:}{if additional_annotations_taiga = TRUE, then the data.name of the taiga file containing the additional annotations, 103 | if additional_annotations_taiga=FALSE, then the file path to the local folder containing the additional annotations. Used to add more fine-grain subtype annotations 104 | for the cell lines. If null, assumes there are no additional annotations.} 105 | 106 | \item{additional_annotations_file:}{if additional_annotations_taiga = TRUE, then the data.file of the taiga file containing the additional annotations, 107 | if additional_annotations_taiga=FALSE, then the name of the file the additional annotations. 
If null, assumes there are 108 | no additional annotations.} 109 | 110 | \item{additional_annotations_version:}{parameter to specify the version to pull from taiga for the additional annotations, default set to NULL} 111 | 112 | \item{additional_annotations_taiga:}{if TRUE then pulls the additional annotations from taiga, if FALSE then finds additional annotations in local folder} 113 | 114 | \item{hgnc_data_name:}{if hgnc_taiga = TRUE, then the data.name of the taiga file containing the HGNC gene annotations, 115 | if hgnc_taiga=FALSE, then the file path to the local folder containing the HGNC gene annotations} 116 | 117 | \item{hgnc_data_file:}{if hgnc_taiga = TRUE, then the data.file of the taiga file containing the HGNC gene annotations, 118 | if hgnc_taiga=FALSE, then the name of the file containing the HGNC gene annotations} 119 | 120 | \item{hgnc_version:}{parameter to specify the version to pull from taiga for the HGNC gene annotations, default set to NULL} 121 | 122 | \item{hgnc_taiga:}{if TRUE then pulls the HGNC gene annotations from taiga, if FALSE then finds HGNC gene annotations in local folder} 123 | 124 | \item{met500_data_name:}{Met500 expression, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using met500_taiga=F} 125 | 126 | \item{met500_data_file:}{default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using met500_taiga=F} 127 | 128 | \item{met500_version:}{default NULL, used to specify version of taiga dataset} 129 | 130 | \item{met500_taiga:}{if TRUE (default) pulls Met500 expression from taiga dataset, if FALSE reads from local} 131 | 132 | \item{met500_ann_name:}{Met500 annotations, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using met500_ann_taiga=F} 133 | 134 | \item{met500_ann_file:}{Met500 annotations, default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using met500_ann_taiga=F} 135 | 136 | 
\item{met500_ann_version:}{default NULL, used to specify version of taiga dataset} 137 | 138 | \item{met500_ann_taiga:}{if TRUE (default) pulls met500 annotations from taiga dataset, if FALSE reads from local} 139 | 140 | \item{Novartis_PDX_data_name:}{Novartis PDX expression, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using Novartis_PDX_taiga=F} 141 | 142 | \item{Novartis_PDX_data_file:}{default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using Novartis_PDX_taiga=F} 143 | 144 | \item{Novartis_PDX_version:}{default NULL, used to specify version of taiga dataset} 145 | 146 | \item{Novartis_PDX_taiga:}{if TRUE (default) pulls Novartis PDX expression from taiga dataset, if FALSE reads from local} 147 | 148 | \item{Novartis_PDX_ann_name:}{Novartis PDX annotations, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using Novartis_PDX_ann_taiga=F} 149 | 150 | \item{Novartis_PDX_ann_file:}{Novartis PDX annotations, default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using Novartis_PDX_ann_taiga=F} 151 | 152 | \item{Novartis_PDX_ann_version:}{default NULL, used to specify version of taiga dataset} 153 | 154 | \item{Novartis_PDX_ann_taiga:}{if TRUE (default) pulls Novartis PDX annotations from taiga dataset, if FALSE reads from local} 155 | 156 | \item{pediatric_PDX_data_name:}{pediatric PDX expression, default pulls from taiga, this is the data_name of the taiga dataset, or path to folder if using pediatric_PDX_taiga=F} 157 | 158 | \item{pediatric_PDX_data_file:}{default pulls from taiga, this is the data_file of the taiga dataset, or name of local file if using pediatric_PDX_taiga=F} 159 | 160 | \item{pediatric_PDX_version:}{default NULL, used to specify version of taiga dataset} 161 | 162 | \item{pediatric_PDX_taiga:}{if TRUE (default) pulls pediatric PDX expression from taiga dataset, if FALSE reads from local} 163 | 164 | 
\item{pediatric_PDX_ann_name:}{Pediatric PDX annotations, default pulls from taiga, this the data_name of the taiga dataset, or path to folder is using pediatric_PDX_ann_taiga=F} 165 | 166 | \item{pediatric_PDX_ann_file:}{Pediatric PDX annotations, default pulls from taiga, this the data_file of the taiga dataset, or name of local file is using pediatric_PDX_ann_taiga=F} 167 | 168 | \item{pediatric_PDX_ann_version:}{default NULL, used to specify version of taiga dataset} 169 | 170 | \item{pediatric_PDX_ann_taiga:}{if TRUE (default) pulls pediatric PDX annotations from taiga dataset, if FALSE reads from local} 171 | 172 | \item{save_output:}{by default is NULL and won't save output, to save output pass in a filepath of where to save the output} 173 | } 174 | \value{ 175 | seurat object of the Celligner-aligned data 176 | } 177 | \description{ 178 | run all parts of the Celligner pipeline, with alignment of additional datasets 179 | } 180 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: celligner 2 | theme: readthedocs 3 | nav: 4 | - "Celligner": celligner.md 5 | plugins: 6 | - mkdocstrings: 7 | default_handler: python 8 | handlers: 9 | python: 10 | rendering: 11 | show_source: false 12 | custom_templates: templates 13 | watch: 14 | - celligner/ 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.3 2 | pandas==1.4.4 3 | scikit_learn==1.1.2 4 | umap>=0.1 5 | igraph==0.9.11 6 | scanpy==1.9.1 7 | anndata==0.8.0 8 | scipy==1.9.1 9 | louvain==0.7.1 10 | rpy2==3.5.4 11 | cython==0.29.32 12 | matplotlib<3.7 -------------------------------------------------------------------------------- /run_celligner.py: -------------------------------------------------------------------------------- 1 
import celligner
import pandas as pd
import re
from taigapy import TaigaClient

tc = TaigaClient()

portal_in_dict = {}

portal_out_dict = {}

# Each dataset description carries:
#   name       - label used for the 'source' index level and in the output file name
#   taiga_name - taiga dataset identifier
#   taiga_file - file within that taiga dataset
#   dset_type  - 'model' or 'tumor'; decides which id list the samples are added to
#   mnn_params - optional {'k1': ..., 'k2': ...} overrides applied before aligning it
depmap_params = {'name': 'depmap_public_23q2',
                 'taiga_name': 'public-23q2-19de',
                 'taiga_file': 'OmicsExpressionProteinCodingGenesTPMLogp1',
                 'dset_type': 'model',
                 'mnn_params': None}

tcga_params = {'name': 'tcga',
               'taiga_name': 'celligner-input-9827',
               'taiga_file': 'tumor_expression',
               'dset_type': 'tumor',
               'mnn_params': None}

met500_params = {'name': 'met500',
                 'taiga_name': 'met500-fc3c',
                 'taiga_file': 'met500_TPM',
                 'dset_type': 'tumor',
                 'mnn_params': {'k1': 20, 'k2': 50}}

pdx_nv_params = {'name': 'pdx_novartis',
                 'taiga_name': 'pdx-data-3d29',
                 'taiga_file': 'Novartis_PDX_TPM',
                 'dset_type': 'model',
                 'mnn_params': {'k1': 10, 'k2': 50}}

pdx_ped_params = {'name': 'pdx_pediatric',
                  'taiga_name': 'pdx-data-3d29',
                  'taiga_file': 'pediatric_PDX_TPM',
                  'dset_type': 'model',
                  'mnn_params': {'k1': 10, 'k2': 50}}

celligner_default_extras = [met500_params, pdx_nv_params, pdx_ped_params]

# Background columns are labelled "SYMBOL (...)"; group 1 captures SYMBOL.
_SYMBOL_PATTERN = re.compile(r'^([\w.-]+) \(')


def _load_labelled(dset):
    """Fetch one dataset from taiga and prepend a 'source' index level holding its name."""
    df = tc.get(name=dset['taiga_name'], file=dset['taiga_file'])
    return pd.concat({dset['name']: df}, names=['source'])


def process_data(bg_df, contrast_df, extra_dfs):
    """Relabel the background genes and restrict every matrix to a common gene set.

    The background columns are renamed from ``"SYMBOL (...)"`` labels to the
    ``ensembl_gene_id`` that HGNC lists for SYMBOL. Every frame is then cut
    down to the genes that are present in all of them and whose HGNC
    ``locus_group`` is neither "non-coding RNA" nor "pseudogene". Column
    order follows the background frame.

    Args:
        bg_df: background expression matrix (samples x genes).
        contrast_df: expression matrix contrasted with the background.
        extra_dfs: list of additional expression matrices.

    Returns:
        ``(bg_df, contrast_df, extra_dfs)`` subset to the common genes.
    """
    # TODO: filter to columns with an ensembl id; transcript ids still need
    # to be converted to gene ids for inputs that carry them.
    # NOTE(review): contrast_df / extra_dfs are presumably already keyed by
    # Ensembl gene id -- nothing here converts them.

    # Load HGNC gene set, filter to functional subset
    hgnc_complete_set = tc.get(name='hgnc-87ab', version=7, file='hgnc_complete_set')
    func_genes = hgnc_complete_set[~hgnc_complete_set.locus_group.isin(["non-coding RNA", "pseudogene"])]

    # symbol -> Ensembl gene id; symbols without an id are skipped so no
    # column is ever renamed to NaN.
    symbol_to_ensembl = (
        hgnc_complete_set[['symbol', 'ensembl_gene_id']]
        .dropna()
        .set_index('symbol')['ensembl_gene_id']
        .to_dict()
    )

    column_map = {}
    for column in bg_df.columns:
        match = _SYMBOL_PATTERN.search(str(column))
        if match is None:
            # Not a "SYMBOL (...)" label: keep the column as-is instead of
            # failing on ``None.group``; it drops out of the intersection below.
            continue
        ensembl_id = symbol_to_ensembl.get(match.group(1))
        if ensembl_id is not None:
            column_map[column] = ensembl_id
    bg_df = bg_df.rename(columns=column_map)

    gene_sets = [set(bg_df.columns), set(contrast_df.columns)] + [set(_df.columns) for _df in extra_dfs]
    gene_set = set(func_genes.ensembl_gene_id).intersection(*gene_sets)
    common_genes = [x for x in bg_df.columns if x in gene_set]
    print('Common genes:', len(common_genes))

    bg_df = bg_df[common_genes]
    contrast_df = contrast_df[common_genes]
    extra_dfs = [_df[common_genes] for _df in extra_dfs]

    return bg_df, contrast_df, extra_dfs


def run_celligner(bg=depmap_params, contrast=tcga_params, extra_dsets=celligner_default_extras):
    """Run the Celligner alignment end to end and save the fitted model.

    Args:
        bg: description of the dataset the model is fit on.
        contrast: description of the dataset transformed against ``bg``.
        extra_dsets: descriptions of further datasets, aligned one after
            another in the given order.

    Returns:
        Name of the file the model was saved to.
    """
    extra_dsets = list(extra_dsets)

    # load the background and contrast data and label their source
    bg_df = _load_labelled(bg)
    contrast_df = _load_labelled(contrast)

    # collect and label any additional datasets to be projected
    extra_dfs = []
    for dset in extra_dsets:
        _df = tc.get(name=dset['taiga_name'], file=dset['taiga_file'])
        if dset['name'] == 'pdx_novartis':
            # this matrix is transposed on load so it matches the other frames
            _df = _df.T
        extra_dfs.append(pd.concat({dset['name']: _df}, names=['source']))

    # make sure all datasets are using ensembl gene ids and are restricted to common sets of genes
    bg_df, contrast_df, extra_dfs = process_data(bg_df, contrast_df, extra_dfs)

    # Create Celligner object and fit + transform the reference and target expression datasets
    my_celligner = celligner.Celligner()
    my_celligner.fit(bg_df.droplevel(0, axis=0))
    my_celligner.transform(contrast_df.droplevel(0, axis=0))

    model_ids = list(bg_df.index.get_level_values(1))
    tumor_ids = list(contrast_df.index.get_level_values(1))

    # process_data keeps the order of extra_dfs, so each frame pairs with
    # its own description directly -- no lookup by name required.
    for dset, _df in zip(extra_dsets, extra_dfs):
        my_celligner.makeNewReference()
        if dset['mnn_params']:
            p = dset['mnn_params']
            # NOTE(review): overrides stay in mnn_kwargs for later datasets
            # that do not provide their own mnn_params.
            my_celligner.mnn_kwargs.update({'k1': p['k1'], 'k2': p['k2']})
        my_celligner.transform(_df.droplevel(0, axis=0), compute_cPCs=False)

        sample_ids = list(_df.index.get_level_values(1))
        if dset['dset_type'] == 'model':
            model_ids += sample_ids
        elif dset['dset_type'] == 'tumor':
            tumor_ids += sample_ids

    # Compute UMAP, clusters and tumor - model distance
    my_celligner.computeMetricsForOutput(model_ids=model_ids, tumor_ids=tumor_ids)

    # joining the parts avoids a dangling "_" when there are no extra datasets
    name_parts = ['celligner_output', bg['name'], contrast['name']]
    name_parts += [d['name'] for d in extra_dsets]
    outname = '_'.join(name_parts) + ".pkl"
    my_celligner.save(outname)
    print('Model saved to: ', outname)
    return outname


if __name__ == "__main__":
    run_celligner()
import celligner
import pandas as pd
import numpy as np
import re
from taigapy import TaigaClient
tc = TaigaClient()

# Load data
CCLE_expression = tc.get(name='dmc-22q2-5e51', version=16, file='CCLE_expression_full')
tumor_expression = tc.get(name='celligner-input-9827', version=2, file='tumor_expression')
met500_TPM = tc.get(name='met500-fc3c', version=1, file='met500_TPM')
# transposed on load so it matches the other matrices
Novartis_PDX_TPM = tc.get(name='pdx-data-3d29', version=2, file='Novartis_PDX_TPM').T
pediatric_PDX_TPM = tc.get(name='pdx-data-3d29', version=2, file='pediatric_PDX_TPM')

# Filter to columns with ensembl id and keep only the bare ENSG identifier.
# The pattern is a raw string: "\d" in a normal literal is an invalid escape.
ensg_pattern = re.compile(r'(ENSG\d+)')
CCLE_expression = CCLE_expression.filter(like='ENSG')
CCLE_expression.columns = [ensg_pattern.search(column).group(1) for column in CCLE_expression.columns]

## Load HGNC gene set, filter to functional subset
hgnc_complete_set = tc.get(name='hgnc-87ab', version=5, file='hgnc_complete_set')
func_genes = hgnc_complete_set[~hgnc_complete_set.locus_group.isin(["non-coding RNA", "pseudogene"])]

# Identify common genes - maintaining order from CCLE expression matrix
gene_sets = [set(tumor_expression.columns), set(met500_TPM.columns), set(Novartis_PDX_TPM.columns), set(pediatric_PDX_TPM.columns)]
gene_set = set(func_genes.ensembl_gene_id).intersection(*gene_sets)
common_genes = [x for x in CCLE_expression.columns if x in gene_set]
print('Common genes:', len(common_genes))

# Subset all matrices to common genes
CCLE_expression = CCLE_expression[common_genes]
tumor_expression = tumor_expression[common_genes]
met500_TPM = met500_TPM[common_genes]
Novartis_PDX_TPM = Novartis_PDX_TPM[common_genes]
pediatric_PDX_TPM = pediatric_PDX_TPM[common_genes]

# Create Celligner object and fit + transform the reference (CCLE) and target (TCGA) expression datasets
my_celligner = celligner.Celligner()
my_celligner.fit(CCLE_expression)
my_celligner.transform(tumor_expression)

# Multi-dataset alignment - sequentially aligning additional expression datasets.
# Order matters: each step first promotes the current result to the new reference.
for extra_expression, mnn_overrides in (
    (met500_TPM, {"k1": 20, "k2": 50}),         # Met500
    (Novartis_PDX_TPM, {"k1": 10, "k2": 50}),   # Novartis PDX
    (pediatric_PDX_TPM, {"k1": 10, "k2": 50}),  # Pediatric PDX
):
    my_celligner.makeNewReference()
    my_celligner.mnn_kwargs.update(mnn_overrides)
    my_celligner.transform(extra_expression, compute_cPCs=False)

# Compute UMAP, clusters and tumor - model distance
model_ids = list(CCLE_expression.index)+list(Novartis_PDX_TPM.index)+list(pediatric_PDX_TPM.index)
tumor_ids = list(tumor_expression.index)+list(met500_TPM.index)
my_celligner.computeMetricsForOutput(model_ids=model_ids, tumor_ids=tumor_ids)

my_celligner.save("model_22q2_dmc_multi_dataset.pkl")
from setuptools import setup, find_packages
import sys
import os
import io
import subprocess

# R expression that installs limma through Bioconductor, installing
# BiocManager first when it is missing.
_LIMMA_INSTALL_EXPR = (
    'if(!requireNamespace("BiocManager", quietly = TRUE)){'
    'install.packages("BiocManager", repos="http://cran.us.r-project.org")};'
    'BiocManager::install("limma");'
)

# Compare the whole version tuple: checking major and minor separately let
# 3.2 through even though the message asks for 3.3+.
if sys.version_info < (3, 3):
    raise ValueError("celligner is only compatible with Python 3.3 and above")
if sys.version_info < (3, 5):
    import warnings
    warnings.warn("celligner may not function properly on Python < 3.5")

print("trying to install the required limma R package")
try:
    # Argument list instead of a shell string: no quoting pitfalls, and a
    # missing R binary surfaces as OSError.
    subprocess.run(
        ["R", "-e", _LIMMA_INSTALL_EXPR], check=True,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except (OSError, subprocess.CalledProcessError):
    # Best effort only: the Python package can still be installed without R.
    print(
        "failed to install limma. "
        "please install R or check your R installation and then install limma with: "
        "R -e '" + _LIMMA_INSTALL_EXPR + "'"
    )

print("Finished!")


def read(*paths, **kwargs):
    """Read the contents of a text file safely.

    ``paths`` are joined onto the directory containing this file. The file
    is decoded with the ``encoding`` keyword (default ``"utf8"``) and
    returned with surrounding whitespace stripped.

    >>> read("celligner", "VERSION")
    '0.1.0'
    >>> read("README.md")
    ...
    """
    target = os.path.join(os.path.dirname(__file__), *paths)
    with io.open(target, encoding=kwargs.get("encoding", "utf8")) as open_file:
        return open_file.read().strip()


def read_requirements(path):
    """Return the requirement specifiers listed in ``path``.

    Blank lines are dropped, as are lines starting with a quote, ``#``,
    ``-`` (pip options) or ``git+`` (VCS links).
    """
    requirements = []
    for line in read(path).split("\n"):
        entry = line.strip()
        if entry and not entry.startswith(('"', "#", "-", "git+")):
            requirements.append(entry)
    return requirements


setup(
    name='celligner',
    version=read("celligner", "VERSION"),
    description='A useful module for alligning cell lines to tumors',
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    author="Broad Institute CDS",
    url="https://github.com/BroadInstitute/celligner",
    packages=find_packages(exclude=["tests", ".github"]),
    package_data={'celligner': ['data/*']},
    python_requires='>=3.5',
    install_requires=read_requirements("requirements.txt"),
    # NOTE(review): the repository listing shows no celligner/__main__.py;
    # confirm this console script actually resolves.
    entry_points={
        "console_scripts": ["celligner = celligner.__main__:main"]
    },
    #extras_require={"test": read_requirements("requirements-test.txt")},
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
)

# NOTE: the mnnpy and CPCA submodules are not fetched here; run
#   git submodule update --remote --init
# from the project root to obtain them.