├── .github ├── FUNDING.yml ├── release_message.sh └── workflows │ ├── main.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CONTRIBUTING.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── data ├── Annotations.RData └── variantFilter │ └── snp_indels_rescue_list.txt ├── docs ├── genepy.md └── index.md ├── documentation └── genome.jpg ├── examples └── rna_diff_expr.ipynb ├── genepy ├── VERSION ├── __init__.py ├── cell_line_mapping-master │ ├── .gitignore │ ├── .travis.yml │ ├── README.md │ ├── celllinemapr │ │ ├── .Rbuildignore │ │ ├── .celllinemapr.Rds │ │ ├── DESCRIPTION │ │ ├── NAMESPACE │ │ ├── R │ │ │ └── cell_line_mapping.R │ │ ├── SOP.Rmd │ │ ├── celllinemapr.Rproj │ │ ├── man │ │ │ ├── arxspan.to.ccle.Rd │ │ │ ├── ccle.to.arxspan.Rd │ │ │ └── ccle.to.latest.Rd │ │ └── naming.csv │ └── python │ │ ├── cell_line_mapper │ │ ├── __init__.py │ │ └── test_mapper.py │ │ ├── name_mapping.csv │ │ ├── requirements.txt │ │ ├── setup.py │ │ └── test-data.csv ├── epigenetics │ ├── CREME.md │ ├── CREME.py │ ├── README.md │ ├── __init__.py │ ├── chipseq.py │ ├── docsCREME │ │ ├── MED1_before_pairplot.png │ │ ├── MED1_before_venn_venn.png │ │ ├── MED1_new_found_peaks_kdeplot.png │ │ └── igv-app-MED1-zoom.png │ └── plot.py ├── google │ ├── README.md │ ├── __init__.py │ ├── gcp.py │ ├── good-retention.json │ ├── google_sheet.py │ └── gsheet_upload.py ├── imaging │ └── fish.py ├── mutations │ ├── README.md │ └── __init__.py ├── rna │ ├── README.md │ ├── __init__.py │ ├── pyDESeq2.py │ └── ssGSEA.R ├── sequencing │ ├── README.md │ └── __init__.py ├── terra │ ├── README.md │ ├── __init__.py │ └── map_terra_workflow.py └── utils │ ├── Datanalytics.py │ ├── README.md │ ├── RScript.R │ ├── __init__.py │ ├── helper.py │ └── plot.py ├── mkdocs.yml ├── requirements-test.txt ├── requirements.txt ├── setup.cfg └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [rochacbruno] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/release_message.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | previous_tag=$(git tag --sort=-creatordate | sed -n 2p) 3 | git shortlog "${previous_tag}.." 
| sed 's/^./ &/' 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the main branch 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | jobs: 17 | linter: 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.9] 22 | os: [ubuntu-latest] 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install project 30 | run: make install 31 | - name: Run linter 32 | run: make lint 33 | 34 | tests_linux: 35 | needs: linter 36 | strategy: 37 | fail-fast: false 38 | matrix: 39 | python-version: [3.9] 40 | os: [ubuntu-latest] 41 | runs-on: ${{ matrix.os }} 42 | steps: 43 | - uses: actions/checkout@v2 44 | - uses: actions/setup-python@v2 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install project 48 | run: make install 49 | - name: Run tests 50 | run: make test 51 | - name: "Upload coverage to Codecov" 52 | uses: codecov/codecov-action@v1 53 | # with: 54 | # fail_ci_if_error: true 55 | 56 | tests_mac: 57 | needs: linter 58 | strategy: 59 | fail-fast: false 60 | matrix: 61 | python-version: [3.9] 62 | os: [macos-latest] 63 | runs-on: ${{ matrix.os }} 64 | steps: 65 | - uses: actions/checkout@v2 66 | - uses: actions/setup-python@v2 67 | with: 68 | python-version: ${{ matrix.python-version }} 69 | - name: Install project 70 | run: make install 71 | - name: Run tests 72 | run: make test 73 | 74 | tests_win: 75 | needs: linter 76 | strategy: 77 | fail-fast: false 78 | matrix: 79 | python-version: [3.9] 80 | os: [windows-latest] 81 | runs-on: ${{ matrix.os }} 82 | steps: 83 | - uses: actions/checkout@v2 84 | - uses: actions/setup-python@v2 85 | with: 86 | python-version: ${{ matrix.python-version }} 87 | - name: Install Pip 88 | run: pip install --user --upgrade pip 89 | - name: Install project 90 | run: pip install -e .[test] 91 | - name: run tests 92 | run: pytest -s -vvvv -l --tb=long tests 93 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/tags 6 | tags: 7 | - '*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | release: 14 | name: Create Release 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | # by default, it uses a depth of 1 20 | # this fetches all history so that we can read each commit 21 | fetch-depth: 0 22 | - name: Generate Changelog 23 | run: .github/release_message.sh > release_message.md 24 | - name: Release 25 | uses: softprops/action-gh-release@v1 26 | with: 27 | body_path: release_message.md 28 | 29 | deploy: 30 | needs: release 31 | runs-on: ubuntu-latest 32 | steps: 33 | - uses: actions/checkout@v1 34 | - name: Set up Python 35 | uses: actions/setup-python@v1 36 | with: 37 | python-version: '3.x' 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install setuptools wheel twine 42 | - name: Build and publish 43 | env: 44 | TWINE_USERNAME: jkobject 45 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 46 | run: | 47 | python setup.py sdist bdist_wheel 48 | twine upload dist/* 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/r,macos,python,sublimetext 2 | # Edit at https://www.gitignore.io/?templates=r,macos,python,sublimetext 3 | ### PERSO ### 4 | data/* 5 | 6 | ### macOS ### 7 | # General 8 | .DS_Store 9 | .AppleDouble 10 | .LSOverride 11 | .biomart.csv 12 | 13 | # Icon must end with two \r 14 | Icon 15 | 16 | # Thumbnails 17 | ._* 18 | temp/* 19 | 20 | # Files that might appear in the root of a volume 21 | .DocumentRevisions-V100 22 | .fseventsd 23 | .Spotlight-V100 24 | .TemporaryItems 25 | .Trashes 26 | .VolumeIcon.icns 27 | .com.apple.timemachine.donotpresent 28 | 29 | # Directories potentially created on remote AFP share 30 | .AppleDB 31 | .AppleDesktop 32 | Network Trash Folder 33 | Temporary Items 34 | .apdisk 35 | 36 | ### Python ### 37 | # Byte-compiled / optimized / DLL files 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | 42 | # C extensions 43 | *.so 44 | 45 | # Distribution / packaging 46 | .Python 47 | build/ 48 | develop-eggs/ 49 | dist/ 50 | downloads/ 51 | eggs/ 52 | .eggs/ 53 | lib/ 54 | lib64/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | wheels/ 59 | pip-wheel-metadata/ 60 | share/python-wheels/ 61 | *.egg-info/ 62 | .installed.cfg 63 | *.egg 64 | MANIFEST 65 | 66 | # PyInstaller 67 | # Usually these files are written by a python script from a template 68 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
69 | *.manifest 70 | *.spec 71 | 72 | # Installer logs 73 | pip-log.txt 74 | pip-delete-this-directory.txt 75 | 76 | # Unit test / coverage reports 77 | htmlcov/ 78 | .tox/ 79 | .nox/ 80 | .coverage 81 | .coverage.* 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | *.cover 86 | .hypothesis/ 87 | .pytest_cache/ 88 | 89 | # Translations 90 | *.mo 91 | *.pot 92 | 93 | # Django stuff: 94 | *.log 95 | local_settings.py 96 | db.sqlite3 97 | 98 | # Flask stuff: 99 | instance/ 100 | .webassets-cache 101 | 102 | # Scrapy stuff: 103 | .scrapy 104 | 105 | # Sphinx documentation 106 | docs/_build/ 107 | 108 | # PyBuilder 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # IPython 115 | profile_default/ 116 | ipython_config.py 117 | 118 | # pyenv 119 | .python-version 120 | 121 | # pipenv 122 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 123 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 124 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 125 | # install all needed dependencies. 126 | #Pipfile.lock 127 | 128 | # celery beat schedule file 129 | celerybeat-schedule 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | ### R ### 161 | # History files 162 | .Rhistory 163 | .Rapp.history 164 | 165 | # Session Data files 166 | .RData 167 | 168 | # User-specific files 169 | .Ruserdata 170 | 171 | # Example code in package build process 172 | *-Ex.R 173 | 174 | # Output files from R CMD build 175 | /*.tar.gz 176 | 177 | # Output files from R CMD check 178 | /*.Rcheck/ 179 | 180 | # RStudio files 181 | .Rproj.user/ 182 | 183 | # produced vignettes 184 | vignettes/*.html 185 | vignettes/*.pdf 186 | 187 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 188 | .httr-oauth 189 | 190 | # knitr and R markdown default cache directories 191 | /*_cache/ 192 | /cache/ 193 | 194 | # Temporary files created by R markdown 195 | *.utf8.md 196 | *.knit.md 197 | 198 | ### R.Bookdown Stack ### 199 | # R package: bookdown caching files 200 | /*_files/ 201 | 202 | ### SublimeText ### 203 | # Cache files for Sublime Text 204 | *.tmlanguage.cache 205 | *.tmPreferences.cache 206 | *.stTheme.cache 207 | 208 | # Workspace files are user-specific 209 | *.sublime-workspace 210 | 211 | # Project files should be checked into the repository, unless a significant 212 | # proportion of contributors will probably not be using Sublime Text 213 | # *.sublime-project 214 | 215 | # SFTP configuration file 216 | sftp-config.json 217 | 218 | # Package control specific files 219 | Package Control.last-run 220 | Package Control.ca-list 221 | Package Control.ca-bundle 222 | Package Control.system-ca-bundle 223 | Package Control.cache/ 224 | Package Control.ca-certs/ 225 | Package Control.merged-ca-bundle 226 | Package Control.user-ca-bundle 227 | oscrypto-ca-bundle.crt 228 | bh_unicode_properties.cache 229 | 230 | # Sublime-github package stores a github token in this file 231 | # https://packagecontrol.io/packages/sublime-github 232 | 
GitHub.sublime-settings
233 |
234 | # tmp files
235 | tmp.py
236 |
237 | # End of https://www.gitignore.io/api/r,macos,python,sublimetext
238 | -------------------------------------------------------------------------------- /.gitmodules: --------------------------------------------------------------------------------
1 | [submodule "genepy/epigenetics/rose"]
2 | path = genepy/epigenetics/rose
3 | url = https://github.com/jkobject/rose.git
4 | -------------------------------------------------------------------------------- /.vscode/settings.json: --------------------------------------------------------------------------------
1 | {
2 | "python.linting.flake8Enabled": true,
3 | "python.linting.enabled": true,
4 | "python.formatting.provider": "black",
5 | "editor.tabSize": 4,
6 | "editor.detectIndentation": false,
7 | "editor.formatOnSave": true,
8 | "editor.formatOnPaste": true,
9 | "editor.formatOnType": false,
10 | "python.linting.flake8Args": [
11 | "--max-line-length=120",
12 | "--ignore=F403, E501, E226",
13 | ],
14 | "editor.autoClosingBrackets": true,
15 | "editor.autoClosingQuotes": true,
16 | "editor.autoSurround": true,
17 | "editor.autoIndent": "full",
18 | "editor.insertSpaces": true
19 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # How to develop on this project
2 |
3 | genepy welcomes contributions from the community.
4 |
5 | **You need PYTHON3!**
6 |
7 | These instructions are for Unix-like systems (Linux, macOS, BSD, etc.).
8 | ## Setting up your own fork of this repo.
9 |
10 | - On the GitHub interface, click on the `Fork` button.
11 | - Clone your fork of this repo: `git clone git@github.com:YOUR_GIT_USERNAME/genepy.git`
12 | - Enter the directory: `cd genepy`
13 | - Add the upstream repo: `git remote add upstream https://github.com/broadinstitute/genepy`
14 |
15 | ## Setting up your own virtual environment
16 |
17 | Run `make virtualenv` to create a virtual environment,
18 | then activate it with `source .venv/bin/activate`.
19 |
20 | ## Install the project in develop mode
21 |
22 | Run `make install` to install the project in develop mode.
23 |
24 | ## Run the tests to ensure everything is working
25 |
26 | Run `make test` to run the tests.
27 |
28 | ## Create a new branch to work on your contribution
29 |
30 | Run `git checkout -b my_contribution`
31 |
32 | ## Make your changes
33 |
34 | Edit the files using your preferred editor. (we recommend Vim or VSCode)
35 |
36 | ## Format the code
37 |
38 | Run `make fmt` to format the code.
39 |
40 | ## Run the linter
41 |
42 | Run `make lint` to run the linter.
43 |
44 | ## Test your changes
45 |
46 | Run `make test` to run the tests.
47 |
48 | Ensure the code coverage report shows `100%` coverage; if it does not, add tests to your PR.
49 |
50 | ## Build the docs locally
51 |
52 | Run `make docs` to build the docs.
53 |
54 | Ensure your new changes are documented.
55 |
56 | ## Commit your changes
57 |
58 | This project uses [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
59 |
60 | Example: `fix(package): update setup.py arguments 🎉` (emojis are fine too)
61 |
62 | ## Push your changes to your fork
63 |
64 | Run `git push origin my_contribution`
65 |
66 | ## Submit a pull request
67 |
68 | On the GitHub interface, click on the `Pull Request` button.
69 |
70 | Wait for CI to run, and one of the developers will review your PR.
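For convenience, the contribution workflow described above can be condensed into the following shell session (a sketch only; substitute `YOUR_GIT_USERNAME` and the branch name `my_contribution` with your own values):

```bash
# Fork the repo on the GitHub interface first, then:
git clone git@github.com:YOUR_GIT_USERNAME/genepy.git
cd genepy
git remote add upstream https://github.com/broadinstitute/genepy

make virtualenv                   # create the virtual environment
source .venv/bin/activate         # activate it
make install                      # install the project in develop mode

git checkout -b my_contribution   # create a working branch
# ... edit files with your preferred editor ...
make fmt                          # format the code with black & isort
make lint                         # run the linters
make test                         # run the tests and generate the coverage report
make docs                         # build the docs locally

git commit -am "fix(package): describe your change here"
git push origin my_contribution   # then open a Pull Request on GitHub
```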
71 | ## Makefile utilities
72 |
73 | This project comes with a `Makefile` that contains a number of useful utilities.
74 |
75 | ```bash
76 | ❯ make
77 | Usage: make
78 |
79 | Targets:
80 | help: ## Show the help.
81 | install: ## Install the project in dev mode.
82 | fmt: ## Format code using black & isort.
83 | lint: ## Run pep8, black, mypy linters.
84 | test: lint ## Run tests and generate coverage report.
85 | watch: ## Run tests on every change.
86 | clean: ## Clean unused files.
87 | virtualenv: ## Create a virtual environment.
88 | release: ## Create a new tag for release.
89 | docs: ## Build the documentation.
90 | switch-to-poetry: ## Switch to poetry package manager.
91 | init: ## Initialize the project based on an application template.
92 | ```
93 |
94 | ## Making a new release
95 |
96 | This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z`.
97 | Every time a new tag is created and pushed to the remote repo, GitHub Actions will
98 | automatically create a new release on GitHub and trigger a release on PyPI.
99 |
100 | For this to work you need to set up a secret called `PYPI_API_TOKEN` under the project Settings > Secrets;
101 | this token can be generated on [pypi.org](https://pypi.org/account/).
102 |
103 | To trigger a new release, all you need to do is:
104 |
105 | 1. If you have changes to add to the repo:
106 | * Make your changes following the steps described above.
107 | * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
108 | 2. Run the tests to ensure everything is working.
109 | 3. Run `make release` to create a new tag and push it to the remote repo.
110 |
111 | `make release` will ask you for the version number to use for the tag, e.g. type `0.1.1` when asked.
112 |
113 | > **CAUTION**: `make release` will change local changelog files and commit all the unstaged changes you have.
114 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/HISTORY.md -------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include HISTORY.md 3 | include Containerfile 4 | graft tests 5 | graft genepy 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')") 3 | USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes") 4 | 5 | .PHONY: help 6 | help: ## Show the help. 7 | @echo "Usage: make " 8 | @echo "" 9 | @echo "Targets:" 10 | @fgrep "##" Makefile | fgrep -v fgrep 11 | 12 | 13 | .PHONY: show 14 | show: ## Show the current environment. 15 | @echo "Current environment:" 16 | @if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi 17 | @echo "Running using $(ENV_PREFIX)" 18 | @$(ENV_PREFIX)python -V 19 | @$(ENV_PREFIX)python -m site 20 | 21 | .PHONY: install 22 | install: ## Install the project in dev mode. 23 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 24 | @echo "Don't forget to run 'make virtualenv' if you got errors." 25 | $(ENV_PREFIX)pip install -e .[test] 26 | 27 | .PHONY: fmt 28 | fmt: ## Format code using black & isort. 29 | $(ENV_PREFIX)isort genepy/ 30 | $(ENV_PREFIX)black -l 79 genepy/ 31 | $(ENV_PREFIX)black -l 79 tests/ 32 | 33 | .PHONY: lint 34 | lint: ## Run pep8, black, mypy linters. 35 | $(ENV_PREFIX)flake8 genepy/ 36 | $(ENV_PREFIX)black -l 79 --check genepy/ 37 | $(ENV_PREFIX)black -l 79 --check tests/ 38 | $(ENV_PREFIX)mypy --ignore-missing-imports genepy/ 39 | 40 | .PHONY: test 41 | test: lint ## Run tests and generate coverage report. 42 | $(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=genepy -l --tb=short --maxfail=1 tests/ 43 | $(ENV_PREFIX)coverage xml 44 | $(ENV_PREFIX)coverage html 45 | 46 | .PHONY: watch 47 | watch: ## Run tests on every change. 48 | ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/ 49 | 50 | .PHONY: clean 51 | clean: ## Clean unused files. 52 | @find ./ -name '*.pyc' -exec rm -f {} \; 53 | @find ./ -name '__pycache__' -exec rm -rf {} \; 54 | @find ./ -name 'Thumbs.db' -exec rm -f {} \; 55 | @find ./ -name '*~' -exec rm -f {} \; 56 | @rm -rf .cache 57 | @rm -rf .pytest_cache 58 | @rm -rf .mypy_cache 59 | @rm -rf build 60 | @rm -rf dist 61 | @rm -rf *.egg-info 62 | @rm -rf htmlcov 63 | @rm -rf .tox/ 64 | @rm -rf docs/_build 65 | 66 | .PHONY: virtualenv 67 | virtualenv: ## Create a virtual environment. 68 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 69 | @echo "creating virtualenv ..." 70 | @rm -rf .venv 71 | @python3 -m venv .venv 72 | @./.venv/bin/pip install -U pip 73 | @./.venv/bin/pip install -e .[test] 74 | @echo 75 | @echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!" 76 | 77 | .PHONY: release 78 | release: ## Create a new tag for release. 79 | @echo "WARNING: This operation will create s version tag and push to github" 80 | @read -p "Version? 
(provide the next x.y.z semver) : " TAG
81 | @echo "creating git tag : $${TAG}"
82 | @git tag $${TAG}
83 | @echo "$${TAG}" > genepy/VERSION
84 | @$(ENV_PREFIX)gitchangelog > HISTORY.md
85 | @git add genepy/VERSION HISTORY.md
86 | @git commit -m "release: version $${TAG} 🚀"
87 | @git push -u origin HEAD --tags
88 | @echo "Github Actions will detect the new tag and release the new version."
89 |
90 | .PHONY: docs
91 | docs: ## Build the documentation.
92 | @echo "building documentation ..."
93 | @$(ENV_PREFIX)mkdocs gh-deploy
94 | URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL
95 |
96 | .PHONY: switch-to-poetry
97 | switch-to-poetry: ## Switch to poetry package manager.
98 | @echo "Switching to poetry ..."
99 | @if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi
100 | @rm -rf .venv
101 | @poetry init --no-interaction --name=a_flask_test --author=rochacbruno
102 | @echo "" >> pyproject.toml
103 | @echo "[tool.poetry.scripts]" >> pyproject.toml
104 | @echo "genepy = 'genepy.__main__:main'" >> pyproject.toml
105 | @cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
106 | @cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
107 | @poetry install --no-interaction
108 | @mkdir -p .github/backup
109 | @mv requirements* .github/backup
110 | @mv setup.py .github/backup
111 | @echo "You have switched to https://python-poetry.org/ package manager."
112 | @echo "Please run 'poetry shell' or 'poetry run genepy'"
113 |
114 | .PHONY: init
115 | init: ## Initialize the project based on an application template.
116 | @./.github/init.sh
117 |
118 |
119 | # This project has been generated from rochacbruno/python-project-template
120 | # __author__ = 'rochacbruno'
121 | # __repo__ = https://github.com/rochacbruno/python-project-template
122 | # __sponsor__ = https://github.com/sponsors/rochacbruno/
123 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # genepy
2 |
3 | _what is [genepy](https://en.wikipedia.org/wiki/G%C3%A9n%C3%A9pi)?_
4 |
5 | A set of awesome functions & tools for Computational Geneticists
6 |
7 | ![long genome](documentation/genome.jpg)
8 |
9 | ## Content
10 |
11 | - **utils**: where a bunch of helper functions and useful general scripts are stored
12 | - **plots**: a set of plotting tools based on [matplotlib]() and [bokeh]() to make volcano plots / CNV maps etc.
13 | - **helper**: additional helper functions to save data, merge dataframes...
14 | - **terra**: contains a set of functions that use [dalmatian]() to interact with the [GCP]()-powered genomics HPC platform [Terra]().
15 | - **sequencing**: contains a set of functions to work with bed/bam/fastq files...
16 | - **rna**: contains functions to work with RNAseq (and related) data.
17 | - **pyDESeq2**: a Python integration of [deseq2]() (the differential expression analyser) with [rpy2]()
18 | - **mutations**: a set of functions to work with maf files, vcf files, etc.
19 | - **google**: functions and packages linked to Google's APIs
20 | - **google_sheet**: functions to upload a dataframe as a Google Sheet
21 | - **gcp**: a set of functions to interact with Google Cloud Storage (relies on `gsutil`)
22 | - **epigenetics**: where we have things related to epigenomics
23 | - **chipseq**: has functions to read, merge, and denoise ChIP-seq data.
24 | - **plot**: has functions to plot ChIP-seq data.
25 |
26 | ### Helper tools
27 |
28 | _tools that you do not need to use directly as they have binding functions in genepy._
29 |
30 | - **epigenetics/rose**: where an updated version of the ROSE algorithm is stored (as a git submodule)
31 | - **cell_line_mapping-master/python/cell_line_mapper**: a set of functions to map cell line IDs to other cell line IDs based on an up-to-date Google spreadsheet.
32 |
33 |
34 | ## Install
35 |
36 | ### with pip
37 |
38 | `pip install broad-genepy`
39 |
40 | and then use with `from genepy.utils/epigenetics/... import ...`
41 |
42 | Please see the next step to get access to all bindings and tools.
43 |
44 | ### dev mode
45 |
46 | ```bash
47 | git clone git://github.com/BroadInstitute/genepy.git
48 | pip install -e genepy
49 | ```
50 |
51 | Then you can import modules in Python with, e.g.:
52 | ```python
53 | from genepy import terra
54 | from genepy.utils import helper as h
55 | from genepy.google import gcp
56 | from genepy.utils import plot
57 | from genepy.epigenetics import chipseq
58 |
59 | ```
60 |
61 | ## installation: to get access to all bindings and tools
62 |
63 | Install the following tools:
64 | - [gcloud](https://cloud.google.com/sdk/docs/install-sdk)
65 | - [firecloud-dalmatian](https://github.com/getzlab/dalmatian)
66 | - [gsheets](https://github.com/xflr6/gsheets)
67 | - [htslib/samtools](http://www.htslib.org/)
68 | - [bwa](https://github.com/lh3/bwa)
69 | and, used just once:
70 | - [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
71 |
72 | Some of these packages, like gsheets, gcloud, and firecloud-dalmatian, will require you to create Google accounts, log in on your machine, or download OAuth files.
73 |
74 | Finally, you can install the R packages (GSEABase, erccdashboard, GSVA, DESeq2):
75 |
76 | ```bash
77 | R -e 'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")};BiocManager::install(c("GSEABase", "erccdashboard", "GSVA", "DESeq2"));'
78 | ```
79 |
80 | ## data:
81 |
82 | hg38 genome sizes: from https://github.com/igvteam/igv/blob/master/genomes/sizes/hg38.chrom.sizes
83 |
84 | ## About
85 |
86 | Please do contribute; we do not have time to fix all issues or work on feature requests.
87 |
88 | Jeremie Kalfon jkalfon@broadinstitute.org jkobject@gmail.com https://jkobject.com
89 |
90 | Apache license 2.0.
91 | -------------------------------------------------------------------------------- /data/Annotations.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/data/Annotations.RData -------------------------------------------------------------------------------- /data/variantFilter/snp_indels_rescue_list.txt: -------------------------------------------------------------------------------- 1 | gene Chromosome Start_position end type classification ref_allele newbase Protein_Change patient 2 | RPL22 1 6257785 6257785 Frame_Shift_Del DEL T - p.K16fs fh_22RV1_PROSTATE-Tumor 3 | SF3B2 11 65819899 65819900 In_Frame_Ins INS - GCC p.21_22insP fh_HEC6_ENDOMETRIUM-Tumor 4 | -------------------------------------------------------------------------------- /docs/genepy.md: -------------------------------------------------------------------------------- 1 | # Reference 2 | 3 | ::: genepy.utils.helper 4 | 5 | ::: genepy.utils.plot 6 | 7 | ::: genepy.epigenetics.chipseq -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](https://www.mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs -h` - Print help message and exit. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 
18 | -------------------------------------------------------------------------------- /documentation/genome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/documentation/genome.jpg -------------------------------------------------------------------------------- /genepy/VERSION: --------------------------------------------------------------------------------
1 | 1.2.7
2 | -------------------------------------------------------------------------------- /genepy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/__init__.py -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/.gitignore: --------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *~
6 | celllinemapr/SOP.html
7 | __pycache__
8 | python/.cache
9 | *.egg-info
10 | *.pyc
11 | python/dist
12 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/.travis.yml: --------------------------------------------------------------------------------
1 | language: python
2 | cache:
3 | directories:
4 | - $HOME/.cache/pip
5 |
6 | python:
7 | - 3.5
8 |
9 | install:
10 | - pip install -r requirements.txt
11 |
12 | script:
13 | - set -e && pytest && set +e -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/README.md: --------------------------------------------------------------------------------
1 | # cell_line_mapping
2 | Code for mapping between different CCLE/DepMap cell line identifiers
3 |
4 | ## Installation
5 | ### R
6 | ```
7 | options(repos = c(
8 | "https://iwww.broadinstitute.org/~datasci/R-packages",
9 | "https://cran.cnr.berkeley.edu"))
10 | install.packages('celllinemapr')
11 | ```
12 | As one may not have access to the intranet to download the name mapping, the name mapping file (`naming.csv`) is provided directly and can be put in place by executing this command:
13 | `mkdir ~/.celllinemapr && mkdir ~/.celllinemapr/data && cp naming.csv ~/.celllinemapr/data`
14 |
15 | ### Python
16 | ```
17 | pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-latest.tar.gz
18 | ```
19 |
20 | ## Usage
21 | See [here](https://github.com/broadinstitute/cell_line_mapping/blob/master/celllinemapr/SOP.Rmd) for examples of functions for the R package.
22 | Function names for the Python package are analogous to those for R, replacing `.` with `_`.
For instance, 23 | 24 | R: `ccle.to.arxspan` 25 | 26 | Python: `ccle_to_arxspan` 27 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/.celllinemapr.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/cell_line_mapping-master/celllinemapr/.celllinemapr.Rds -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: celllinemapr 2 | Title: Functions for mapping between cell line IDs 3 | Version: 0.1 4 | Authors@R: c(person("Philip", "Montgomery", email="pmontgom@broadinstitute.org", role = c("aut", "cre"))) 5 | Description: Streamline mapping between cell line IDs using the mapping which is periodically generated from ArxSpan and stored at https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv 6 | Depends: R (>= 3.3.0) 7 | License: CC0 8 | Encoding: UTF-8 9 | LazyData: true 10 | RoxygenNote: 6.0.1 11 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(arxspan.to.ccle) 4 | export(ccle.to.arxspan) 5 | export(ccle.to.latest) 6 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/R/cell_line_mapping.R: -------------------------------------------------------------------------------- 1 | find.non.unique <- function(x) { 2 | b <- table(x) > 1 3 | names(b)[b] 4 | } 5 | 6 | make.mapping <- function(full.map, input.type, output.type, id.subset, check.unique.mapping) { 7 | x <- as.character(full.map[[output.type]]) 8 | n <- as.character(full.map[[input.type]]) 9 | 10 | mask <- n %in% id.subset 11 | 12 | # collapse duplicate rows 13 | df <- data.frame(x=x[mask], n=n[mask], stringsAsFactors =F) 14 | df <- unique(df) 15 | 16 | x <- df$x 17 | n <- df$n 18 | 19 | if(check.unique.mapping) { 20 | non.unique.inputs <- find.non.unique(n) 21 | non.unique.outputs <- find.non.unique(x) 22 | if(length(non.unique.inputs) > 0) { 23 | stop(paste0("The following had nonunique values: ", paste0(non.unique.inputs, collapse=", "))) 24 | } 25 | 26 | if(length(non.unique.outputs) > 0) { 27 | stop(paste0("The following had nonunique values: ", paste0(non.unique.outputs, collapse=", "))) 28 | } 29 | } 30 | 31 | names(x) <- n 32 | x 33 | } 34 | 35 | read.mapping <- (function() { 36 | # save value from previous read to avoid fetching from url every time 37 | cell.line.mapping.cache <- NULL 38 | mapping.url <- getOption("celllinemapr.url", "../naming.csv") 39 | cache.path <- getOption("celllinemapr.cache.path", "../.celllinemapr.Rds") 40 | 41 | function(force=F) { 42 | if(is.null(cell.line.mapping.cache) || force) { 43 | mapping <- try(read.csv("../naming.csv")) 44 | if(class(mapping) == "try-error") { 45 | # if we got an error, then warn user that this failed and 
try loading from cache file. 46 | warning(paste0("Could not fetch mapping from ", mapping.url, ", attempting to read most recent cached mapping from ", cache.path)) 47 | mapping <- readRDS(cache.path) 48 | } else { 49 | stopifnot(is.data.frame(mapping)) 50 | saveRDS(mapping, file=cache.path) 51 | } 52 | stopifnot(is.data.frame(mapping)) 53 | cell.line.mapping.cache <<- mapping 54 | } 55 | cell.line.mapping.cache 56 | } 57 | })() 58 | 59 | 60 | name.mapper <- function(input.type, input.names, output.type, ignore.problems, check.unique.mapping, read.mapping.fn) { 61 | full.mapping <- read.mapping.fn() 62 | mapping <- make.mapping(full.mapping, input.type, output.type, input.names, check.unique.mapping) 63 | result <- mapping[input.names] 64 | if(!ignore.problems) { 65 | bad.names <- input.names[is.na(result)] 66 | if(length(bad.names) > 5) { 67 | bad.names <- c(bad.names[1:5], "...") 68 | } 69 | if(length(bad.names) > 0) { 70 | stop(paste0("Could not find cell lines (searching by ", input.type, ") for ", paste(bad.names, collapse=", "))) 71 | } 72 | } 73 | result 74 | } 75 | 76 | # returns a function to get cell line mapping. Returns the default function if 77 | pick.mapping.fn <- function(mapping) { 78 | if(!is.null(mapping)) { 79 | stopifnot(is.data.frame(mapping)) 80 | return (function() { 81 | return (mapping) 82 | } ) 83 | } else { 84 | return(read.mapping) 85 | } 86 | } 87 | 88 | #' Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names 89 | #' 90 | #' @param arxspan.ids A vector of arxspan ids. These are always of the form "ACH-XXXXXX" 91 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 92 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same CCLE name (which could cause issues downstream) 93 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 94 | #' @examples 95 | #' ccle_names <- arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 96 | #' @export arxspan.to.ccle 97 | arxspan.to.ccle <- function(arxspan.ids, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 98 | name.mapper('broad_id', arxspan.ids, 'canonical_ccle_name', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 99 | } 100 | 101 | #' Map ccle names to Broad ID (aka ArxSpan IDs) 102 | #' 103 | #' @param ccle.names A vector of CCLE names 104 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 105 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same arxspan id (which could cause issues downstream) 106 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 107 | #' @examples 108 | #' broad_ids <- ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 109 | #' @export ccle.to.arxspan 110 | ccle.to.arxspan <- function(ccle.names, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 111 | name.mapper('ccle_name', ccle.names, 'broad_id', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 112 | } 113 | 114 | #' Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed. 
115 | #' 116 | #' @param ccle.names A vector of CCLE names 117 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 118 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same ccle name (which could cause issues downstream) 119 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 120 | #' @examples 121 | #' ccle_names <- ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 122 | #' @export ccle.to.latest 123 | ccle.to.latest <- function(arxspan.ids, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 124 | name.mapper('ccle_name', arxspan.ids, 'canonical_ccle_name', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 125 | } 126 | 127 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/SOP.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'SOP: Mapping cell line IDs' 3 | author: "Philip montgomery" 4 | date: "7/11/2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## R Markdown 13 | 14 | This SOP describes how to use the R library for mapping cell line identifiers based on the situation. The authorative source for all cell line names and mappings is recorded in ArxSpan. However, it is difficult to query ArxSpan so every day, we export the mapping from arxspan and publish it internally as a CSV file at https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv 15 | 16 | The R package celllinemapr pulls directly from this URL automatically. 17 | 18 | ### Loading legacy data which contains CCLE names 19 | 20 | Moving forward, we are tracking cell lines by their Broad IDs which get assigned when the line is registered into 21 | ArxSpan. (As a result, we sometimes refer to these as ArxSpan IDs and are always of the form "ACH-XXXXXX") 22 | 23 | In order to join old data which has CCLE names to a dataset which use Broad IDs, you will need to remap the CCLE names to the Broad IDs. This can be done via ccle.to.arxspan() 24 | 25 | ```{r ccle.to.arxspan} 26 | library(celllinemapr) 27 | ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 28 | ``` 29 | 30 | ### Getting latest CCLE names 31 | 32 | Also, since CCLE names can change, the current name for a line may be different than with the one when a dataset was created. If you wish to get the latest name for a line you can use ccle.to.latest() 33 | 34 | ```{r ccle.to.latest} 35 | ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 36 | ``` 37 | 38 | ### Looking up CCLE names by Broad ID 39 | 40 | The Broad ID are opaque and so often you will want a human readable label when reporting info about cell lines. One 41 | can map back to CCLE name via arxspan.to.ccle(). 42 | 43 | ```{r arxspan.to.ccle} 44 | arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 45 | ``` 46 | 47 | ### Using existing mapping 48 | 49 | The mapping functions only work within the Broad's internal network. You can use these methods and if you cannot reach the internal network, they will use the most recently cached mapping. 50 | 51 | However, if you're running this code on a machine which does not have a cached mapping, you'll need to provide a copy of it yourself. 
You can do this by providing the "mapping" parameter to any of these methods. 52 | 53 | ```{r arxspan.to.ccle.mapping} 54 | map.df = data.frame(ccle_name=c("A101D_SKIN", "LS513_LARGE_INTESTINE"), 55 | canonical_ccle_name=c("A101D_FAKEMAPPING", "LS513_FAKEMAPPING"), 56 | broad_id=c('ACH-000008', "ACH-000007")) 57 | 58 | arxspan.to.ccle(c('ACH-000007', 'ACH-000008'), mapping=map.df) 59 | ``` 60 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/celllinemapr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/arxspan.to.ccle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{arxspan.to.ccle} 4 | \alias{arxspan.to.ccle} 5 | \title{Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names} 6 | \usage{ 7 | arxspan.to.ccle(arxspan.ids, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{arxspan.ids}{A vector of arxspan ids. These are always of the form "ACH-XXXXXX"} 11 | 12 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead.} 13 | 14 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same CCLE name (which could cause issues downstream)} 15 | } 16 | \description{ 17 | Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names 18 | } 19 | \examples{ 20 | ccle_names <- arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/ccle.to.arxspan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{ccle.to.arxspan} 4 | \alias{ccle.to.arxspan} 5 | \title{Map ccle names to Broad ID (aka ArxSpan IDs)} 6 | \usage{ 7 | ccle.to.arxspan(ccle.names, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{ccle.names}{A vector of CCLE names} 11 | 12 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. 
If you set to True, then you'll get NA for unknown lines instead.} 13 | 14 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same arxspan id (which could cause issues downstream)} 15 | } 16 | \description{ 17 | Map ccle names to Broad ID (aka ArxSpan IDs) 18 | } 19 | \examples{ 20 | broad_ids <- ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/ccle.to.latest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{ccle.to.latest} 4 | \alias{ccle.to.latest} 5 | \title{Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed.} 6 | \usage{ 7 | ccle.to.latest(arxspan.ids, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead.} 11 | 12 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same ccle name (which could cause issues downstream)} 13 | 14 | \item{ccle.names}{A vector of CCLE names} 15 | } 16 | \description{ 17 | Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed. 18 | } 19 | \examples{ 20 | ccle_names <- ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/cell_line_mapper/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from collections import defaultdict, Counter 3 | import pandas as pd 4 | from io import StringIO 5 | 6 | csv_url = 'https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv' 7 | __version__ = "0.1" 8 | 9 | def read_file_to_dict(key_name, value_name): 10 | return_dict = defaultdict(str) 11 | 12 | response = requests.get(csv_url) 13 | assert response.status_code == 200, "Could not fetch mapping from {}, got, status_code={} reason={}".format(csv_url, response.status_code, response.reason) 14 | mapping_request = StringIO(response.text) 15 | df = pd.read_csv(mapping_request) 16 | 17 | for row in df.to_dict(orient='records'): 18 | if return_dict[row[key_name]] == "": 19 | return_dict[row[key_name]] = row[value_name] 20 | else: 21 | if row[value_name] != return_dict[row[key_name]]: 22 | return_dict[row[key_name]] += str(", " + row[value_name]) 23 | 24 | return return_dict 25 | 26 | 27 | def check_unique(input_list, result_list): 28 | input_len = len(input_list) 29 | # make sure one key doesn't point to multiple values 30 | is_unique = True 31 | for mapping in result_list: 32 | if mapping is not None and ',' in mapping: 33 | is_unique = False 34 | break 35 | 36 | # make sure two keys don't point to the same value 37 | if is_unique: 38 | num_unique_values = len(Counter(tuple(mapping) for mapping in result_list if mapping is not None)) + int(None in result_list) 39 | 40 | is_unique = (input_len == num_unique_values) 41 | 42 | if not is_unique: 43 | raise RuntimeError('Mappings are not unique') 44 | 45 | 46 | def 
name_mapper(input_type, input_names, output_type, ignore_problems=False, check_unique_mapping=True): 47 | if len(input_names) == 0: 48 | raise RuntimeError("Please input a non-empty list") 49 | 50 | mapping_dict = read_file_to_dict(input_type, output_type) 51 | output_names = [] 52 | 53 | for i_name in input_names: 54 | o_name = mapping_dict.get(i_name) 55 | if not ignore_problems and o_name is None: 56 | raise KeyError(output_type + " " + "could not be found for the following " + input_type + ": " + i_name) 57 | output_names.append(o_name) 58 | 59 | if check_unique_mapping: 60 | check_unique(input_names, output_names) 61 | 62 | assert len(output_names) == len(input_names) 63 | return output_names 64 | 65 | 66 | def arxspan_to_ccle(arxspan_ids, ignore_problems=False, check_unique_mapping=True): 67 | return name_mapper('broad_id', arxspan_ids, 'canonical_ccle_name', ignore_problems, check_unique_mapping) 68 | 69 | 70 | def ccle_to_arxspan(ccle_names, ignore_problems=False, check_unique_mapping=True): 71 | return name_mapper('ccle_name', ccle_names, 'broad_id', ignore_problems, check_unique_mapping) 72 | 73 | 74 | def ccle_to_latest(ccle_names, ignore_problems=False, check_unique_mapping=True): 75 | return name_mapper('ccle_name', ccle_names, 'canonical_ccle_name', ignore_problems, check_unique_mapping) 76 | 77 | # alias for the old name of this function 78 | latest_ccle_names=ccle_to_latest 79 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/cell_line_mapper/test_mapper.py: -------------------------------------------------------------------------------- 1 | from . import name_mapper, arxspan_to_ccle, ccle_to_arxspan, latest_ccle_names, csv_url 2 | import cell_line_mapper 3 | import pytest 4 | import pandas as pd 5 | import requests 6 | from io import StringIO 7 | from collections import defaultdict 8 | import csv 9 | ################################################################################## 10 | ### TEST ARXSPAN_TO_CCLE 11 | #just the first three broadid's in the doc 12 | 13 | def test_real_data_fetch_check(): 14 | return_dict = defaultdict(set) 15 | 16 | mapping_request = StringIO(requests.get(csv_url).text) 17 | df = pd.read_csv(mapping_request) 18 | assert list(df) == ['ccle_name', 'canonical_ccle_name', 'broad_id'] 19 | 20 | 21 | ################################################################################## 22 | 23 | @pytest.fixture() 24 | def fake_mapping_csv(monkeypatch): 25 | def mock_read_file_to_dict(key_name, value_name): 26 | return_dict = defaultdict(str) 27 | 28 | with open('test-data.csv', mode='r') as csvfile: 29 | map_reader = csv.DictReader(csvfile) 30 | for rows in map_reader: 31 | if return_dict[rows[key_name]] == "": 32 | return_dict[rows[key_name]] = rows[value_name] 33 | else: 34 | return_dict[rows[key_name]]+=str(", "+rows[value_name]) 35 | print(return_dict) 36 | return return_dict 37 | 38 | monkeypatch.setattr(cell_line_mapper, 'read_file_to_dict', mock_read_file_to_dict) 39 | 40 | 41 | ################################################################################## 42 | ### TEST ARXSPAN_TO_CCLE 43 | # just the first three broadid's in the doc 44 | def test_arxspan_to_ccle_first_three_rows(fake_mapping_csv): 45 | assert arxspan_to_ccle(["1", "2", "3"]) == ["A", "B", "C"] 46 | 47 | 48 | # 1. 
unmappable ID and ignore_problems == false 49 | def test_arxspan_to_ccle_unmappable_ID_ignore_problems_false(fake_mapping_csv): 50 | with pytest.raises(KeyError) as excinfo: 51 | arxspan_to_ccle(["1", "madeupfakename"]) 52 | assert "canonical_ccle_name could not be found for the following broad_id: madeupfakename" in str(excinfo.value) 53 | 54 | 55 | # 2. unmappable ID and ignore_problems == true 56 | def test_arxspan_to_ccle_unmappable_ID_ignore_problems_true(fake_mapping_csv): 57 | assert arxspan_to_ccle(["1", "madeupfakename"], True) == ["A", None] 58 | 59 | 60 | # 2.5. arxspan id has multiple ccle names 61 | def test_arxspan_to_ccle_one_key_many_values(fake_mapping_csv): 62 | assert arxspan_to_ccle(["7"], True, False) == ["D, F"] 63 | 64 | 65 | # 3. unique elements in arxspan_ids do not map to unique ccle names and check_unique_mapping == false 66 | def test_arxspan_to_ccle_nonunique_mapping_check_unique_mapping_false(fake_mapping_csv): 67 | assert arxspan_to_ccle(["4", "5"], True, False) == ["D", "D"] 68 | 69 | 70 | # 4. unique elements in arxspan_ids do not map to unique ccle names and check_unique_mapping == true 71 | def test_arxspan_to_ccle_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 72 | with pytest.raises(RuntimeError) as excinfo: 73 | arxspan_to_ccle(["4", "5"], True, True) 74 | assert 'Mappings are not unique' in str(excinfo.value) 75 | 76 | 77 | ################################################################################## 78 | ### TEST CCLE_ARXSPAN 79 | # 0. Just the first three rows 80 | def test_ccle_to_arxspan_first_three_rows(fake_mapping_csv): 81 | assert ccle_to_arxspan(["a", "B", "c"]) == ["1", "2", "3"] 82 | 83 | 84 | # 1. unmappable ccle name and ignore_problems == false 85 | def test_ccle_to_arxspan_unmappable_ID_ignore_problems_false(fake_mapping_csv): 86 | with pytest.raises(KeyError) as excinfo: 87 | ccle_to_arxspan(["a", "madeupfakename", "idk"], False) 88 | assert "broad_id could not be found for the following canonical_ccle_name: madeupfakename" in str(excinfo.value) 89 | 90 | 91 | # 2. unmappable ccle name and ignore_problems == true 92 | def test_ccle_to_arxspan_unmappable_ID_ignore_problems_true(fake_mapping_csv): 93 | assert ccle_to_arxspan(["a", "madeupfakename"], True) == ["1", None] 94 | 95 | 96 | # 2.5. ccle name has multiple arxspan_ids 97 | def test_ccle_to_arxspan_one_key_many_values(fake_mapping_csv): 98 | assert ccle_to_arxspan(["d"], True, False) == ["4, 5, 7"] 99 | 100 | 101 | # 3. unique ccle names do not map to unique arxspan_ids and check_unique_mapping == false 102 | def test_ccle_to_arxspan_nonunique_mapping_check_unique_mapping_false(fake_mapping_csv): 103 | assert ccle_to_arxspan(["e"], True, False) == ["6, 7"] 104 | 105 | 106 | # 4. unique ccle names do not map to unique arxspan_ids and check_unique_mapping == true 107 | def test_ccle_to_arxspan_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 108 | with pytest.raises(RuntimeError) as excinfo: 109 | a = ccle_to_arxspan(["e"], True, True) 110 | assert 'Mappings are not unique' in str(excinfo.value) 111 | 112 | 113 | ################################################################################## 114 | ### TEST LATEST_CCLE_NAMES 115 | 116 | # 0. Just a few samples that have different names 117 | def test_latest_ccle_names_first_three_rows(fake_mapping_csv): 118 | assert latest_ccle_names(["a", "B", "c"]) == ["A", "B", "C"] 119 | 120 | 121 | # 1. 
unmappable ccle name and ignore_problems == false 122 | def test_latest_ccle_names_unmappable_ignore_problems_false(fake_mapping_csv): 123 | with pytest.raises(KeyError) as excinfo: 124 | latest_ccle_names(["madeupfakename"], False) 125 | assert "canonical_ccle_name could not be found for the following ccle_name: madeupfakename" in str(excinfo.value) 126 | 127 | 128 | # 2. unmappable ccle name and ignore_problems == true 129 | def test_latest_ccle_names_unmappable_ignore_problems_true(fake_mapping_csv): 130 | assert latest_ccle_names(["madeupfakename", "a"], True) == [None, "A"] 131 | 132 | 133 | # 2.5 old ccle name maps to multiple latest names 134 | def test_latest_ccle_names_one_key_many_values(fake_mapping_csv): 135 | assert latest_ccle_names(["e"], True, False) == ["E, F"] 136 | 137 | def test_latest_ccle_names_one_key_many_values_check_unique_mapping_true(fake_mapping_csv): 138 | with pytest.raises(RuntimeError) as excinfo: 139 | latest_ccle_names(["e"], True, True) 140 | assert 'Mappings are not unique' in str(excinfo.value) 141 | 142 | # 4. unique ccle names do not map to unique latest names and check_unique_mapping == true 143 | def test_latest_ccle_names_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 144 | with pytest.raises(RuntimeError) as excinfo: 145 | latest_ccle_names(["d", "dd", "ddd"], True, True) 146 | assert 'Mappings are not unique' in str(excinfo.value) 147 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.23.1 2 | pytest==3.6.1 3 | requests==2.20.0 -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import ast 4 | import re 5 | from setuptools import setup, find_packages 6 | 7 | _version_re = re.compile(r'__version__\s*=\s*(.*)') 8 | 9 | with open('cell_line_mapper/__init__.py', 'rt') as f: 10 | version = str(ast.literal_eval(_version_re.search( 11 | f.read()).group(1))) 12 | 13 | setup(name='cell_line_mapper', 14 | version=version, 15 | description='Functions for mapping between cell line identifiers', 16 | author='Phoebe Moh', 17 | author_email='pmoh@broadinstitute.org', 18 | install_requires=['pandas', 'requests'], 19 | packages=find_packages() 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/test-data.csv: -------------------------------------------------------------------------------- 1 | ccle_name,canonical_ccle_name,broad_id 2 | a,A,1 3 | B,B,2 4 | c,C,3 5 | d,D,4 6 | d,D,5 7 | d,D,7 8 | e,E,6 9 | e,F,7 10 | -------------------------------------------------------------------------------- /genepy/epigenetics/CREME.md: -------------------------------------------------------------------------------- 1 | # genepy/CREME: ChIP REplicate MErger 2 | 3 | CREME is part of the [genepy](https://github.com/broadinstitute/GenePy) package. 4 | 5 | For Introduction we will link to the [article](https://ro-che.info/articles/2018-07-11-chip-seq-consensus) by Roman Cheplyaka on the subject. 6 | 7 | We built this tool noticing the lack of publicly available simple Chip Merging tool working for [MACS2](https://github.com/macs3-project/MACS)'s output, with replicates of broadly different quality. 
We wanted a one-function tool that would work in Python. 8 | 9 | We will nonetheless note tools such as: 10 | - [PePr](https://pubmed.ncbi.nlm.nih.gov/24894502/) [code](https://github.com/shawnzhangyx/PePr), which can substitute for MACS2 by calling peaks on multiple bam files at the same time. It works by counting reads and looking at the peak shape. 11 | - [multiGPS](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003501), [code](https://github.com/seqcode/multigps), which is mostly for differential-binding ChIP-seq but can work with replicates; it works in Java + R. 12 | - [MSPC](https://academic.oup.com/bioinformatics/article/31/17/2761/183989), [code](https://github.com/Genometric/MSPC) in .NET, which is very well documented, simple, and provides some QC to the user. 13 | 14 | - [genoGAM](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2238-7) in R, [code](https://github.com/gstricker/GenoGAM), which calls peaks by itself as well and seems to handle replicates. 15 | 16 | - [sierra Platinum](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5025614/), [code](https://github.com/sierraplatinum/sierra), which does not seem to be maintained. 17 | 18 | ## Our tool 19 | 20 | The goal of CREME is to be a simple, one-function tool. It works with one to many sets of replicates for each pulled-down protein/mark. 21 | 22 | CREME takes as input a pandas dataframe. This dataframe is the concatenation of each replicate's bed files and can be loaded from a set of MACS2 bedfiles using genepy's loadPeaks function. 23 | 24 | CREME will output, among other things, a dataframe representing a concatenation of bedfiles of merged replicates. 25 | 26 | ## Process 27 | 28 | ### Selection: Finding the best replicate 29 | 30 | A first goal of CREME is to find the best replicate. To do so, it can take manual annotations of _BAD_ (bad/lower-quality) replicates. These can be provided by visual inspection of bigwig tracks + bed files on IGV, or by thresholding on QC results such as FRiP scores. 31 | 32 | ![plot igv](docsCREME/igv-app-MED1-zoom.png) 33 | 34 | Given all available replicates, CREME will compute a consensus, considering any peak at most 150 bp from another peak to be in overlap. We have noticed that changing this parameter from 0 to 150 decreased the total number of peaks found by only 8%. 35 | 36 | Non-overlapping peaks are kept in the consensus. When we have an overlap, we take the mean of signals and the product of p-values across overlapping replicates. 37 | 38 | ![plot venn](docsCREME/MED1_before_venn_venn.png) 39 | 40 | Then, CREME will look at the replicates' overlaps and select the one that has the best overlap score: 41 | 42 | $O_{score}(A) = \sum_{i=0}^{m} \sum_{K \in comb(i, G)} i \cdot \sum_{j=0}^{n} AND(A[j], K_1[j], \dots, K_i[j])$ 43 | 44 | Where: 45 | - $G$ is a binary matrix of size (rows $\times$ columns) $m \times n$, for $m$ replicates and $n$ consensus peaks, with a value of 1 if replicate $m_i$ has a peak on consensus peak $n_j$. 46 | - $comb(i, G)$ is a list of all possible matrices made by taking $i$ elements (rows) from matrix $G$ without replacement. 47 | - $AND$ is a binary operation returning 1 if all passed elements are 1, else 0. 48 | 49 | The non-bad-quality replicate with the best score is selected as the __main replicate__. 50 | 51 | In addition to the Venn diagram, the correlation between each replicate's peak signals is computed and displayed to the user.
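To make the scoring above concrete, here is a minimal sketch of how this overlap score can be computed; it mirrors the logic of `findBestPeak` in `CREME.py`, and the toy `presence` values below are made up for illustration.

```python
# Sketch of the overlap score O_score described above (mirrors findBestPeak in CREME.py).
# presence[r] is the set of consensus-peak indices called in replicate r.
from itertools import combinations

def overlap_score(presence, ind):
    """Score replicate `ind` by how much it agrees with every combination of the other replicates."""
    others = [s for j, s in enumerate(presence) if j != ind]
    score = len(presence[ind])  # peaks found in the replicate itself
    for i in range(1, len(others) + 1):
        for combo in combinations(others, i):
            shared = set.intersection(presence[ind], *combo)
            score += len(shared) * (i + 1)  # agreements involving more replicates weigh more
    return score

presence = [{0, 1, 2, 3}, {1, 2, 3}, {2, 3, 9}]  # toy example: 3 replicates over 10 consensus peaks
best = max(range(len(presence)), key=lambda r: overlap_score(presence, r))
print(best)  # index of the candidate main replicate
```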
52 | 53 | ![pairplot of replicates](docsCREME/MED1_before_pairplot.png) 54 | 55 | ### Validation: Finding new peaks 56 | 57 | For each additional replicate S, we will now look for new peaks. 58 | First, if we find that the second-best replicate and the best replicate each have less than 30% of their peaks in common, we __discard__ that protein/mark and only return the main replicate. 59 | 60 | Taking peaks that are found in the main replicate, we call peaks using S's bigwig and a lower threshold than what MACS2 uses by default. We then do the same for peaks in S that were not in the main replicate. 61 | 62 | If, after calling new peaks, we still get less than 30% overlap in both replicates, we discard the replicate. 63 | 64 | Otherwise, we finalize the merging of overlapping peaks and update the __main replicate__ with this overlap. 65 | 66 | ### Calling Peaks 67 | 68 | The process of calling peaks is loosely based on MACS2's peak-calling algorithm (see the sketch at the end of this document): 69 | 70 | We compute a distance, the [KL divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), between two Poisson distributions. One represents the distribution of signal from a bigwig file under a region. The other represents the same signal under the entire chromosome where that region lies. The region here is the peak in the other sample that we want to look for in the current sample. 71 | 72 | If that distance is above a threshold (here, 8), we validate the region as a peak. 73 | 74 | ### Output and QC 75 | 76 | The output of our tool is a dataframe of concatenated merged replicates. The pipeline also outputs a set of bad-quality replicates and bad-quality proteins/marks. 77 | 78 | Additionally, information on the distribution of peak signal across replicates and the number of peaks found is provided to the user. 79 | 80 | ![kdeplot of new found peaks](docsCREME/MED1_new_found_peaks_kdeplot.png) 81 | 82 | ## WIP and current issues 83 | 84 | 1. For now, we are not using the exact same algorithm as MACS2, as we are comparing the peak's read distribution to overall reads in the chromosome using the KL divergence. MACS2, in contrast, compares 4 terms: the distribution in the likely region of the sample BAM, the distribution in the likely region of the INPUT BAM, the distribution in the sample BAM's chromosome, and the distribution in the INPUT BAM's chromosome. Moreover, MACS2 compares them using something like a Fisher's exact test and corrects for FDR using the BH method. 85 | 86 | 2. For now, we are not computing a perfect overall replicate quality ourselves. Our scoring method did not work in 5% of cases. We might want to mitigate this by adding peaks' q-values and the replicate's FRiP score and total read count to our analysis. 87 | 88 | 3. For now, we do not compute a p-value when we call new peaks. 89 | 90 | 4. For now, we do not integrate the p-value/signal of newly found peaks in the consensus merger. 91 | 92 | 5. Longer term: we hope to do something more akin to joint calling across replicates, using graphical models to call peaks.
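As a concrete illustration of the peak-validation distance described in the Calling Peaks section, here is a minimal, self-contained sketch. It uses the same KL formula as `findAdditionalPeaks` in `CREME.py`; the Poisson rates are fitted here simply by the sample mean (the maximum-likelihood estimate for a Poisson rate, where `CREME.py` fits it numerically), and the toy coverage values are made up.

```python
# Sketch of the Poisson/KL peak-validation step (see findAdditionalPeaks in CREME.py).
import numpy as np

def kl_poisson(lamb1, lamb2):
    # KL divergence between two Poisson distributions with rates lamb1 and lamb2
    return lamb1 * np.log(lamb1 / lamb2) + lamb2 - lamb1

def is_peak(region_signal, chrom_signal, min_kl=8):
    """Validate a candidate region by comparing its coverage rate to the chromosome background."""
    lamb_region = np.mean(region_signal)  # Poisson rate under the candidate region
    lamb_chrom = np.mean(chrom_signal)    # Poisson rate under the whole chromosome
    return kl_poisson(lamb_region, lamb_chrom) > min_kl

# toy example: strong local enrichment over a flat background
region = np.array([30.0, 42.0, 55.0, 38.0, 47.0])
background = np.array([2.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0, 1.0])
print(is_peak(region, background))  # True for this made-up signal
```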
93 | -------------------------------------------------------------------------------- /genepy/epigenetics/CREME.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from genepy.utils import helper as h 5 | from genepy.utils import plot 6 | from genepy.epigenetics.chipseq import * 7 | import seaborn as sns 8 | import pyBigWig 9 | import matplotlib.pyplot as plt 10 | from scipy.optimize import minimize 11 | from scipy.special import factorial 12 | import warnings 13 | import itertools 14 | 15 | 16 | def findpeakpath(folder, proteiname): 17 | """ 18 | given a folder of bigwigs and a protein name, finds the right bigwig 19 | """ 20 | res = None 21 | for val in os.listdir(folder): 22 | if str(proteiname) in val: 23 | if res: 24 | raise ValueError('more than 1 bigwig file found') 25 | res = val 26 | if res: 27 | return res 28 | raise ValueError('no bigwig file found') 29 | 30 | 31 | def findBestPeak(presence): 32 | """ 33 | given a list of -sets of peak locations for each replicate- will return the best replicate given a simple metric 34 | """ 35 | tot = [] 36 | for ind, el in enumerate(presence): 37 | val = len(el) 38 | pres = [x for j, x in enumerate(presence) if j != ind] 39 | for jnd in range(1, len(pres)+1): 40 | for comb in itertools.combinations(pres, jnd): 41 | ov = el 42 | for knd in range(jnd): 43 | ov = ov & comb[knd] 44 | val += len(ov)*(jnd+1) 45 | tot.append(val) 46 | return np.argsort(tot)[::-1] 47 | 48 | 49 | def mergeReplicatePeaks(peaks, bigwigfolder, markedasbad=None, window=100, 50 | sampling=3000, mincov=4, doPlot=True, cov={}, minKL=8, use='max', 51 | MINOVERLAP=0.3, lookeverywhere=True, only='', saveloc=''): 52 | """ 53 | /!/ should only be passed peaks with at least one good replicate 54 | for each TFpeaksets, 55 | 1. find the replicate that have the most peaks 56 | 2. correlate peaks and get in highest correlation order with the replicate found in 1 57 | 3. find overlap of both and get size of second replicate 58 | 4. if small(er)-> use only to increase statistics 59 | 1. if a lot of uncalled peaks in replicate 2 at replicate 1 peaks (flag for mergebam) 60 | 5. if similar size -> get only intersect 61 | 2. add to intersect, find uncalled peaks in both replicates which are called in the other 62 | 6. repeat for all replicates 63 | ------------------------- 64 | if full overlap of one of the peak replicate, only use the overlapped one to increase confidence on peak 65 | if >80% average non overlap, 66 | print warning and percentage of overlap 67 | 68 | if <20% average non overlap, 69 | take the overlap and increase confidence and avg logfold 70 | 71 | if one is <20%: 72 | if other <40% average non overlap, 73 | take the overlap and increase confidence and avg logfold 74 | else 75 | take 76 | 77 | gets the max cov at the genomic window and if above some threshold, accepts the peak. 78 | 79 | extend peak by X bp if no TSS 80 | remove TSS from peaks 81 | 82 | 83 | create a new data frame containing merged peak size, reassembled peak data (p value etc..) 
and 84 | a the value for presence of each TF listed in previous df 85 | ------------------------------------ 86 | 87 | args: 88 | ---- 89 | peaks: df[bed-like] all the peaks into the sameBam with a column containing the 'name' 90 | being the id of the sample, the 'replicate' number of this sample, the 'tf' chiped here 91 | bamfolder: str, foldername 92 | avgCov: dict(filename:int) a dict where for each bam filename is given an averageCoverage 93 | if use=='max': 94 | window: 95 | mincov: 96 | 97 | if use=='max': 98 | 99 | 100 | returns: 101 | ------- 102 | mergedpeaks: dict{df-peakslike} 103 | bamtomerge: [[bam1,bam2]] 104 | """ 105 | def col_nan_scatter(x, y, **kwargs): 106 | df = pd.DataFrame({'x': x[:], 'y': y[:]}) 107 | df = df[df.sum(0) != 0] 108 | x = df['x'] 109 | y = df['y'] 110 | plt.gca() 111 | plt.scatter(x, y) 112 | 113 | def col_nan_kde_histo(x, **kwargs): 114 | df = pd.DataFrame({'x': x[:]}) 115 | df = df[df['x'] != 0] 116 | x = df['x'] 117 | plt.gca() 118 | sns.kdeplot(x) 119 | print("/!/ should only be passed peaks with at least one good replicate") 120 | # for a df containing a set of peaks in bed format and an additional column of different TF 121 | tfs = list(set(peaks['tf'])) 122 | totpeaknumber = 0 123 | mergedpeaksdict = {} 124 | remove = [] 125 | tomergebam = [] 126 | ratiosofunique = {} 127 | h.createFoldersFor(saveloc) 128 | f = open(saveloc+'results.txt', 'w') 129 | warnings.simplefilter("ignore") 130 | for tf in tfs: 131 | if only and tf != only: 132 | continue 133 | cpeaks = peaks[peaks.tf == tf] 134 | print('_____________________________________________________') 135 | f.write('_____________________________________________________' + '\n') 136 | if len(set(cpeaks['replicate'])) == 1: 137 | if cpeaks.name.tolist()[0] in markedasbad: 138 | print('the only replicate is considered bad!') 139 | f.write('the only replicate is considered bad!'+"\n") 140 | print('wrong TF: '+tf) 141 | f.write('wrong TF: '+tf+"\n") 142 | mergedpeaksdict.update({tf: cpeaks}) 143 | remove.append(tf) 144 | continue 145 | print("we only have one replicate for " + tf + " .. pass") 146 | f.write("we only have one replicate for " + tf + " .. pass"+"\n") 147 | mergedpeaksdict.update({tf: cpeaks}) 148 | continue 149 | print("merging " + tf + " peaks") 150 | f.write("merging " + tf + " peaks"+"\n") 151 | merged = simpleMergePeaks(cpeaks, window=window, maxp=False) 152 | merged_bed = merged[merged.columns[8:]] 153 | finalpeaks = merged[merged.columns[:8]] 154 | print('--> finish first overlaps lookup') 155 | f.write('--> finish first overlaps lookup'+"\n") 156 | # flag when biggest is <1000 peaks 157 | if len(finalpeaks) < 1000: 158 | print('!TF has less than 1000 PEAKS!') 159 | f.write('!TF has less than 1000 PEAKS!'+"\n") 160 | # for each TF (replicates), compute number of peaks 161 | peakmatrix = merged_bed.values.astype(bool) 162 | 163 | presence = [] 164 | for peakpres in peakmatrix.T: # https://github.com/tctianchi/pyvenn 165 | presence.append(set([i for i, val in enumerate(peakpres) if val == 1])) 166 | # compute overlap matrix (venn?) 
167 | if peakmatrix.shape[1] < 7 and doPlot: 168 | plot.venn(presence, [i+'_BAD' if i.split('-')[0] 169 | in markedasbad else i for i in merged_bed.columns], title=tf+"_before_venn", folder=saveloc) 170 | plt.show() 171 | else: 172 | print('too many replicates for Venn: '+str(peakmatrix.shape[1])) 173 | f.write('too many replicates for Venn: '+str(peakmatrix.shape[1])+"\n") 174 | if doPlot: 175 | fig = sns.pairplot(merged_bed, corner=True, diag_kind="kde", 176 | kind="reg", plot_kws={"scatter_kws": {"alpha": .05}}) 177 | #fig = fig.map_upper(col_nan_scatter) 178 | #fig = fig.map_upper(col_nan_kde_histo) 179 | plt.suptitle("correlation of peaks in each replicate", y=1.08) 180 | if saveloc: 181 | fig.savefig(saveloc+tf+"_before_pairplot.pdf") 182 | plt.show() 183 | for i, val in enumerate(merged_bed): 184 | unique_inval = np.logical_and( 185 | np.delete(peakmatrix, i, axis=1).sum(1).astype(bool) == 0, peakmatrix[:, i]) 186 | sns.kdeplot(merged_bed[val][unique_inval], legend=True).set(xlim=(0, None)) 187 | plt.title("distribution of unique peaks in each replicate") 188 | if saveloc: 189 | plt.savefig(saveloc+tf+"_before_unique_kdeplot.pdf") 190 | plt.show() 191 | 192 | bigwigs = os.listdir(bigwigfolder) 193 | 194 | foundgood = False 195 | sort = findBestPeak(presence) 196 | for ib, sb in enumerate(sort): 197 | if merged_bed.columns[sb].split('-')[0] not in markedasbad: 198 | foundgood = True 199 | break 200 | if not foundgood: 201 | print('no peaks were good enough quality') 202 | f.write('no peaks were good enough quality'+"\n") 203 | print('bad TF: '+tf) 204 | f.write('bad TF: '+tf+"\n") 205 | remove.append(tf) 206 | ib = 0 207 | # distplot 208 | # correlation plot 209 | 210 | biggest_ind = sort[ib] 211 | peakmatrix = peakmatrix.T 212 | biggest = merged_bed.columns[biggest_ind] 213 | print('-> main rep is: '+str(biggest)) 214 | f.write('-> main rep is: '+str(biggest)+'\n') 215 | tot = peakmatrix[biggest_ind].copy().astype(int) 216 | # starts with highest similarity and go descending 217 | j = 0 218 | recovered = 0 219 | additionalpeaksinbig = np.array([]) 220 | for i, val in enumerate(sort): 221 | if i == ib: 222 | continue 223 | j += 1 224 | # if avg non overlap > 60%, and first, and none small flag TF as unreliable. 
225 | overlap = len(presence[biggest_ind] & presence[val] 226 | ) / len(presence[biggest_ind]) 227 | peakname = merged_bed.columns[val] 228 | print('- '+peakname) 229 | f.write('- '+peakname+'\n') 230 | print(' overlap: ' + str(overlap*100)+"%") 231 | f.write(' overlap: ' + str(overlap*100)+"%"+'\n') 232 | if overlap < MINOVERLAP: 233 | smallsupport = len(presence[biggest_ind] & 234 | presence[val]) / len(presence[val]) 235 | print(' --> not enough overlap') 236 | f.write(' --> not enough overlap'+'\n') 237 | if smallsupport < MINOVERLAP: 238 | # if the secondary does not have itself the required support 239 | if j == 1 and merged_bed.columns[val].split('-')[0] not in markedasbad: 240 | print(" Wrong TF: "+tf) 241 | f.write(" Wrong TF: "+tf+'\n') 242 | remove.append(tf) 243 | break 244 | # if not first, throw the other replicate and continue 245 | print(" not using this replicate from the peakmatrix") 246 | f.write(" not using this replicate from the peakmatrix"+'\n') 247 | continue 248 | if lookeverywhere: 249 | tolookfor = peakmatrix[val] == 0 250 | else: 251 | tolookfor = np.logical_and(peakmatrix[biggest_ind], peakmatrix[val] == 0) 252 | # ones that we have in the Primary but not in the secondary 253 | additionalpeaksinsec = findAdditionalPeaks(finalpeaks, tolookfor, bigwigfolder + findpeakpath( 254 | bigwigfolder, peakname), sampling=sampling, mincov=mincov, window=window, minKL=minKL, use=use) 255 | if len(additionalpeaksinsec[additionalpeaksinsec > 0]) > 0: 256 | sns.kdeplot(additionalpeaksinsec[additionalpeaksinsec > 0], 257 | label=peakname, legend=True).set(xlim=(0, None)) 258 | print(' min,max from newly found peaks: ' + 259 | str((additionalpeaksinsec[additionalpeaksinsec > 0].min(), additionalpeaksinsec[additionalpeaksinsec > 0].max()))) 260 | f.write(' min,max from newly found peaks: '+str((additionalpeaksinsec[additionalpeaksinsec > 0].min( 261 | ), additionalpeaksinsec[additionalpeaksinsec > 0].max()))+'\n') 262 | # for testing purposes mainly 263 | finalpeaks[additionalpeaksinsec.astype(bool)].to_csv( 264 | 'additionalpeaksinsec_mp'+merged_bed.columns[val]+'.bed', sep='\t', index=None, header=False) 265 | peakmatrix[val] = np.logical_or( 266 | peakmatrix[val], additionalpeaksinsec.astype(bool)) 267 | overlap = np.sum(np.logical_and( 268 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[biggest_ind]) 269 | if overlap < MINOVERLAP: 270 | newsmalloverlap = np.sum(np.logical_and( 271 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[val]) 272 | print(" we did not had enough initial overlap.") 273 | f.write(" we did not had enough initial overlap."+'\n') 274 | if newsmalloverlap < MINOVERLAP: 275 | if merged_bed.columns[val].split('-')[0] in markedasbad: 276 | print(' replicate ' + 277 | merged_bed.columns[val] + ' was too bad and had not enough overlap') 278 | f.write(' replicate ' + 279 | merged_bed.columns[val] + ' was too bad and had not enough overlap'+'\n') 280 | continue 281 | elif h.askif("we have two good quality peaks that don't merge well at all: "+merged_bed.columns[val] + 282 | " and " + merged_bed.columns[biggest_ind] + " can the first one be removed?:\n \ 283 | overlap: "+str(overlap*100)+'%\n smalloverlap: '+str(smalloverlap*100)+'%\n new smalloverlap: '+str(newsmalloverlap*100)+"%"): 284 | continue 285 | else: 286 | print(" enough from small overlaps") 287 | f.write(" enough from small overlaps"+'\n') 288 | print(' --> enough overlap') 289 | f.write(' --> enough overlap'+'\n') 290 | recovered += 
np.sum(additionalpeaksinsec.astype(bool)) 291 | if merged_bed.columns[val].split('-')[0] not in markedasbad: 292 | tot += peakmatrix[val].astype(int) 293 | # ones that we have in the Primary but not in the secondary 294 | if not lookeverywhere or len(additionalpeaksinbig) == 0: 295 | tolookfor = peakmatrix[biggest_ind] == 0 if lookeverywhere else np.logical_and( 296 | peakmatrix[biggest_ind] == 0, peakmatrix[val]) 297 | additionalpeaksinbig = findAdditionalPeaks(finalpeaks, tolookfor, bigwigfolder + findpeakpath( 298 | bigwigfolder, biggest), sampling=sampling, mincov=mincov, window=window, minKL=minKL, use=use) 299 | if len(additionalpeaksinbig[additionalpeaksinbig > 0]) > 0: 300 | sns.kdeplot(additionalpeaksinbig[additionalpeaksinbig > 0], 301 | label=biggest, legend=True).set(xlim=(0, None)) 302 | print(' min,max from newly found peaks: ' + 303 | str((additionalpeaksinbig[additionalpeaksinbig > 0].min(), additionalpeaksinbig[additionalpeaksinbig > 0].max()))) 304 | f.write(' min,max from newly found peaks: '+str((additionalpeaksinbig[additionalpeaksinbig > 0].min( 305 | ), additionalpeaksinbig[additionalpeaksinbig > 0].max()))+'\n') 306 | 307 | peakmatrix[biggest_ind] = np.logical_or( 308 | peakmatrix[biggest_ind], additionalpeaksinbig) 309 | tot += additionalpeaksinbig.astype(bool).astype(int) 310 | recovered += np.sum(additionalpeaksinbig.astype(bool)) 311 | print(' we have recovered ' + str(recovered)+' peaks, equal to ' + str(100*recovered/np.sum(peakmatrix[biggest_ind])) + 312 | '% of the peaks in main replicate') 313 | f.write(' we have recovered ' + str(recovered)+' peaks, equal to ' + str(100*recovered/np.sum(peakmatrix[biggest_ind])) + 314 | '% of the peaks in main replicate'+'\n') 315 | if overlap < (MINOVERLAP+0.2)/1.2: 316 | # we recompute to see if the overlap changed 317 | newoverlap = np.sum(np.logical_and( 318 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[biggest_ind]) 319 | smalloverlap = np.sum(np.logical_and( 320 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[val]) 321 | if newoverlap < (MINOVERLAP+0.2)/1.2: 322 | if smalloverlap < (2+MINOVERLAP)/3: 323 | print(" not enough overlap to advice to merge the bams.\n oldnew overlap: "+str(overlap*100)+'%\n \ 324 | new overlap: '+str(newoverlap*100)+"%") 325 | f.write(" not enough overlap to advice to merge the bams.\n oldnew overlap: "+str(overlap*100)+'%\n \ 326 | new overlap: '+str(newoverlap*100)+"%"+'\n') 327 | continue 328 | else: 329 | print(' enough from small overlap to advice to merge the peaks') 330 | f.write(' enough from small overlap to advice to merge the peaks'+'\n') 331 | tomergebam.append([biggest, peakname]) 332 | #the quality is good enough in the end we can pop from the list if it exists 333 | if tf in remove: 334 | remove.remove(tf) 335 | plt.title('distribution of new found peaks') 336 | if saveloc: 337 | plt.savefig(saveloc+tf+"_new_found_peaks_kdeplot.pdf") 338 | plt.show() 339 | # new distplot 340 | # new correlation plot 341 | ratiosofunique[tf] = len(np.argwhere( 342 | peakmatrix.sum(0) == 1))/peakmatrix.shape[1] 343 | if doPlot: 344 | sns.pairplot(merged_bed, corner=True, diag_kind="kde", 345 | kind="reg", plot_kws={"scatter_kws": {"alpha": .05}}) 346 | #fig = fig.map_upper(col_nan_scatter) 347 | #fig = fig.map_upper(col_nan_kde_histo) 348 | plt.suptitle("correlation and distribution of peaks after recovery", y=1.08) 349 | if saveloc: 350 | plt.savefig(saveloc+tf+"_after_pairplot.pdf") 351 | plt.show() 352 | for i, val in enumerate(merged_bed): 353 | unique_inval 
= np.logical_and( 354 | np.delete(peakmatrix, i, axis=0).sum(0).astype(bool) == 0, peakmatrix[i]) 355 | sns.kdeplot(merged_bed[val][unique_inval], legend=True).set(xlim=(0, None)) 356 | plt.title("distribution of unique peaks in each replicate after recovery") 357 | if saveloc: 358 | plt.savefig(saveloc+tf+"_after_unique_kdeplot.pdf") 359 | plt.show() 360 | if len(peakmatrix.shape) > 1 and doPlot: 361 | if peakmatrix.shape[0] < 7: 362 | presence = [] 363 | for peakpres in peakmatrix: # https://github.com/tctianchi/pyvenn 364 | presence.append(set([i for i, val in enumerate(peakpres) if val == 1])) 365 | title = tf + '_recovered (TOREMOVE)' if tf in remove else tf+'_recovered' 366 | plot.venn(presence, [i+'_BAD' if i.split('-')[0] 367 | in markedasbad else i for i in merged_bed.columns], title=title, folder=saveloc) 368 | plt.show() 369 | else: 370 | print('too many replicates for Venn') 371 | f.write('(too many replicates for Venn)'+'\n') 372 | finalpeaks = finalpeaks[np.logical_or(tot > 1, peakmatrix[biggest_ind])] 373 | finalpeaks['name'] = biggest 374 | finalpeaks['tf'] = tf 375 | mergedpeaksdict.update({tf: finalpeaks}) 376 | print(str((tf, len(finalpeaks)))) 377 | f.write(str((tf, len(finalpeaks)))+'\n') 378 | mergedpeak = pd.concat( 379 | [peaks for _, peaks in mergedpeaksdict.items()]).reset_index(drop=True) 380 | if doPlot: 381 | df = pd.DataFrame(data=ratiosofunique, index=['percentage of unique']) 382 | df['proteins'] = df.index 383 | fig = sns.barplot(data=df) 384 | plt.xticks(rotation=60, ha='right') 385 | plt.title("ratios of unique in replicates across experiments") 386 | if saveloc: 387 | plt.savefig(saveloc+"All_ratios_unique.pdf") 388 | plt.show() 389 | f.close() 390 | mergedpeak['name'] = mergedpeak.tf 391 | return mergedpeak, tomergebam, remove, ratiosofunique 392 | 393 | 394 | def findAdditionalPeaks(peaks, tolookfor, filepath, sampling=1000, mincov=4, 395 | window=100, cov={}, minKL=8, use='max'): 396 | 397 | """ 398 | findAdditionalPeaks: for all peaks in A and/or B find in coverage file if zone has relative cov 399 | of more than thresh then add to peak 400 | if B is small and > 20% of peaks in A are found back, increase confidence and 401 | flag for mergeBams 402 | if < 20% don't flag for merge bam 403 | f B is big and now mean non overlap < 40%, take union and flag for mergeBam else, throw B. 
404 | 405 | Args: 406 | ----- 407 | peaks 408 | tolookfor 409 | filepath 410 | sampling 411 | mincov 412 | window 413 | cov 414 | minKL 415 | use 416 | returns: 417 | ------- 418 | np.array(bool) for each peaks in peakset, returns a binary 419 | """ 420 | # def poisson(k, lamb, scale): return scale * (lamb**k / factorial(k)) * np.exp(-lamb) 421 | 422 | def KLpoisson(lamb1, lamb2): return lamb1 * \ 423 | np.log(lamb1 / lamb2) + lamb2 - lamb1 424 | 425 | def poisson(k, lamb): return (lamb**k/factorial(k)) * np.exp(-lamb) 426 | 427 | def negLogLikelihood(params, data): return - \ 428 | np.sum(np.log(poisson(data, params[0]))) 429 | 430 | def poissonFit(data): return float( 431 | minimize(negLogLikelihood, x0=np.ones(1), args=(data,), method='Powell').x) 432 | bw = pyBigWig.open(filepath) 433 | res = np.zeros(len(peaks)) 434 | prevchrom = '' 435 | lamb = {} 436 | cov = {} 437 | #ignore by message 438 | warnings.filterwarnings("ignore", message="encountered in") 439 | for i, has in enumerate(tolookfor): 440 | if has: 441 | val = peaks.iloc[i] 442 | if val.chrom not in chroms: 443 | continue 444 | if val.chrom != prevchrom: 445 | if val.chrom not in cov: 446 | cov[val.chrom] = bw.stats(str(val.chrom))[0] 447 | prevchrom = val.chrom 448 | if use == 'poisson': 449 | #TODO: compute on INPUT file instead 450 | samples = np.zeros(window * sampling) 451 | sam = np.random.rand(sampling) 452 | sam = sam * (bw.chroms(str(val.chrom))-window) 453 | for j, sample in enumerate(sam.astype(int)): 454 | samples[j*window:(j + 1)*window] = np.nan_to_num( 455 | bw.values(str(val.chrom), sample, sample + window), 0) 456 | scale = np.unique(samples)[1] 457 | samples = (samples/scale).astype(int) 458 | lamb[val.chrom] = (poissonFit(samples), scale) 459 | 460 | start = max([val.start - window, 0]) 461 | end = min(val.end + window, bw.chroms(str(val.chrom))) 462 | zone = np.nan_to_num(bw.values(str(val.chrom), start, end), 0) 463 | if use == 'max': 464 | if max(zone) / cov[val.chrom] > mincov*1.5 or sum(zone) / (cov[val.chrom] * (end - start)) > mincov: 465 | res[i] = max(zone) / cov[val.chrom] 466 | elif use == 'poisson': 467 | #TODO: compute -log10pvalue 468 | la = poissonFit((zone/lamb[val.chrom][1]).astype(int)) 469 | kl = KLpoisson(la, lamb[val.chrom][0]) 470 | if kl > minKL: 471 | res[i] = max(zone) / cov[val.chrom] # foldchange from macs3 472 | 473 | return res 474 | -------------------------------------------------------------------------------- /genepy/epigenetics/README.md: -------------------------------------------------------------------------------- 1 | # epigenomics 2 | 3 | Especially targeted to functions related to the analysis of epigenomics data. It has functions to read, merge, denoise, ChIP seq data. 4 | 5 | ## Available functions: 6 | 7 | ### chipseq.py 8 | 9 | - bigWigFrom: run the bigwig command line for a set of bam files in a folder 10 | - ReadRoseSuperEnhancers: reads ROSE2's output and returns its superenhancer bedfile as a pd dataframe. 11 | - loadPeaks: loads 1 to many peak bedfile into one pandas dataframe. 12 | - simpleMergePeaks: simply merges bedfiles from peak callers. 
providing a concatenated dataframe of bed-like tables 13 | - putInBed: given a consensus bed-like dataframe and another one, will merge the second one into the first 14 | - pairwiseOverlap: compute pairwise overlap, and correlation on this overlap, for a set of peaks mapped to a consensus 15 | - enrichment: compute pairwise enrichment and correlation for a set of peaks mapped to a consensus 16 | - fullDiffPeak: will use macs3 to call differential peak binding from two bam files and their control 17 | - diffPeak: calls MACS2 bdgdiff given some parameters 18 | - MakeSuperEnhancers: Calls super enhancers from H3K27ac with the ROSE algorithm 19 | - runChromHMM: runs the chromHMM algorithm 20 | - loadMEMEmotifs: loads motifs from the output file of MEME after running fimo. 21 | - simpleMergeMotifs: aggregates motifs that overlap into one motif file 22 | - substractPeaksTo: removes all peaks that are not within a given bp distance of a set of loci 23 | 24 | ### CREME.py 25 | 26 | The goal of CREME is to be a simple, one-function tool. It works with one to many sets of replicates for each pulled-down protein/mark. 27 | 28 | CREME takes as input a pandas dataframe. This dataframe is the concatenation of each replicate's bed files and can be loaded from a set of MACS2 bedfiles using genepy's loadPeaks function (see the usage sketch below). 29 | 30 | CREME will output, among other things, a dataframe representing a concatenation of bedfiles of merged replicates. 31 | 32 | Find out more in __CREME.md__. 33 | 34 | ## highly recommended packages 35 | 36 | *This package won't contain anything that overlaps with those, and it might use those packages for what it is doing.* 37 | - Bedtools 38 | - deepTools 39 | - MACS2 40 | - ROSE 41 | - MEME 42 | - ChromHMM 43 | -------------------------------------------------------------------------------- /genepy/epigenetics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/__init__.py -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_before_pairplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_before_pairplot.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_before_venn_venn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_before_venn_venn.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_new_found_peaks_kdeplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_new_found_peaks_kdeplot.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/igv-app-MED1-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/igv-app-MED1-zoom.png
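To make the README's description of CREME concrete, here is a minimal, hedged usage sketch. The `mergeReplicatePeaks` signature and its four return values are taken from `CREME.py` in this repository; the exact arguments of `loadPeaks`, the folder paths, and the sample names are assumptions for illustration only.

```python
# Hypothetical usage sketch of the CREME workflow (paths, sample names and loadPeaks arguments are assumed).
from genepy.epigenetics import chipseq as chip
from genepy.epigenetics.CREME import mergeReplicatePeaks

# Assumption: loadPeaks concatenates MACS2 bed files into one dataframe carrying,
# at minimum, bed-like columns plus 'name', 'replicate' and 'tf' annotations.
peaks = chip.loadPeaks("results/macs2/")  # exact signature depends on your MACS2 output layout

merged, tomergebam, removed, ratios = mergeReplicatePeaks(
    peaks,
    bigwigfolder="data/bigwigs/",   # folder containing one bigwig per sample
    markedasbad=["MED1_rep2"],      # manually flagged low-quality samples (naming is illustrative)
    saveloc="results/creme/",       # where plots and the results.txt log are written
)
merged.to_csv("results/creme/merged_peaks.bed", sep="\t", index=False)
```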
-------------------------------------------------------------------------------- /genepy/epigenetics/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | from genepy.epigenetics import chipseq as chip 7 | from genepy.utils import helper as h 8 | 9 | def plotAverageOfSamples(samples, folder="", showAll=False, maxv=None, minv=None): 10 | res = [] 11 | plt.figure() 12 | plt.ylim(minv,maxv) 13 | for sample in samples: 14 | data = pd.read_csv(sample, sep='\t', skiprows=1, header=None, names=['chr', 'start', 'end', 'name', "foldchange","."]+list(range(600))) 15 | r = data[list(range(600))].mean().tolist() 16 | res.append(r) 17 | if showAll: 18 | sns.lineplot(data=np.array(r), color="#BFBFFF") 19 | sns.lineplot(data=np.array(res).mean(0)) 20 | if folder: 21 | plt.savefig(folder+"_averageofsamples.pdf", color="#1F1FFF") 22 | return res 23 | 24 | 25 | def pysam_getPeaksAt(peaks, bams, folder='data/seqs/', window=1000, numpeaks=1000, numthreads=8): 26 | 27 | # get pysam data 28 | # ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 29 | # for each counts, do a rolling average (or a convolving of the data) with numpy 30 | # append to an array 31 | # return array, normalized 32 | loaded = {} 33 | res = {i: np.zeros((len(peaks), window * 2)) for i in bams} 34 | peaks = peaks.sort_values(by="foldchange", ascending=False).iloc[:numpeaks] 35 | peaks.chrom = peaks.chrom.astype(str) 36 | for val in bams: 37 | loaded.update({val: pysam.AlignmentFile( 38 | folder + val, 'rb', threads=numthreads)}) 39 | for k, bam in loaded.items(): 40 | for num, (i, val) in enumerate(peaks.iterrows()): 41 | print(int(num / len(peaks)), end='\r') 42 | center = int((val['start'] + val['end']) / 2) 43 | for pileupcolumn in bam.pileup(val['chrom'], start=center - window, 44 | stop=center + window, truncate=True): 45 | res[k][num][pileupcolumn.pos - (center - window)] = pileupcolumn.n 46 | fig, ax = plt.subplots(1, len(res)) 47 | for i, (k, val) in enumerate(res.items()): 48 | sns.heatmap(val, ax=ax[i]) 49 | ax[i].set_title(k.split('.')[0]) 50 | fig.show() 51 | return res, fig 52 | 53 | 54 | def bedtools_getPeaksAt(peaks, bams, folder='data/seqs/', window=1000, numpeaks=1000, numthreads=8): 55 | """ 56 | get pysam data 57 | ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 58 | for each counts, do a rolling average (or a convolving of the data) with numpy 59 | append to an array 60 | return array, normalized 61 | """ 62 | loaded = {} 63 | center = [int((val['start'] + val['end']) / 2) for k, val in peaks.iterrows()] 64 | peaks['start'] = [c - window for c in center] 65 | peaks['end'] = [c + window - 1 for c in center] 66 | peaks[peaks.columns[:3]].sort_values(by=['chrom', 'start']).to_csv( 67 | 'temp/peaks.bed', sep='\t', index=False, header=False) 68 | bedpeaks = BedTool('temp/peaks.bed') 69 | 70 | fig, ax = plt.subplots(1, len(bams)) 71 | peakset = peaks["foldchange"].values.argsort()[::-1][:numpeaks] 72 | for i, val in enumerate(bams): 73 | coverage = BedTool(folder + val).intersect(bedpeaks).genome_coverage(bga=True, split=True)\ 74 | .intersect(bedpeaks).to_dataframe(names=['chrom', 'start', 'end', 'coverage']) 75 | cov = np.zeros((len(peaks), window * 2), dtype=int) 76 | j = 0 77 | pdb.set_trace() 78 | for i, (k, val) in enumerate(peaks.iterrows()): 79 | 
print(i / len(peaks), end='\r') 80 | while coverage.iloc[j].start > val.start: 81 | j -= 1 82 | while coverage.iloc[j].start < val.end: 83 | cov[i][coverage.iloc[j].start - val.start:coverage.iloc[j].end - val.start] =\ 84 | coverage.iloc[j].coverage 85 | j += 1 86 | sns.heatmap(coverage, ax=ax[i]) 87 | ax[i].set_title(val.split('.')[0]) 88 | fig.show() 89 | return None, fig 90 | 91 | 92 | def makeProfiles(matx=[], folder='', matnames=[], title='', 93 | name='temp/peaksat.pdf', refpoint="TSS", scale=None, 94 | sort=False, withDeeptools=True, cluster=1, vmax=None, vmin=None, overlap=False, 95 | legendLoc=None): 96 | if withDeeptools: 97 | if not (len(matnames) == 2 and len(matx) == 2): 98 | raise ValueError('you need two mat.gz files and two names') 99 | h.createFoldersFor(name) 100 | cmd = 'computeMatrixOperations relabel -m ' 101 | cmd += matx[0] + ' -o '+matx[0]+' --groupLabels '+matnames[0] 102 | cmd += ' && computeMatrixOperations relabel -m ' 103 | cmd += matx[1] + ' -o '+matx[1]+' --groupLabels '+matnames[1] 104 | cmd += ' && computeMatrixOperations rbind -m ' 105 | cmd += matx[0] + ' ' + matx[1] + " -o " + \ 106 | '.'.join(name.split('.')[:-1]) + ".gz" 107 | cmd += ' && plotProfile' 108 | cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz" 109 | cmd += " --outFileName " + name 110 | cmd += " --refPointLabel " + refpoint 111 | if vmax is not None: 112 | cmd += " -max "+str(vmax) 113 | if vmin is not None: 114 | cmd += " -min "+str(vmin) 115 | if cluster > 1: 116 | cmd += " --perGroup --kmeans "+str(cluster) 117 | if legendLoc: 118 | cmd += " --legendLocation "+legendLoc 119 | if title: 120 | cmd += " --plotTitle " + title 121 | data = subprocess.run(cmd, shell=True, capture_output=True) 122 | print(data) 123 | 124 | 125 | def getPeaksAt(peaks, bigwigs, folder='', bigwignames=[], peaknames=[], window=1000, title='', numpeaks=4000, numthreads=8, 126 | width=5, length=10, torecompute=False, name='temp/peaksat.pdf', refpoint="TSS", scale=None, 127 | sort=False, withDeeptools=True, onlyProfile=False, cluster=1, vmax=None, vmin=None, overlap=False, 128 | legendLoc=None): 129 | """ 130 | get pysam data 131 | ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 132 | for each counts, do a rolling average (or a convolving of the data) with numpy 133 | append to an array 134 | return array, normalized 135 | """ 136 | if withDeeptools: 137 | if isinstance(peaks, pd.DataFrame): 138 | peaks = 'peaks.bed ' 139 | peaks.to_csv('peaks.bed', sep='\t', index=False, header=False) 140 | elif type(peaks) == list: 141 | pe = '' 142 | i = 0 143 | for n, p in enumerate(peaks): 144 | if 20 < int(os.popen('wc -l ' + p).read().split(' ')[0]): 145 | pe += p + ' ' 146 | elif len(peaknames) > 0: 147 | peaknames.pop(n-i) 148 | i += 1 149 | peaks = pe 150 | elif type(peaks) == str: 151 | peaks += ' ' 152 | else: 153 | raise ValueError(' we dont know this filetype') 154 | if type(bigwigs) is list: 155 | pe = '' 156 | for val in bigwigs: 157 | pe += folder + val + ' ' 158 | bigwigs = pe 159 | else: 160 | bigwigs = folder + bigwigs + ' ' 161 | h.createFoldersFor(name) 162 | cmd = '' 163 | if not os.path.exists('.'.join(name.split('.')[:-1]) + ".gz") or torecompute: 164 | cmd += "computeMatrix reference-point -S " 165 | cmd += bigwigs 166 | cmd += " --referencePoint "+refpoint 167 | cmd += " --regionsFileName " + peaks 168 | cmd += " --missingDataAsZero" 169 | cmd += " --outFileName " + '.'.join(name.split('.')[:-1]) + ".gz" 170 | cmd += " --upstream " + str(window) 
+ " --downstream " + str(window) 171 | cmd += " --numberOfProcessors " + str(numthreads) + ' && ' 172 | cmd += "plotHeatmap" if not onlyProfile else 'plotProfile' 173 | if type(name) is list: 174 | if not onlyProfile: 175 | raise ValueError('needs to be set to True, can\'t average heatmaps') 176 | cmd += " --matrixFile " + '.gz '.join(name) + ".gz" 177 | if average: 178 | cmd += "--averageType mean" 179 | else: 180 | cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz" 181 | cmd += " --outFileName " + name 182 | cmd += " --refPointLabel " + refpoint 183 | if vmax is not None: 184 | cmd += " -max "+str(vmax) 185 | if vmin is not None: 186 | cmd += " -min "+str(vmin) 187 | if cluster > 1: 188 | cmd += " --perGroup --kmeans "+str(cluster) 189 | if overlap: 190 | if onlyProfile: 191 | cmd += " --plotType overlapped_lines" 192 | else: 193 | raise ValueError("overlap only works when onlyProfile is set") 194 | if legendLoc: 195 | cmd += " --legendLocation "+legendLoc 196 | 197 | if len(peaknames) > 0: 198 | pe = '' 199 | for i in peaknames: 200 | pe += ' ' + i 201 | cmd += " --regionsLabel" + pe 202 | if type(bigwigs) is list: 203 | if len(bigwignames) > 0: 204 | pe = '' 205 | for i in bigwignames: 206 | pe += ' "' + i + '"' 207 | cmd += " --samplesLabel" + pe 208 | if title: 209 | cmd += " --plotTitle '"+title+"'" 210 | data = subprocess.run(cmd, shell=True, capture_output=True) 211 | print(data) 212 | else: 213 | if 'relative_summit_pos' in peaks.columns: 214 | center = [int((val['start'] + val['relative_summit_pos'])) 215 | for k, val in peaks.iterrows()] 216 | else: 217 | center = [int((val['start'] + val['end']) / 2) 218 | for k, val in peaks.iterrows()] 219 | pd.set_option('mode.chained_assignment', None) 220 | peaks['start'] = [c - window for c in center] 221 | peaks['end'] = [c + window for c in center] 222 | fig, ax = plt.subplots(1, len(bigwigs), figsize=[ 223 | width, length], title=title if title else 'Chip Heatmap') 224 | if sort: 225 | peaks = peaks.sort_values(by=["foldchange"], ascending=False) 226 | if numpeaks > len(peaks): 227 | numpeaks = len(peaks) - 1 228 | cov = {} 229 | maxs = [] 230 | for num, bigwig in enumerate(bigwigs): 231 | bw = pyBigWig.open(folder + bigwig) 232 | co = np.zeros((numpeaks, window * 2), dtype=int) 233 | scale = scale[bigwig] if scale is dict else 1 234 | for i, (k, val) in enumerate(peaks.iloc[:numpeaks].iterrows()): 235 | try: 236 | co[i] = np.nan_to_num(bw.values(str(val.chrom), val.start, val.end), 0) 237 | except RuntimeError as e: 238 | print(str(val.chrom), val.start, val.end) 239 | pass 240 | cov[bigwig] = co 241 | maxs.append(co.max()) 242 | for num, bigwig in enumerate(bigwigs): 243 | sns.heatmap(cov[bigwig] * scale, ax=ax[num], vmax=max(maxs), yticklabels=[], cmap=cmaps[num], 244 | cbar=True) 245 | ax[num].set_title(bigwig.split('.')[0]) 246 | fig.subplots_adjust(wspace=0.1) 247 | fig.show() 248 | fig.savefig(name) 249 | return cov, fig 250 | 251 | 252 | def andrew(groups, merged, annot, enr=None, pvals=None, cols=8, precise=True, title = "sorted clustermap of cobindings clustered", folder="", rangeval=4, okpval=10**-3, size=(20,15),vmax=3, vmin=0): 253 | if enr is None or pvals is None: 254 | enr, pvals = chip.enrichment(merged, groups=groups) 255 | rand = np.random.choice(merged.index,5000) 256 | subgroups = groups[rand] 257 | sorting = np.argsort(subgroups) 258 | redblue = cm.get_cmap('RdBu_r',256) 259 | subenr = enr.iloc[annot-cols:] 260 | subenr[subenr>rangeval]=rangeval 261 | subenr[subenr<-rangeval]=-rangeval 262 | subenr 
= subenr/rangeval 263 | data = [] 264 | #colors = [] 265 | impv = pvals.values 266 | for i in subgroups[sorting]: 267 | #colors.append(viridis(i)) 268 | a = redblue((128+(subenr[i]*128)).astype(int)).tolist() 269 | for j in range(len(a)): 270 | a[j] = [1.,1.,1.,1.] if impv[j,i] > okpval else a[j] 271 | data.append(a) 272 | data = pd.DataFrame(data=data,columns=list(subenr.index),index= rand[sorting]) 273 | #data["clusters"] = colors 274 | 275 | a = np.log2(1.01+merged[merged.columns[cols:annot]].iloc[rand].iloc[sorting].T) 276 | if not precise: 277 | for i in set(groups): 278 | e = a[a.columns[subgroups[sorting]==i]].mean(1) 279 | e = pd.DataFrame([e for i in range((subgroups[sorting]==i).sum())]).T 280 | a[a.columns[subgroups[sorting]==i]] = e 281 | 282 | fig = sns.clustermap(a, vmin=vmin, vmax=vmax, figsize=size, z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False) 283 | fig.ax_col_dendrogram.set_visible(False) 284 | fig.fig.suptitle(title) 285 | fig.savefig(folder + str(len(set(groups))) + '_clustermap_cobinding_enrichment_andrewplot.pdf') 286 | plt.show() 287 | -------------------------------------------------------------------------------- /genepy/google/README.md: -------------------------------------------------------------------------------- 1 | # google 2 | 3 | ## contains 4 | 5 | _in ./gcp.py_ 6 | 7 | - mvFiles: move files to a folder. 8 | - lsFiles: list all files. 9 | - cpFiles: copy many files to a foler. 10 | - catFiles: get data in many files. 11 | - rmFiles: remove many files. 12 | - recoverFiles: if bucket has versioning enabled, retrieve list of files that have been deleted. 13 | - patternRN: following a renaminig dict, rename a bunch of files in a set of locations. 14 | - get_all_sizes: get file sizes in a folder. 15 | - exists: given a list of file paths, get if files exist or not. 16 | - extractSize: extract file size from ls command. 17 | - extractPath: extract file path from ls command. 18 | - extractHash: extract file hash from ls command. 19 | 20 | _in google\_sheet.py_ 21 | 22 | - dfToSheet: uploads a given dataframe to a given googleSheet location 23 | 24 | GSheet (class) *WIP* 25 | - get_last_modified_date 26 | - get_size 27 | - read_sheet 28 | - read_row 29 | - read_column 30 | - write_column 31 | 32 | # highly recommended: 33 | 34 | - gsutil 35 | - pygsheet -------------------------------------------------------------------------------- /genepy/google/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/google/__init__.py -------------------------------------------------------------------------------- /genepy/google/gcp.py: -------------------------------------------------------------------------------- 1 | # GCPFunction.py 2 | 3 | from google.cloud import storage 4 | import os 5 | import subprocess 6 | import re 7 | from genepy.utils import helper as h 8 | import signal 9 | 10 | 11 | def list_blobs_with_prefix(bucket_name, prefix, delimiter=None): 12 | """Lists all the blobs in the bucket that begin with the prefix. 13 | 14 | This can be used to list all blobs in a "folder", e.g. "public/". 15 | 16 | The delimiter argument can be used to restrict the results to only the 17 | "files" in the given "folder". Without the delimiter, the entire tree under 18 | the prefix is returned. 
For example, given these blobs: 19 | 20 | /a/1.txt 21 | /a/b/2.txt 22 | 23 | If you just specify prefix = '/a', you'll get back: 24 | 25 | /a/1.txt 26 | /a/b/2.txt 27 | 28 | However, if you specify prefix='/a' and delimiter='/', you'll get back: 29 | 30 | /a/1.txt 31 | 32 | """ 33 | storage_client = storage.Client() 34 | bucket = storage_client.get_bucket(bucket_name) 35 | ret = [] 36 | blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter) 37 | for blob in blobs: 38 | ret.append(blob.name) 39 | return ret 40 | 41 | 42 | def mvFiles(files, location, group=50, listen_to_errors=False): 43 | """ 44 | move a set of files in parallel (when the set is huge) 45 | 46 | Args: 47 | ---- 48 | files: gs paths 49 | location: to move the files to 50 | group: files to do in parallel 51 | """ 52 | by = len(files) if len(files) < group else group 53 | for sfiles in h.grouped(files, by): 54 | a = "" 55 | for val in sfiles: 56 | a += val + " " 57 | code = os.system("gsutil -m mv " + a + location) 58 | if code != 0 and listen_to_errors: 59 | print("pressed ctrl+c or command failed") 60 | break 61 | 62 | 63 | def lsFiles(files, add="", group=50): 64 | """ 65 | list a set of files in parallel (when the set is huge) 66 | 67 | Args: 68 | ---- 69 | files: gs paths 70 | add: additional params to add 71 | group: files to do in parallel 72 | """ 73 | print("listing files in gs") 74 | by = len(files) if len(files) < group else group 75 | res = [] 76 | for sfiles in h.grouped(files, by): 77 | a = "" 78 | for val in sfiles: 79 | a += val + " " 80 | data = subprocess.run( 81 | "gsutil -m ls " + add + " '" + a + "'", capture_output=True, shell=True 82 | ) 83 | if data.returncode != 0: 84 | if "One or more URLs matched no objects" not in str(data.stderr): 85 | raise ValueError("issue with the command: " + str(data.stderr)) 86 | if len(str(data.stdout)) < 4: 87 | return [] 88 | res += ( 89 | str(data.stdout)[2:-1].split("\\n")[:-1] 90 | if "L" not in add 91 | else ["gs://" + i for i in str(data.stdout).split("\\ngs://")] 92 | ) 93 | if "TOTAL:" in res[-1] and "L" not in add: 94 | res = res[:-1] 95 | return res 96 | 97 | 98 | def cpFiles(files, location, group=50): 99 | """ 100 | copy a set of files in parallel (when the set is huge) 101 | 102 | Args: 103 | ---- 104 | files: gs paths 105 | location to copy 106 | group: files to do in parallel 107 | """ 108 | by = len(files) if len(files) < group else group 109 | for sfiles in h.grouped(files, by): 110 | a = "" 111 | for val in sfiles: 112 | a += val + " " 113 | code = os.system("gsutil -m cp " + a + location) 114 | if code != 0: 115 | print("pressed ctrl+c or command failed") 116 | break 117 | 118 | 119 | def catFiles(files, group=50, split=False, cut=False): 120 | """ 121 | copy a set of files in parallel (when the set is huge) 122 | 123 | Args: 124 | ---- 125 | files: gs paths 126 | location to copy 127 | group: files to do in parallel 128 | cut: split all lines into chunks of size cut 129 | split: split lines by split e.g. 
\\n 130 | """ 131 | by = len(files) if len(files) < group else group 132 | res = [] 133 | for i, sfiles in enumerate(h.grouped(files, by)): 134 | print(i / (len(files) / by)) 135 | a = "" 136 | for val in sfiles: 137 | a += val + " " 138 | data = subprocess.run("gsutil -m cat " + a, capture_output=True, shell=True) 139 | if data.returncode != 0: 140 | if "One or more URLs matched no objects" not in str(data.stderr): 141 | print(ValueError("issue with the command: " + str(data.stderr))) 142 | return res 143 | if len(str(data.stdout)) < 4: 144 | return [] 145 | resa = str(data.stdout)[2:-1] 146 | if cut: 147 | res += [resa[i * cut : (i + 1) * cut] for i in range(int(len(resa) / cut))] 148 | elif split: 149 | res += resa.split(split) 150 | else: 151 | res += [resa] 152 | return res 153 | 154 | 155 | def rmFiles(files, group=50, add="", dryrun=True): 156 | """ 157 | remove a set of files in parallel (when the set is huge) 158 | 159 | Args: 160 | ---- 161 | files: gs paths 162 | group: number to do in parallel 163 | add: additional gsutil cp params 164 | """ 165 | by = len(files) if len(files) < group else group 166 | for sfiles in h.grouped(files, by): 167 | a = "" 168 | for val in sfiles: 169 | a += " " + val 170 | if add: 171 | add = " " + add 172 | if dryrun: 173 | print("gsutil -m rm" + add + a) 174 | else: 175 | code = os.system("gsutil -m rm" + add + a) 176 | if code != 0: 177 | print("pressed ctrl+c or command failed") 178 | break 179 | 180 | 181 | def recoverFiles(files, cores=1): 182 | """ 183 | recover a set of files in parallel that were erased 184 | 185 | files need to have their #id appended found using ls -al file 186 | 187 | Args: 188 | ---- 189 | files: gs paths 190 | location: to move the files to 191 | """ 192 | cmd = ["gsutil mv " + f + " " + f.split("#")[0] for f in files] 193 | h.parrun(cmd, cores=cores) 194 | 195 | 196 | def folderRN(gspath, newpath, cores=1): 197 | """ """ 198 | lis = lsFiles([gspath]) 199 | if lis != 0: 200 | h.parrun(["gsutil -m mv " + val + " " + newpath for val in lis], cores=cores) 201 | else: 202 | raise ValueError("no such folder") 203 | 204 | 205 | def patternRN( 206 | rename_dict, 207 | location, 208 | wildcards, 209 | types=[], 210 | dryrun=True, 211 | check_dependencies=True, 212 | cores=1, 213 | ): 214 | """ 215 | rename/move a bunch of GCP objects found in some specific places 216 | 217 | Args: 218 | ----- 219 | rename_dict: dict(prevName,newName) 220 | location: 221 | wildcards: list[str] can be one of ['**', '.*', '*.','-.*'] if needs to be 222 | ** means any occurence of this file in any folder will change its name 223 | .* means all file unregarding of the suffix, will rename them all a.bam [a]da.bai to b.bam, [b]da.bai 224 | *. means all files with the suffix, will change the suffix of these files from a to b 225 | -.* means all file unregarding of the suffix, will rename them. not just replacing the a part with a to b but the full file name [a]dea.bam to b.bam 226 | types: Nothing yet 227 | test: if test, just shows the command but does not run it 228 | cores: cores tells on how many processor to parallelize the tas#k 229 | """ 230 | val = [] 231 | for k, v in rename_dict.items(): 232 | val.append(v) 233 | if k in val and check_dependencies: 234 | raise ValueError("circular dependency in the rename with key " + k) 235 | for k, v in rename_dict.items(): 236 | loc = location 237 | if "**" in wildcards: 238 | loc += "**/" 239 | if "*." 
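# A hedged sketch of recoverFiles above: it expects the generation-suffixed
# paths printed by `gsutil ls -a` on a versioning-enabled bucket; the paths
# below are made-up placeholders.
def _example_recover_deleted():
    deleted = [
        "gs://my-bucket/results/sample1.bam#1612345678901234",
        "gs://my-bucket/results/sample2.bam#1612345678905678",
    ]
    recoverFiles(deleted, cores=2)  # runs `gsutil mv <path#generation> <path>` in parallel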
in wildcards or "-.*" in wildcards: 240 | loc += "*" 241 | loc += k 242 | if ".*" in wildcards or "-.*" in wildcards: 243 | loc += "*" 244 | res = os.popen("gsutil -m ls " + loc).read().split("\n")[:-1] 245 | print("found " + str(len(res)) + " files to rename") 246 | if "-.*" in wildcards: 247 | cmd = [ 248 | "gsutil mv " 249 | + val 250 | + " " 251 | + "/".join(val.split("/")[:-1]) 252 | + "/" 253 | + v 254 | + "." 255 | + ".".join(val.split("/")[-1].split(".")[1:]) 256 | for val in res 257 | ] 258 | else: 259 | cmd = ["gsutil mv " + val + " " + val.replace(k, v) for val in res] 260 | if dryrun: 261 | print(cmd) 262 | else: 263 | h.parrun(cmd, cores=cores) 264 | 265 | 266 | def get_all_sizes(folder, suffix="*"): 267 | """ 268 | will sort and list all the files by their sizes. 269 | 270 | If some files have the same size, will list them together 271 | 272 | Args: 273 | ---- 274 | folder: gs folder path 275 | suffix: of a specific file type 276 | 277 | Returns: 278 | ------- 279 | dict(sizes:[paths]) 280 | """ 281 | samples = os.popen("gsutil -m ls -al " + folder + "**." + suffix).read().split("\n") 282 | # compute size filepath 283 | sizes = { 284 | "gs://" 285 | + val.split("gs://")[1].split("#")[0]: int( 286 | re.split("\d{4}-\d{2}-\d{2}", val)[0] 287 | ) 288 | for val in samples[:-2] 289 | } 290 | names = {} 291 | for k, val in sizes.items(): 292 | if val in names: 293 | names[val].append(k) 294 | else: 295 | names[val] = [k] 296 | if names == {}: 297 | # we didn't find any valid file paths 298 | print("We didn't find any valid file paths in folder: " + str(folder)) 299 | return names 300 | 301 | 302 | def exists(val): 303 | """ 304 | tells if a gcp path exists 305 | """ 306 | if type(val) is str: 307 | return os.popen("gsutil ls " + val).read().split("\n")[0] == val 308 | elif type(val) is list: 309 | rest = set(val) - set(lsFiles(val)) 310 | return len(rest) == 0, rest 311 | 312 | 313 | def extractSize(val): 314 | """ 315 | extract the size from the string returned by an ls -l|a command 316 | """ 317 | return "gs://" + val.split("gs://")[1].split("#")[0], int( 318 | re.split("\d{4}-\d{2}-\d{2}", val)[0] 319 | ) 320 | 321 | 322 | def extractTime(val): 323 | """ 324 | extract the size from the string returned by an ls -l|a command 325 | """ 326 | return val.split(" ")[1].split("T")[0] 327 | 328 | 329 | def extractPath(val): 330 | """ 331 | extract the path from the string returned by an ls -l|a command 332 | """ 333 | return "gs://" + val.split("gs://")[1].split("#")[0] 334 | 335 | 336 | def extractHash(val, typ="crc32c"): 337 | """ 338 | extract the crc32 from the string returned by an ls -L command 339 | 340 | Args: 341 | ---- 342 | type: flag ['crc32c','md5'] 343 | """ 344 | if " Hash (crc32c):" in val and typ == "crc32c": 345 | return ( 346 | val.split(" Hash (crc32c): ")[-1] 347 | .split("\\\\n")[0] 348 | .split("\\n")[0] 349 | ) 350 | elif " Hash (md5):" in val and typ == "md5": 351 | return ( 352 | val.split(" Hash (md5): ")[-1].split("\\\\n")[0].split("\\n")[0] 353 | ) 354 | else: 355 | return None 356 | 357 | 358 | async def shareFiles(flist, users): 359 | """ 360 | will share a list of files from gcp with a set of users. 
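# A dry-run sketch of the wildcard modes patternRN documents above; the bucket
# and the rename pair are hypothetical. With dryrun=True the generated
# `gsutil mv` commands are only printed, nothing is moved.
def _example_rename_pattern():
    patternRN(
        {"old_sample_A": "new_sample_A"},  # prevName -> newName
        "gs://my-bucket/wes/",
        wildcards=["**", ".*"],            # any subfolder depth, any file suffix
        dryrun=True,
    )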
361 | 362 | Args: 363 | ---- 364 | users: list[str] of users' google accounts 365 | flist: list[str] of google storage path for which you want to share data 366 | 367 | """ 368 | if type(users) is str: 369 | users = [users] 370 | for user in users: 371 | files = "" 372 | for i in flist: 373 | files += " " + i 374 | code = os.system("gsutil -m acl ch -ru " + user + ":R " + files) 375 | if code == signal.SIGINT: 376 | print("Awakened") 377 | break 378 | print("the files are stored here:\n\n") 379 | print(flist) 380 | print("\n\njust install and use gsutil to copy them") 381 | print("https://cloud.google.com/storage/docs/gsutil_install") 382 | print("https://cloud.google.com/storage/docs/gsutil/commands/cp") 383 | 384 | 385 | def deleteOldVersions(path, onlymetagene=None, **kwargs): 386 | """ 387 | given a path to a folder in google cloud storage, will delete all the old versions of the files in the path. 388 | """ 389 | data = subprocess.run("gsutil -m ls -alh " + path, capture_output=True, shell=True) 390 | if data.returncode != 0: 391 | if "One or more URLs matched no objects" not in str(data.stderr): 392 | print(ValueError("issue with the command: " + str(data.stderr))) 393 | return res 394 | if len(str(data.stdout)) < 4: 395 | return [] 396 | resa = str(data.stdout)[2:-1].split("\\n")[:-2] 397 | torm = [] 398 | for i, val in enumerate(resa): 399 | if onlymetagene is not None: 400 | print("torework") 401 | # if "metageneration=" + str(metagene) in val: 402 | # name = "gs://" + val.split(" gs://")[1].split(" ")[0] 403 | # torm.append(name) 404 | # else: 405 | # prevname = torm[-1].split("#")[0] 406 | # name = "gs://" + val.split(" gs://")[1].split("#")[0] 407 | # if prevname != name: 408 | # print(prevname + " is unique and won't be deleted") 409 | # torm.pop() 410 | else: 411 | if "metageneration=" + str(1) not in val: 412 | name = "gs://" + val.split(" gs://")[1].split(" ")[0] 413 | torm.append(name) 414 | if ( 415 | "gs://" + resa[i + 1].split(" gs://")[1].split("#")[0] 416 | != name.split("#")[0] 417 | ): 418 | print(name + " is unique and won't be deleted") 419 | torm.pop() 420 | print(h.dups([val.split("#")[0] for val in torm])) 421 | 422 | return rmFiles(torm, **kwargs) 423 | -------------------------------------------------------------------------------- /genepy/google/good-retention.json: -------------------------------------------------------------------------------- 1 | { 2 | "lifecycle": { 3 | "rule": [ 4 | { 5 | "action": { 6 | "type": "Delete" 7 | }, 8 | "condition": { 9 | "daysSinceNoncurrentTime": 90 10 | } 11 | }, 12 | { 13 | "action": { 14 | "type": "Delete" 15 | }, 16 | "condition": { 17 | "numNewerVersions": 2 18 | } 19 | } 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /genepy/google/google_sheet.py: -------------------------------------------------------------------------------- 1 | import gspread 2 | from oauth2client.service_account import ServiceAccountCredentials 3 | 4 | scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets', 5 | "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"] 6 | 7 | 8 | def dfToSheet(df, sheetid, secret='~/.credentials.json'): 9 | credentials = ServiceAccountCredentials.from_json_keyfile_name(secret, scope) 10 | client = gspread.authorize(credentials) 11 | spreadsheet = client.open(sheetid) 12 | df.to_csv('/tmp/sheet.csv') 13 | with open("/tmp/sheet.csv", 'r') as file_obj: 14 | content = file_obj.read() 15 
| client.import_csv(spreadsheet.id, data=content) 16 | -------------------------------------------------------------------------------- /genepy/google/gsheet_upload.py: -------------------------------------------------------------------------------- 1 | import gspread 2 | from oauth2client.service_account import ServiceAccountCredentials 3 | 4 | scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets', 5 | "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"] 6 | 7 | credentials = ServiceAccountCredentials.from_json_keyfile_name('~/.client_secret.json', scope) 8 | client = gspread.authorize(credentials) 9 | 10 | spreadsheet = client.open('https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE') 11 | 12 | with open(file, 'r') as file_obj: 13 | content = file_obj.read() 14 | client.import_csv(spreadsheet.id, data=content) 15 | -------------------------------------------------------------------------------- /genepy/imaging/fish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial import distance_matrix 3 | from genepy.utils import helper as h 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | 8 | 9 | # make a plot of averaged binned signal strength by distance from locis 10 | def computeDistsFromClass(dots, seconddots, conds=['DMSO', 'VHL'], groupcol="group", 11 | sclass='green', signal="mean_green", area="area"): 12 | """ 13 | """ 14 | dists= {} 15 | twodists = {} 16 | for val in set(dots.exp): 17 | for e in conds: 18 | d = dots[(dots.exp==val)&(dots.treat==e)] 19 | dist = [] 20 | weight = [] 21 | newdist = [] 22 | ind=[] 23 | m = seconddots[(seconddots.exp==val)&(seconddots.treat==e)] 24 | print(val, e) 25 | for i,(k, v) in enumerate(m.iterrows()): 26 | h.showcount(i, len(m)) 27 | dist.append( 28 | distance_matrix(d[(d['class']==sclass)& 29 | (d[groupcol]==v[groupcol])][['x', "y", "z"]].values, 30 | np.array([v[['x_mean', "y_mean", "z_mean"]]])).T[0].astype(float)) 31 | weight.append(d[(d['class'] == sclass)&(d[groupcol]==v[groupcol])][signal]) 32 | dat = d[(d['class'] == sclass) & 33 | (d[groupcol] == v[groupcol])][['x', "y", "z", signal, area, "m_id"]] 34 | a = dat.values 35 | a[:,:3] = a[:,:3] - v[['x_mean', "y_mean", "z_mean"]].values 36 | newdist.append(a) 37 | ind.extend(dat.index.tolist()) 38 | twodists[val+e] = pd.DataFrame(data=np.vstack(newdist), 39 | columns=['x', 'y', 'z', signal, area, "m_id"], 40 | index=ind) 41 | dists[val+e] = [np.hstack(dist), np.hstack(weight)] 42 | return twodists, dists 43 | 44 | 45 | def drawDots(dists, scenter=False, size=1000, zsize=1000, 46 | folder="", signal="signal", levels=20, 47 | area="area", vmin=None, vmax=None, 48 | norm=None, norm_dots=None, second=None, 49 | color="seagreen", 50 | seccolor=sns.light_palette("orange", as_cmap=True), **kwargs): 51 | """ 52 | """ 53 | sm = [] 54 | m = [] 55 | sca=1.2 56 | if second is not None: 57 | for _, a in dists.items(): 58 | sm.append(a[second(a)][signal].max()) 59 | for _, a in dists.items(): 60 | m.append(a[signal].mean()) 61 | for i, (k,a) in enumerate(dists.items()): 62 | a = a.copy() 63 | a[area] = ((a[area]/(3.14))**(1/2)).astype(float) 64 | 65 | a = a[(abs(a.x) 0).T[0].tolist() 119 | # we get all its connections 120 | con_val = gdot.iloc[con] 121 | ids = list(set(con_val[mergedidcol]) - set([None])) 122 | # if connections are already connected we use this id 123 | if len(ids) > 0: 
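# A small usage sketch for dfToSheet from genepy/google/google_sheet.py above;
# the sheet title and the credential path are placeholders, and the service
# account behind the credentials needs edit access to that spreadsheet.
def _example_upload_tracker():
    import pandas as pd
    from genepy.google.google_sheet import dfToSheet
    tracker = pd.DataFrame({"sample": ["A", "B"], "qc_pass": [True, False]})
    dfToSheet(tracker, "my tracking sheet", secret="~/.credentials.json")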
124 | def_id = ids[0] 125 | # for each connection, if have another id, 126 | # replace this id with the current one 127 | for i in ids: 128 | con.extend(np.argwhere( 129 | (gdot[mergedidcol] == i).values).T[0].tolist()) 130 | con = list(set(con)) 131 | # if none we create a new id 132 | else: 133 | idcount+=1 134 | def_id = "id_"+str(idcount) 135 | gdot.loc[gdot.iloc[con].index.tolist(), mergedidcol] = def_id 136 | #except: 137 | # pdb.set_trace() 138 | merged.loc[gdot.index.tolist(), mergedidcol] = gdot[mergedidcol].tolist() 139 | return merged 140 | 141 | def mergeAnnotated(annot, minzstack=2, groupdefault={}, todrop=[], coltocount="image", 142 | id="m_id", colocName="cobinding"): 143 | """ 144 | """ 145 | annot = annot.drop(columns=todrop) 146 | grouping = {i: "mean" for i in annot.columns} 147 | if groupdefault: 148 | grouping.update(groupdefault) 149 | grouping.pop(id) 150 | # merge into a same sample 151 | groups = annot.groupby(id) 152 | counts = groups[coltocount].count() 153 | merged = groups.agg(grouping) 154 | merged['counts'] = counts 155 | merged = merged[merged['counts'] >= minzstack] 156 | merged.columns = [i[0] if "first" in i[1] 157 | else '_'.join(i) for i in merged.columns] 158 | #rename colors 159 | merged['class'] = [i[0] if len( 160 | i) == 1 else colocName for i in merged["class_unique"]] 161 | return merged.drop(columns="class_unique") 162 | -------------------------------------------------------------------------------- /genepy/mutations/README.md: -------------------------------------------------------------------------------- 1 | # Mutations 2 | 3 | A set of functions to help process any types of mutations 4 | 5 | 6 | ## contains: 7 | 8 | - vcf_to_df: transforms a vcf file into a dataframe file as best as it can 9 | - mafToMat: turns a maf file into a matrix of mutations x samples (works with multiple sample file) 10 | - mergeAnnotations: merges two maf files, taking carre of duplicate samples and duplicate (works with multiple sample file) 11 | - filterAllelicFraction: filters a MAF file based on allelic fraction (works with multiple sample file) 12 | - filterCoverage: filters a MAF file based on read coverage (works with multiple sample file) 13 | - manageGapsInSegments: extends the ends of segments in a segment file from GATK so as to remove all gaps ove the genome (works with multiple sample file) 14 | - toGeneMatrix: makes a geneXsample matrix from segment level copy number (works with multiple sample file) 15 | - checkAmountOfSegments: will compute the number of segments for each samples from a df of segments from RSEM (works with multiple sample file) 16 | - checkGeneChangeAccrossAll: used to find poor quality genes in CN data (works with multiple sample file) 17 | -------------------------------------------------------------------------------- /genepy/mutations/__init__.py: -------------------------------------------------------------------------------- 1 | # Jeremie Kalfon 2 | # for BroadInsitute 3 | # in 2019 4 | 5 | from __future__ import print_function 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from genepy.utils import helper as h 10 | import gzip 11 | import seaborn as sns 12 | 13 | 14 | def vcf_to_df( 15 | path, 16 | additional_cols=[], 17 | additional_filters=[], 18 | parse_filter=False, 19 | drop_null=False, 20 | force_keep=[], 21 | cols_to_drop=[ 22 | "clinvar_vcf_mc", 23 | "oreganno_build", 24 | "gt", 25 | "ad", 26 | "af", 27 | "dp", 28 | "f1r2", 29 | "f2r1", 30 | "fad", 31 | "sb", 32 | "pid", 33 | ], 34 | **kwargs, 35 | ): 36 | """ 
37 | transforms a vcf file into a dataframe file as best as it can 38 | 39 | Args: 40 | ----- 41 | path: str filepath to the vcf file 42 | additional_filters: list[str] additional values added by the filtering tool looks for PASS, base_qual, 43 | clustered_events, fragment, germline, haplotype, map_qual, multiallelic, 44 | panel_of_normals, position, slippage, strand_bias, weak_evidence 45 | additional_cols: list[str] of additional colnames in the vcf already looks for 'DB', 46 | 'SOMATIC', 'GERMLINE', "OVERLAP", "IN_PON", "STR", "ReverseComplementedAlleles" 47 | parse_filter: bool if true, will parse the filter field and add it to the dataframe 48 | drop_null: bool if a column appears to be fully empty, will drop it 49 | force_keep: list[str] columns to force keep even if they are empty 50 | cols_to_drop: list[str] columns to drop even if they are not empty 51 | 52 | Returns: 53 | -------- 54 | a dataframe fo the vcf 55 | a dict associating each column with its description (gathered from the vcf header) 56 | a list of the columns that have been dropped 57 | """ 58 | uniqueargs = [ 59 | "DB", 60 | "SOMATIC", 61 | "GERMLINE", 62 | "OVERLAP", 63 | "IN_PON", 64 | "STR", 65 | "ReverseComplementedAlleles", 66 | ] + additional_cols 67 | 68 | filters = [ 69 | "PASS", 70 | "base_qual", 71 | "clustered_events", 72 | "fragment", 73 | "germline", 74 | "haplotype", 75 | "map_qual", 76 | "multiallelic", 77 | "panel_of_normals", 78 | "position", 79 | "slippage", 80 | "strand_bias", 81 | "weak_evidence", 82 | ] + additional_filters 83 | 84 | FUNCO_DESC = "Functional annotation from the Funcotator tool." 85 | 86 | dropped_cols = [] 87 | 88 | def read_comments(f): 89 | description = {} 90 | colnames = [] 91 | rows = 0 92 | for l in f: 93 | l = l.decode("utf-8") if type(l) is not str else l 94 | if l.startswith("##"): 95 | rows += 1 96 | if "FORMAT" in l[:20]: 97 | res = l.split("ID=")[1].split(",")[0] 98 | desc = l.split("Description=")[1][:-2] 99 | description.update({res: desc}) 100 | if "INFO" in l[:20]: 101 | res = l.split("ID=")[1].split(",")[0] 102 | if res == "FUNCOTATION": 103 | print("parsing funcotator special") 104 | for val in l.split("Description=")[1][:-2].split("|"): 105 | val = val.split("Funcotation fields are: ")[-1] 106 | description.update({val: FUNCO_DESC}) 107 | else: 108 | desc = l.split("Description=")[1][:-2] 109 | description.update({res: desc}) 110 | elif l.startswith("#"): 111 | colnames = l[1:-1].split("\t") 112 | rows += 1 113 | else: 114 | break 115 | return description, colnames, rows 116 | 117 | if path.endswith(".gz"): 118 | with gzip.open(path, "r") as f: 119 | description, colnames, nrows_toskip = read_comments(f) 120 | else: 121 | with open(path, "r") as f: 122 | description, colnames, nrows_toskip = read_comments(f) 123 | colnames = [i for i in colnames] 124 | csvkwargs = { 125 | "sep": "\t", 126 | "index_col": False, 127 | "header": None, 128 | "names": colnames, 129 | "skiprows": nrows_toskip + kwargs.get("skiprows", 0), 130 | } 131 | data = pd.read_csv(path, **{**kwargs, **csvkwargs}) 132 | print(description) 133 | funco_fields = [k for k, v in description.items() if FUNCO_DESC in v] 134 | fields = {k: [] for k, _ in description.items()} 135 | try: 136 | for j, info in enumerate(data["INFO"].str.split(";").values.tolist()): 137 | res = {} 138 | # show speed 139 | if j % 10_000 == 0: 140 | print(j, end="\r") 141 | for annot in info: 142 | if annot in uniqueargs: 143 | res.update({annot: True}) 144 | elif "=" in annot: 145 | # taking care of the funcotator special 
fields 146 | if "FUNCOTATION" in annot: 147 | # for multi allelic site: 148 | annot = annot.replace("FUNCOTATION=", "")[1:-1] 149 | res.update({name: [] for name in funco_fields}) 150 | for site in annot.split("],["): 151 | if "]#[" in site: 152 | site = site.split("]#[")[0] 153 | site = ( 154 | site.replace("_%7C_", " ") 155 | .replace("_%20_", " ") 156 | .replace("_%2C_", ",") 157 | .replace("_%3D_", "=") 158 | .split("|") 159 | ) 160 | for i, sub_annot in enumerate(site): 161 | res[funco_fields[i]].append(sub_annot) 162 | for k in funco_fields: 163 | res[k] = ",".join(res[k]) 164 | else: 165 | k, annot = annot.split("=") 166 | res.update({k: annot}) 167 | else: 168 | raise ValueError("unknown argument: " + annot) 169 | for k in list(fields.keys()): 170 | fields[k].append(res.get(k, None)) 171 | except ValueError: 172 | print(annot) 173 | raise ValueError("unknown field") 174 | 175 | data = pd.concat( 176 | [data.drop(columns="INFO"), pd.DataFrame(data=fields, index=data.index)], axis=1 177 | ) 178 | if drop_null: 179 | to_drop = [] 180 | for f in funco_fields: 181 | # drop columns that have the same value across all rows 182 | uniq = data[f].unique() 183 | if len(uniq) == 1 and f.lower() not in force_keep: 184 | to_drop.append(f) 185 | continue 186 | elif len(uniq) < 10: 187 | # checking multi allelic stuff 188 | multi = [] 189 | for v in uniq: 190 | multi += v.split(",") 191 | if len(set(multi)) == 1 and f.lower() not in force_keep: 192 | to_drop.append(f) 193 | print("dropping uninformative columns:", to_drop) 194 | data = data.drop(columns=to_drop) 195 | dropped_cols += to_drop 196 | data.columns = [i.lower() for i in data.columns] 197 | samples = [i.lower() for i in colnames[9:]] 198 | print("\nthe samples are:", samples) 199 | sorting = data["format"][0].split(":") 200 | for sample in samples: 201 | res = data[sample].str.split(":").values.tolist() 202 | maxcols = max([len(v) for v in res]) 203 | if maxcols - len(sorting) > 0: 204 | for i in range(maxcols - len(sorting)): 205 | sorting.append(sorting[-1] + "_" + str(i + 1)) 206 | if len(samples) > 1: 207 | sorting = [sample + "_" + v for v in sorting] 208 | data = pd.concat( 209 | [ 210 | data.drop(columns=sample), 211 | pd.DataFrame(data=res, columns=sorting, index=data.index), 212 | ], 213 | axis=1, 214 | ) 215 | 216 | # subsetting filters 217 | if parse_filter: 218 | data[filters] = False 219 | for f in filters: 220 | data.loc[data["filter"].str.contains(f), f] = True 221 | data = data.drop(columns="filter") 222 | dropped_cols.append("filter") 223 | 224 | # cleaning empty cols 225 | data = data.drop(columns="format") 226 | dropped_cols.append("format") 227 | 228 | todrop = [] 229 | for val in cols_to_drop: 230 | if val in data.columns.tolist(): 231 | todrop.append(val) 232 | data = data.drop(columns=todrop) 233 | 234 | if drop_null: 235 | empty = data.columns[data.isna().sum() == len(data)].tolist() 236 | empty = list(set(empty) - set(force_keep)) 237 | print("dropping empty columns:", empty) 238 | data = data.drop(columns=empty) 239 | dropped_cols += empty 240 | 241 | # weird bug sometimes 242 | if "SB_1" in data.columns.tolist(): 243 | loc = ~data.SB_1.isna() 244 | data.loc[loc, "PGT"] = data.loc[loc, "SB"] 245 | data.loc[loc, "SB"] = data.loc[loc, "SB_1_2_3"] 246 | data = data.drop(columns=["SB_1", "SB_1_2_3"]) 247 | data = data.rename(columns={"SB_1_2": "PS", "SB_1": "PID"}) 248 | else: 249 | loc = data.SB.isna() 250 | data.loc[loc, "SB"] = data.loc[loc, "PGT"] 251 | data.loc[loc, "PGT"] = "" 252 | # sorting out issue with 
253 | return data, description, dropped_cols 254 | 255 | 256 | def mafToMat( 257 | maf, 258 | mode="bool", 259 | freqcol="tumor_f", 260 | samplesCol="DepMap_ID", 261 | mutNameCol="Hugo_Symbol", 262 | minfreqtocall=0.2, 263 | ): 264 | """ 265 | turns a maf file into a matrix of mutations x samples (works with multiple sample file) 266 | 267 | Args: 268 | ----- 269 | maf: dataframe of the maf file 270 | sample_col: str colname for samples 271 | mode: flag "bool" to convert the matrix into a boolean (mut/no mut) 272 | "float" to keep the allele frequencies as is (0.x) 273 | "genotype" to have either 1, 0.5 or 0 274 | freqcol: str colname where ref/alt frequencies are stored 275 | mutNameCol: str colname where mutation names are stored, will merge things over that column name 276 | 277 | Returns: 278 | -------- 279 | the dataframe matrix 280 | """ 281 | samples = set(maf[samplesCol]) 282 | maf = maf[maf[freqcol] >= minfreqtocall] 283 | maf = maf.sort_values(by=mutNameCol) 284 | mut = pd.DataFrame( 285 | data=np.zeros((len(set(maf[mutNameCol])), 1)), 286 | columns=["fake"], 287 | index=set(maf[mutNameCol]), 288 | ).astype(float) 289 | for i, val in enumerate(samples): 290 | h.showcount(i, len(samples)) 291 | if mode == "genotype": 292 | mut = mut.join( 293 | maf[maf[samplesCol] == val] 294 | .set_index(mutNameCol)[freqcol] 295 | .groupby(mutNameCol) 296 | .agg("sum") 297 | .rename(val) 298 | ) 299 | else: 300 | mut = mut.join( 301 | maf[maf[samplesCol] == val] 302 | .drop_duplicates(mutNameCol) 303 | .set_index(mutNameCol)[freqcol] 304 | .rename(val) 305 | ) 306 | mut = mut.fillna(0).astype(bool if mode == "bool" else float).drop(columns=["fake"]) 307 | if mode == "genotype": 308 | mut[(mut > 1.3)] = 3 309 | mut[(mut >= 0.7) & (mut <= 1.3)] = 2 310 | mut[(mut > 0.3) & (mut < 0.7)] = 1 311 | mut[mut <= 0.3] = 0 312 | return mut 313 | 314 | 315 | def mergeAnnotations( 316 | firstmaf, 317 | additionalmaf, 318 | mutcol="mutation", 319 | Genome_Change="Genome_Change", 320 | Start_position="Start_position", 321 | Chromosome="Chromosome", 322 | samplename="DepMap_ID", 323 | useSecondForConflict=True, 324 | dry_run=False, 325 | ): 326 | """ 327 | merges two maf files, taking carre of duplicate samples and duplicate (works with multiple sample file) 328 | 329 | Args: 330 | ----- 331 | firstmaf: dataframe the first maf file 332 | additionalmaf: dataframe the second maf file (need to contain same colnames) 333 | Genome_Change: str colnames of the Genome_Change column 334 | Start_position: str colnames of the Start_position column 335 | Chromosome: str colnames of the Chromosome column 336 | samplename: str colnames of the samplename column (for multiple samples, even if one, needs to have this column) 337 | useSecondForConflict: bool if false use the first df as reference else use the second one 338 | dry_run: if true, will just output conflict regions and not merge the dataframes 339 | 340 | Returns: 341 | ------- 342 | dataframe of the maf file if not dryrun, else an np array of the merge issues 343 | """ 344 | mutations = firstmaf.copy() 345 | mutations["ind"] = mutations[samplename] + "_" + mutations[Genome_Change] 346 | mutations["loci"] = ( 347 | mutations[samplename] 348 | + "_" 349 | + mutations[Chromosome] 350 | + "_" 351 | + mutations[Start_position].astype(str) 352 | ) 353 | additionalmaf["ind"] = ( 354 | additionalmaf[samplename] + "_" + additionalmaf[Genome_Change] 355 | ) 356 | additionalmaf["loci"] = ( 357 | additionalmaf[samplename] 358 | + "_" 359 | + additionalmaf[Chromosome] 360 | + "_" 
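# A minimal sketch of vcf_to_df and mafToMat above. The VCF path is a
# placeholder; the toy maf frame only carries the three columns mafToMat
# reads by default (DepMap_ID, Hugo_Symbol, tumor_f).
def _example_mutation_matrix():
    variants, descriptions, dropped = vcf_to_df(
        "my_sample.vcf.gz", parse_filter=True, drop_null=True
    )
    maf = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000001", "ACH-000002"],
            "Hugo_Symbol": ["TP53", "KRAS", "TP53"],
            "tumor_f": [0.45, 0.12, 0.80],
        }
    )
    # boolean mutation x sample matrix, ignoring calls below 20% allelic fraction
    mut_matrix = mafToMat(maf, mode="bool", minfreqtocall=0.2)
    return variants, descriptions, dropped, mut_matrix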
361 | + additionalmaf[Start_position].astype(str) 362 | ) 363 | inboth = set(additionalmaf["loci"]) & set(mutations["loci"]) 364 | notineach = set(additionalmaf["ind"]) ^ set(mutations["ind"]) 365 | submut = mutations[mutations.loci.isin(inboth) & mutations.ind.isin(notineach)] 366 | subother = additionalmaf[ 367 | additionalmaf.loci.isin(inboth) & additionalmaf.ind.isin(notineach) 368 | ] 369 | issues = None 370 | if len(submut) > 0: 371 | print("found " + str(len(submut)) + " nonmatching mutations") 372 | issues = np.vstack( 373 | [ 374 | submut.sort_values(by="loci")[Genome_Change].values, 375 | subother.sort_values(by="loci")[Genome_Change].values, 376 | ] 377 | ).T 378 | if dry_run: 379 | print(issues) 380 | if not dry_run: 381 | if issues is not None: 382 | if useSecondForConflict: 383 | mutations = mutations[~mutations.ind.isin(set(submut.ind))] 384 | else: 385 | additionalmaf = additionalmaf[ 386 | ~additionalmaf.ind.isin(set(subother.ind)) 387 | ] 388 | mutations = mutations.append( 389 | additionalmaf[ 390 | additionalmaf["ind"].isin( 391 | set(additionalmaf["ind"]) - set(mutations["ind"]) 392 | ) 393 | ] 394 | ) 395 | subother = additionalmaf[ 396 | additionalmaf.loci.isin(inboth) & ~additionalmaf.ind.isin(notineach) 397 | ].set_index("ind") 398 | mutations = mutations.set_index("ind") 399 | mutations.loc[subother.index.tolist(), mutcol] = subother[mutcol].tolist() 400 | return ( 401 | mutations.drop(columns=["loci"]) 402 | .sort_values(by=[samplename, Chromosome, Start_position]) 403 | .reset_index(drop=True) 404 | ) 405 | else: 406 | return issues 407 | 408 | 409 | def filterAllelicFraction(maf, loc=["CGA_WES_AC"], sep=":", frac=0.1): 410 | """ 411 | filters a MAF file based on allelic fraction (works with multiple sample file) 412 | 413 | Args: 414 | ----- 415 | maf: dataframe of the maf file 416 | loc: list[str] colnames with the ref:alt 417 | sep: str separato between ref:alt 418 | frac: float min fraction 419 | 420 | Returns: 421 | ------- 422 | dataframe of the maf file 423 | """ 424 | muts = np.zeros((len(maf), 2)) 425 | for val in loc: 426 | muts += np.array( 427 | [ 428 | [v[0], 0] if "NA" in v else v 429 | for v in maf[val] 430 | .fillna("0" + sep + "0") 431 | .astype(str) 432 | .str.split(sep) 433 | .tolist() 434 | ] 435 | ).astype(int) 436 | muts = muts[:, 0] / (muts[:, 0] + muts[:, 1]) 437 | return maf[muts >= frac] 438 | 439 | 440 | def filterCoverage(maf, loc=["CGA_WES_AC"], sep=":", cov=4, altloc=0): 441 | """ 442 | filters a MAF file based on read coverage (works with multiple sample file) 443 | 444 | Args: 445 | ----- 446 | maf: dataframe of the maf file 447 | loc: list[str] colnames with the ref:alt 448 | sep: str separato between ref:alt 449 | cov: min coverage 450 | altloc: 0 to filter on alt and 1 to filter on ref 451 | 452 | Returns: 453 | ------- 454 | dataframe of the maf file 455 | """ 456 | muts = np.zeros((len(maf), 2)) 457 | for val in loc: 458 | muts += np.array( 459 | [ 460 | [v[0], 0] if "NA" in v else v 461 | for v in maf[val] 462 | .fillna("0" + sep + "0") 463 | .astype(str) 464 | .str.split(sep) 465 | .tolist() 466 | ] 467 | ).astype(int) 468 | return maf[muts[:, altloc] >= cov] 469 | 470 | 471 | def manageGapsInSegments( 472 | segtocp, Chromosome="Chromosome", End="End", Start="Start", cyto=None 473 | ): 474 | """ 475 | extends the ends of segments in a segment file from GATK so as to remove all gaps ove the genome (works with multiple sample file) 476 | 477 | Args: 478 | ---- 479 | segtocp: dataframe of segments from GATK CN pipeline 480 | 
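# A toy filtering sketch for filterCoverage and filterAllelicFraction above;
# CGA_WES_AC holds colon-separated read counts as a string, following the
# loc/sep convention of both functions.
def _example_maf_filters():
    maf = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000002"],
            "Hugo_Symbol": ["TP53", "KRAS"],
            "CGA_WES_AC": ["12:88", "1:150"],
        }
    )
    maf = filterCoverage(maf, loc=["CGA_WES_AC"], cov=4, altloc=0)  # keep rows whose first count is >= 4
    maf = filterAllelicFraction(maf, loc=["CGA_WES_AC"], frac=0.1)  # keep rows with first/(first+second) >= 0.1
    return maf  # here only the TP53 row survives both filters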
Chromosome: str the value for the Chromosome columns 481 | End: str the value for the End columns 482 | Start: str the value for the Start columns 483 | cyto: dataframe with chrom;end; columns giving the size of each chromosome (else puts last segment to 1000000000) 484 | """ 485 | prevchr = "" 486 | prevend = 0 487 | count = 0 488 | l = [] 489 | segments = segtocp.copy() 490 | le = len(segments) 491 | for k, val in segments.iterrows(): 492 | h.showcount(count, le) 493 | count += 1 494 | if val[Chromosome] != prevchr: # we changed chromosome 495 | # we extend the previous segment (last of the prev chrom) to.. way enough 496 | if len(l) > 0: 497 | l[-1][2] = ( 498 | 1000000000 499 | if cyto is None 500 | else cyto[cyto["chrom"] == prevchr]["end"].values[-1] 501 | ) 502 | # we extend the first segment to 0 503 | l.append([val[Chromosome], 0, val[End]]) 504 | else: 505 | if val[Start] > prevend + 1: # we have a gap in the same chrom 506 | sizeofgap = val[Start] - prevend 507 | # we add to the previous one half of the gap 508 | l[-1][2] += ( 509 | int(sizeofgap / 2) if sizeofgap % 2 == 0 else int(sizeofgap / 2) + 1 510 | ) 511 | # the rest to the other 512 | l.append([val[Chromosome], val[Start] - int(sizeofgap / 2), val[End]]) 513 | elif val[Start] < prevend: # this should never happen 514 | # import pdb; pdb.set_trace() 515 | raise ValueError("start comes after end") 516 | else: 517 | l.append([val[Chromosome], val[Start], val[End]]) 518 | prevchr = val[Chromosome] 519 | prevend = val[End] 520 | # we extend the last one 521 | l[-1][2] = ( 522 | 1000000000 if cyto is None else cyto[cyto["chrom"] == prevchr]["end"].values[-1] 523 | ) 524 | segments[[Chromosome, Start, End]] = l 525 | return segments.reset_index(drop=True) 526 | 527 | 528 | def toGeneMatrix( 529 | segments, 530 | gene_mapping, 531 | style="weighted", 532 | missingchrom=["Y"], 533 | gene_names_col="gene_name", 534 | ): 535 | """ 536 | makes a geneXsample matrix from segment level copy number (works with multiple sample file) 537 | 538 | Args: 539 | ---- 540 | style: str one of "weighted","mean","closest" 541 | segments: dataframe of segments containing: [Chromosome, Segment_Mean, Chromosome, start, end] columns 542 | gene_mapping: dataframe with symbol, ensembl_id columns for each gene 543 | missingchrom: list[str] chromosomes not to look into 544 | 545 | Returns: 546 | ------- 547 | pd.dataframe: the matrix 548 | """ 549 | samples = list(set(segments.DepMap_ID)) 550 | data = np.zeros((len(samples), len(gene_mapping))) 551 | for i, sample in enumerate(samples): 552 | segs = segments[segments.DepMap_ID == sample][ 553 | ["Chromosome", "Start", "End", "Segment_Mean"] 554 | ].values 555 | hasmissing = set(missingchrom) - set(segs[:, 0]) 556 | j = 0 557 | h.showcount(i, len(samples)) 558 | for k, gene in enumerate(gene_mapping[["Chromosome", "start", "end"]].values): 559 | # print(i,j) 560 | if gene[0] in hasmissing: 561 | data[i, k] = np.nan 562 | continue 563 | try: 564 | while gene[0] != segs[j][0] or gene[1] >= segs[j][2]: 565 | # print("went beyong",gene, segs[j]) 566 | j += 1 567 | # some genes are within other genes, we need to go back in the list of segment in that case 568 | except: 569 | raise ValueError("forgot to sort one of the DF?") 570 | while gene[1] < segs[j][1]: 571 | j -= 1 572 | # print("decrease gene",gene) 573 | # we are entirely within the segment 574 | c = 1 575 | if gene[2] <= segs[j][2]: 576 | data[i, k] = segs[j][3] 577 | else: 578 | # how much of the gene is covered by the segment 579 | coef = (segs[j][2] 
- gene[1]) / (gene[2] - gene[1]) 580 | # print('coef',coef) 581 | val = segs[j][3] * coef if style == "weighted" else segs[j][3] 582 | end = segs[j][2] 583 | # until the end of a segments goes beyond the end of the gene (say if we have X segments within the gene) 584 | while end < gene[2]: 585 | # pdb.set_trace() 586 | j += 1 587 | c += 1 588 | nextend = segs[j][2] if segs[j][2] < gene[2] else gene[2] 589 | # here, end (of prevsegment) is the next segment's start 590 | ncoef = (nextend - end) / (gene[2] - gene[1]) 591 | # print('multi',gene, ncoef) 592 | if style == "closest": 593 | if ncoef > coef: 594 | val = segs[j][3] 595 | else: 596 | # we switch it back (see line 894) 597 | ncoef = coef 598 | else: 599 | val += segs[j][3] * ncoef if style == "weighted" else segs[j][3] 600 | end = segs[j][2] 601 | coef = ncoef 602 | data[i, k] = val if style == "weighted" else val / c 603 | return pd.DataFrame(data=data, index=samples, columns=gene_mapping[gene_names_col]) 604 | 605 | 606 | def checkAmountOfSegments(segmentcn, thresh=850, samplecol="DepMap_ID"): 607 | """ 608 | if there is too many segments, something might be wrong (works with multiple sample file) 609 | 610 | will compute the number of segments for each samples from a df of segments from RSEM 611 | 612 | Args: 613 | ---- 614 | segmentcn: segment dataframe 615 | thresh: max ok amount 616 | """ 617 | failed = [] 618 | celllines = set(segmentcn[samplecol].tolist()) 619 | amounts = [] 620 | for cellline in celllines: 621 | val = segmentcn[segmentcn[samplecol] == cellline].shape[0] 622 | amounts.append(val) 623 | if val > thresh: 624 | failed.append(cellline) 625 | print(cellline, val) 626 | sns.kdeplot(amounts) 627 | return failed 628 | 629 | 630 | def checkGeneChangeAccrossAll(genecn, thresh=0.2): 631 | """ 632 | used to find poor quality genes in CN data (works with multiple sample file) 633 | 634 | compute given a df of gene x sample CN counts, how much change there is accross samples for 635 | a same gene and returns ones that are below the threshold 636 | 637 | Args: 638 | ----- 639 | genecn: gene cn data frame 640 | thresh: threshold in logfold change accross all of them 641 | """ 642 | return genecn.columns[genecn.var() < thresh].tolist() 643 | 644 | 645 | def renameColumns(df): 646 | """ 647 | rename some of the main columns names from RSEM, GATK.. 
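# An end-to-end toy sketch for the copy-number helpers above: close the gaps
# in a per-sample segment table, then project segments onto genes. Column
# names follow the defaults of manageGapsInSegments and toGeneMatrix; every
# value is made up.
def _example_segments_to_genes():
    segments = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000001"],
            "Chromosome": ["1", "1"],
            "Start": [10_000, 1_200_000],
            "End": [1_000_000, 2_000_000],
            "Segment_Mean": [1.0, 1.5],
        }
    )
    segments = manageGapsInSegments(segments)  # extend segment ends so they tile the chromosome
    gene_mapping = pd.DataFrame(
        {
            "gene_name": ["GENE_A", "GENE_B"],
            "Chromosome": ["1", "1"],
            "start": [100_000, 900_000],
            "end": [200_000, 1_400_000],
        }
    )
    # weighted average of overlapping segments per gene; one row per sample, one column per gene
    return toGeneMatrix(segments, gene_mapping, style="weighted")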
to more readable column names 648 | Args: 649 | ----- 650 | df: the df to rename 651 | Returns: 652 | ------ 653 | df the renamed df 654 | """ 655 | return df.rename( 656 | columns={ 657 | "Sample": "DepMap_ID", 658 | "CONTIG": "Chromosome", 659 | "START": "Start", 660 | "END": "End", 661 | "seqnames": "Chromosome", 662 | "start": "Start", 663 | "end": "End", 664 | } 665 | ) 666 | -------------------------------------------------------------------------------- /genepy/rna/README.md: -------------------------------------------------------------------------------- 1 | # RNA 2 | 3 | A set of functions to work with RNAseq (and related) data type 4 | 5 | ## contains 6 | 7 | 8 | - filterProteinCoding: removes all non protein coding genes from a list (you need taiga access) 9 | - convertGenes: converts genes from a naming to another (you need taiga access) 10 | - getSpikeInControlScales: extracts the spike in control values from a set of bam files 11 | - GSEAonExperiments: perform GSEA to compare a bunch of conditions at once 12 | - runERCC: creates an ERCC dashboard and extract the RNA spike ins from it (need rpy2 and ipython and R's ERCCdashboard installed) 13 | 14 | ## recommended tools 15 | 16 | - ERCCdashboard (R) 17 | - DESeq2 (R) 18 | - slamdunk 19 | - GSVA (R) 20 | - gseapy (python) -------------------------------------------------------------------------------- /genepy/rna/pyDESeq2.py: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # 3 | # PYDESEQ 4 | # 5 | ################################################################## 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | import rpy2.robjects as robjects 10 | from rpy2.robjects import pandas2ri, Formula, numpy2ri 11 | pandas2ri.activate() 12 | import rpy2 13 | from rpy2.robjects.packages import importr 14 | deseq = importr('DESeq2') 15 | from rpy2.robjects.conversion import localconverter 16 | import rpy2.robjects as ro 17 | import sys 18 | ''' 19 | Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 20 | ''' 21 | 22 | to_dataframe = robjects.r('function(x) data.frame(x)') 23 | 24 | 25 | class pyDESeq2: 26 | ''' 27 | DESeq2 object through rpy2 28 | input: 29 | count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id 30 | example: 31 | id sampleA sampleB 32 | geneA 5 1 33 | geneB 4 5 34 | geneC 1 2 35 | design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames 36 | treatment 37 | sampleA1 A 38 | sampleA2 A 39 | sampleB1 B 40 | sampleB2 B 41 | design_formula: see DESeq2 manual, example: "~ treatment"" 42 | gene_column: column name of gene id columns, exmplae "id" 43 | ''' 44 | 45 | def __init__(self, count_matrix, design_matrix, design_formula, gene_column='gene_id'): 46 | print("you need to have R installed with the DESeq2 library installed") 47 | try: 48 | assert gene_column == count_matrix.columns[0], 'no $gene_column name in 1st column\'s name' 49 | gene_id = count_matrix[gene_column] 50 | except AttributeError: 51 | sys.exit('Wrong Pandas dataframe?') 52 | print(rpy2.__version__) 53 | self.deseq_result = None 54 | self.resLFC = None 55 | self.comparison = None 56 | self.normalized_count_matrix = None 57 | self.gene_column = gene_column 58 | self.gene_id = count_matrix[self.gene_column] 59 | with localconverter(ro.default_converter + pandas2ri.converter): 60 | self.count_matrix = 
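# A hedged sketch of how this wrapper can be driven (needs R with DESeq2
# installed plus rpy2); the counts and design below are toy values and the
# gene id column must come first.
def _example_run_deseq2():
    import pandas as pd
    counts = pd.DataFrame(
        {
            "gene_id": ["geneA", "geneB", "geneC"],
            "ctrl_1": [5, 4, 1],
            "ctrl_2": [6, 3, 2],
            "treat_1": [20, 4, 1],
            "treat_2": [25, 5, 0],
        }
    )
    design = pd.DataFrame(
        {"treatment": [False, False, True, True]},
        index=["ctrl_1", "ctrl_2", "treat_1", "treat_2"],
    )
    dds = pyDESeq2(counts, design, "~ treatment", gene_column="gene_id")
    dds.run_deseq()
    dds.get_deseq_result()
    return dds.deseq_result  # pandas dataframe with log2FoldChange, pvalue, padj and gene_id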
pandas2ri.py2rpy(count_matrix.drop(gene_column, axis=1).astype(int)) 61 | self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool)) 62 | self.design_formula = Formula(design_formula) 63 | self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix, 64 | colData=self.design_matrix, 65 | design=self.design_formula) 66 | 67 | def run_estimate_size_factors(self, **kwargs): # OPTIONAL 68 | """ 69 | args: 70 | geoMeans: cond*gene matrix 71 | """ 72 | self.dds = deseq.estimateSizeFactors_DESeqDataSet(self.dds, **kwargs) 73 | 74 | def run_deseq(self, **kwargs): 75 | self.dds = deseq.DESeq(self.dds, **kwargs) 76 | 77 | def getSizeFactors(self): 78 | return deseq.sizeFactors_DESeqDataSet(self.dds) 79 | 80 | def setSizeFactors(self, factors): 81 | val = self.dds.do_slot('colData').do_slot('listData') 82 | val[2] = ro.vectors.FloatVector(np.array(factors)) 83 | self.dds.do_slot('colData').do_slot_assign('listData', val) 84 | 85 | def get_deseq_result(self, **kwargs): 86 | 87 | self.comparison = deseq.resultsNames(self.dds) 88 | 89 | self.deseq_result = deseq.results(self.dds, **kwargs) 90 | self.deseq_result = to_dataframe(self.deseq_result) 91 | with localconverter(ro.default_converter + pandas2ri.converter): 92 | self.deseq_result = ro.conversion.rpy2py(self.deseq_result) # back to pandas dataframe 93 | self.deseq_result[self.gene_column] = self.gene_id.values 94 | -------------------------------------------------------------------------------- /genepy/rna/ssGSEA.R: -------------------------------------------------------------------------------- 1 | args<-commandArgs(TRUE) 2 | 3 | countfile <- args[1]; 4 | gmtfile <- args[2]; 5 | method <- args[3] 6 | 7 | library(GSEABase) 8 | library(GSVA) 9 | counts <- read.csv(countfile, row.names=1) 10 | mat <- data.matrix(counts, rownames.force = T) 11 | colnames(mat) <- colnames(counts) 12 | gsc_obj <- GSEABase::getGmt(gmtfile, 13 | collectionType = GSEABase::BroadCollection(), 14 | geneIdType = GSEABase::EntrezIdentifier()) 15 | gsea <- GSVA::gsva(mat, gsc_obj, method = method) 16 | write.table(gsea, file = "/tmp/res_genepy_ssGSEA.tsv", sep = '\t', quote = F) 17 | -------------------------------------------------------------------------------- /genepy/sequencing/README.md: -------------------------------------------------------------------------------- 1 | # Sequencing 2 | 3 | A set of function to help work with sequencing data (bed files, bam files, fastq files etc...) 
4 | 5 | ## Contains 6 | 7 | - fromGTF2BED: transforms a GTF file to a BED file, only works for some GTFs for now 8 | getBamDate: parses a bam file header to try to compute when it was generated (as best as it can, if it has had many modification done to it across a long span of time, you will receive the average of that) 9 | - getBamDate 10 | - indexBams 11 | - dropWeirdChromosomes 12 | - extractPairedSingleEndFrom 13 | - findReplicates 14 | - singleEnd 15 | - pairedEnd 16 | - mergeBams 17 | 18 | ## Other very recommended tools 19 | 20 | _I am not building anything that overlaps with these tools_ 21 | 22 | - Bedtools 23 | - samtools 24 | - pyBedtools 25 | - pysam -------------------------------------------------------------------------------- /genepy/sequencing/__init__.py: -------------------------------------------------------------------------------- 1 | # Jeremie Kalfon 2 | # for BroadInsitute 3 | # in 2019 4 | 5 | from __future__ import print_function 6 | from multiprocessing.sharedctypes import Value 7 | import os 8 | import signal 9 | import re 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | from genepy.google import gcp 15 | from genepy.utils import helper as h 16 | from tqdm import tqdm 17 | 18 | size = {"GRCh37": 2864785220, "GRCh38": 2913022398} 19 | 20 | cmaps = [ 21 | "Greys", 22 | "Purples", 23 | "Blues", 24 | "Greens", 25 | "Oranges", 26 | "Reds", 27 | "YlOrBr", 28 | "YlOrRd", 29 | "OrRd", 30 | "PuRd", 31 | "RdPu", 32 | "BuPu", 33 | "GnBu", 34 | "PuBu", 35 | "YlGnBu", 36 | "PuBuGn", 37 | "BuGn", 38 | "YlGn", 39 | ] 40 | 41 | chroms = { 42 | "chr1", 43 | "chr10", 44 | "chr11", 45 | "chr12", 46 | "chr13", 47 | "chr14", 48 | "chr15", 49 | "chr16", 50 | "chr17", 51 | "chr18", 52 | "chr19", 53 | "chr2", 54 | "chr20", 55 | "chr21", 56 | "chr22", 57 | "chr3", 58 | "chr4", 59 | "chr5", 60 | "chr6", 61 | "chr7", 62 | "chr8", 63 | "chr9", 64 | "chrX", 65 | "chrY", 66 | "1", 67 | "10", 68 | "11", 69 | "12", 70 | "13", 71 | "14", 72 | "15", 73 | "16", 74 | "17", 75 | "18", 76 | "19", 77 | "2", 78 | "20", 79 | "21", 80 | "22", 81 | "3", 82 | "4", 83 | "5", 84 | "6", 85 | "7", 86 | "8", 87 | "9", 88 | "X", 89 | "Y", 90 | } 91 | 92 | 93 | def fromGTF2BED(gtfname, bedname, gtftype="geneAnnot"): 94 | """ 95 | transforms a gtf file into a bed file 96 | 97 | Args: 98 | ---- 99 | gtfname: filepath to gtf file 100 | bedname: filepath to beddfile 101 | gtftype: only geneAnnot for now 102 | 103 | Returns: 104 | -------- 105 | newbed: the bedfile as a pandas.df 106 | 107 | """ 108 | if gtftype == "geneAnnot": 109 | gtf = pd.read_csv( 110 | gtfname, 111 | sep="\t", 112 | header=0, 113 | names=[ 114 | "chr", 115 | "val", 116 | "type", 117 | "start", 118 | "stop", 119 | "dot", 120 | "strand", 121 | "loc", 122 | "name", 123 | ], 124 | ) 125 | gtf["name"] = [ 126 | i.split('gene_id "')[-1].split('"; trans')[0] for i in gtf["name"] 127 | ] 128 | prevname = "" 129 | newbed = {"chr": [], "start": [], "end": [], "gene": []} 130 | for i, val in gtf.iterrows(): 131 | h.showcount(i, len(gtf)) 132 | if val["name"] == prevname: 133 | newbed["end"][-1] = val["stop"] 134 | else: 135 | newbed["chr"].append(val["chr"]) 136 | newbed["start"].append(val["start"]) 137 | newbed["end"].append(val["stop"]) 138 | newbed["gene"].append(val["name"]) 139 | prevname = val["name"] 140 | newbed = pd.DataFrame(newbed) 141 | newbed = newbed[~newbed.chr.str.contains("_fix")] 142 | newbed.to_csv(bedname + ".bed", sep="\t", index=None) 143 | newbed.to_csv(bedname + "_genes.bed", sep="\t", index=None) 144 | return 
newbed 145 | 146 | 147 | def getBamDate(bams, split="-", order="des", unknown="U"): 148 | """ 149 | from bam files (could be in a google bucket) returns their likely sequencing date if available in the header 150 | 151 | Args: 152 | ----- 153 | bams: the bams file|bucket paths 154 | split: the splitter in the output date 155 | unknown: maybe the some dates can't be found the program will output unknown for them 156 | order: if 'asc', do d,m,y else do y,m,d 157 | 158 | Returns: 159 | ------- 160 | a list of likely dates or [unknown]s 161 | """ 162 | DTs = [] 163 | for i, bam in enumerate(tqdm(bams)): 164 | data = os.popen( 165 | "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`\ 166 | && samtools view -H " 167 | + bam 168 | + ' | grep "^@RG"' 169 | ) 170 | if data == signal.SIGINT: 171 | print("Awakened") 172 | break 173 | else: 174 | res = data.read() 175 | dt = re.findall("(?<=\tDT:).+?\t", res) 176 | if len(dt) > 1: 177 | arr = np.array(dt[0].split("T")[0].split(split)).astype(int) 178 | for val in dt[1:]: 179 | arr = np.vstack( 180 | (arr, np.array(val.split("T")[0].split(split)).astype(int)) 181 | ) 182 | arr = arr.T 183 | i = ( 184 | arr[0] * 365 + arr[1] * 31 + arr[2] 185 | if order == "asc" 186 | else arr[2] * 365 + arr[1] * 31 + arr[0] 187 | ) 188 | DTs.append(dt[np.argsort(i)[0]].split("T")[0]) 189 | elif len(dt) == 1: 190 | DTs.append(dt[0].split("T")[0]) 191 | else: 192 | DTs.append(unknown) 193 | return DTs 194 | 195 | 196 | async def indexBams(bams=None, bucketpath=None, cores=4): 197 | """ 198 | given a bucket path, will index all .bam files without an associated index and return their paths 199 | """ 200 | if bams is None: 201 | if bucketpath is None: 202 | raise ValueError("need one of bams or bucketpath") 203 | files = gcp.lsFiles([bucketpath]) 204 | bams = [val for val in files if ".bam" in val[-4:]] 205 | unindexed = [ 206 | val 207 | for val in bams 208 | if val[:-4] + ".bai" not in files and val[:4] + ".bam.bai" not in files 209 | ] 210 | print("found " + str(len(unindexed)) + " files to reindex") 211 | else: 212 | unindexed = bams 213 | h.parrun( 214 | [ 215 | "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools index " 216 | + val 217 | for val in unindexed 218 | ], 219 | cores, 220 | ) 221 | return {val: val[:-4] + ".bam.bai" for val in unindexed} 222 | 223 | 224 | def dropWeirdChromosomes(bedfile, keep=[], skip=0): 225 | """ 226 | given a bedfile path, removes chromosomes that are not one of chroms 227 | 228 | Args: 229 | ---- 230 | bedfile: str the filepath to the bedfile 231 | keep: list[str] of additional chromosomes to keep 232 | """ 233 | if skip >= 20: 234 | raise ValueError("too many header lines!") 235 | try: 236 | bed = pd.read_csv(bedfile, sep="\t", header=None, skiprows=skip) 237 | except ParserError: 238 | dropWeirdChromosomes(bedfile, keep, skip + 1) 239 | return 240 | except EmptyDataError: 241 | print("empty bed") 242 | return 243 | initlen = len(bed) 244 | if initlen == 0: 245 | print("empty bed") 246 | return 247 | bed = bed[bed[0].isin(chroms | set(keep))] 248 | if len(bed) < skip and skip > 5: 249 | raise ValueError("too many header lines!") 250 | print("found " + str(skip) + " header line... 
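# Example calls for fromGTF2BED and getBamDate above; the file paths are
# placeholders, and getBamDate shells out to gcloud and samtools, so both
# must be installed and authenticated when the bams sit in a bucket.
def _example_annotation_and_dates():
    bed = fromGTF2BED("gencode.v38.annotation.gtf", "gencode.v38", gtftype="geneAnnot")
    dates = getBamDate(
        ["gs://my-bucket/sample1.bam", "gs://my-bucket/sample2.bam"], order="des"
    )
    return bed, dates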
removing") 251 | if len(bed) != initlen: 252 | print("removed " + str(initlen - len(bed)) + " lines") 253 | bed.to_csv(bedfile, sep="\t", header=None, index=None) 254 | 255 | 256 | def extractPairedSingleEndFrom(folder, sep="-", namepos=2): 257 | """ 258 | given a folder, find fastq files and sorts paired and single end based on the R1/R2 patterns 259 | 260 | Args: 261 | ----- 262 | folder: the folder where the fastqs are 263 | sep: the separator in filename 264 | namepos: the location of the name in this separated list of name from filepath 265 | 266 | Returns: 267 | ------- 268 | list of filepath to single end files 269 | df with R1 and R2 filepath 270 | """ 271 | single = [] 272 | paired = {} 273 | for val in os.listdir(folder): 274 | if ".fastq" in val or ".fq" in val: 275 | if "R1" in val: 276 | name = val.split(sep)[namepos] 277 | paired[name] = {"R1": val} 278 | elif "R2" in val: 279 | name = val.split(sep)[namepos] 280 | paired[name].update({"R2": val}) 281 | else: 282 | single.append(val) 283 | return single, pd.DataFrame(paired) 284 | 285 | 286 | def findReplicatesBams(folder, sep="-", namings="-r([0-9])", namepos=2): 287 | """ 288 | creates a dict of name and replicate files given a regexp namging scheme 289 | """ 290 | rep = {} 291 | for val in os.listdir(folder): 292 | if val[-4:] == ".bam": 293 | match = re.search(namings, val) 294 | if match: 295 | name = val.split(sep)[namepos] 296 | if name in rep: 297 | rep[name].append(val) 298 | else: 299 | rep[name] = [val] 300 | 301 | return rep 302 | 303 | 304 | def singleEnd( 305 | singlend, 306 | folder="data/seqs/", 307 | numthreads=8, 308 | peaksFolder="peaks/", 309 | ismapped=False, 310 | mappedFolder="mapped/", 311 | refFolder="data/reference/index", 312 | ): 313 | """ 314 | run the singleEnd pipeline 315 | for alignment etc, one can use pysam ready made implementation of samtools 316 | """ 317 | print( 318 | "you need to have bowtie2 installed: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml" 319 | ) 320 | for val in singlend: 321 | out1 = folder + mappedFolder + val.split(".")[0] + ".mapped.sam" 322 | if not ismapped: 323 | in1 = folder + val 324 | os.system( 325 | "bowtie2 -x " 326 | + refFolder 327 | + " --threads " 328 | + str(numthreads) 329 | + " -t -k 1 --very-sensitive -U " 330 | + in1 331 | + " -S " 332 | + out1 333 | ) 334 | out2 = folder + peaksFolder + val.split(".")[0] 335 | print(out1) 336 | os.system("macs2 callpeak -f SAM -t " + out1 + " --outdir " + out2) 337 | # it can take many TB so better delete 338 | 339 | 340 | def pairedEnd( 341 | pairedend, 342 | folder="", 343 | numthreads=8, 344 | peaksFolder="peaks/", 345 | ismapped=False, 346 | mappedFolder="mapped/", 347 | refFolder="data/reference/index", 348 | ): 349 | """ 350 | # run the paired end pipeline 351 | """ 352 | print( 353 | "you need to have bowtie2 installed: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml" 354 | ) 355 | for _, val in pairedend.items(): 356 | out1 = folder + mappedFolder + val[0].split(".")[0] + ".mapped.sam" 357 | in1 = folder + val[0] 358 | in2 = folder + val[1] 359 | os.system( 360 | "bowtie2 -x " 361 | + refFolder 362 | + " --threads " 363 | + str(numthreads) 364 | + " -t -k 1 \ 365 | --very-sensitive -1 " 366 | + in1 367 | + " -2 " 368 | + in2 369 | + " - S " 370 | + out1 371 | ) 372 | out2 = folder + peaksFolder + val[0].split(".")[0] 373 | print(out1) 374 | changefrom = out1 375 | changeto = out1[:-4] + ".bam" 376 | os.system("samtools view -b " + changefrom + " -o " + changeto) 377 | os.system( 378 | "macs2 
callpeak --format 'BAMPE' --treatment " 379 | + changeto 380 | + " --outdir " 381 | + out2 382 | ) 383 | # it can take many TB so better delete 384 | 385 | 386 | async def mergeBams(rep): 387 | """ 388 | uses samtools to merge a set of replicates considered into one file 389 | """ 390 | in1 = "" 391 | for i, val in rep.items(): 392 | out1 = i + ".merged.bam" 393 | for bam in val: 394 | in1 += " " + bam 395 | os.system("samtools merge " + out1 + in1) 396 | 397 | 398 | def compare_gcloud_vcfs_overlap_methods(vcfs_met1_path, vcfs_met2_path): 399 | for i, j in zip(vcfs_met1_path, vcfs_met2_path): 400 | compare_gcloud_vcf_overlap(i, j) 401 | 402 | 403 | def compare_gcloud_vcf_overlap(vcf1, vcf2, cols=["chr", "start", ".", "ref", "alt"]): 404 | import subprocess 405 | 406 | name1 = vcf1.split("/")[-1].split(".")[0] + "_1" + ".tsv" 407 | cmd1 = "gsutil cat " + vcf1 + " | gunzip | cut -f -5 > " + name1 408 | name2 = vcf2.split("/")[-1].split(".")[0] + "_2" + ".tsv" 409 | cmd2 = "gsutil cat " + vcf2 + " | gunzip | cut -f -5 > " + name2 410 | try: 411 | subprocess.run( 412 | cmd1, 413 | shell=True, 414 | check=True, 415 | stdout=subprocess.PIPE, 416 | stderr=subprocess.PIPE, 417 | ) 418 | subprocess.run( 419 | cmd2, 420 | shell=True, 421 | check=True, 422 | stdout=subprocess.PIPE, 423 | stderr=subprocess.PIPE, 424 | ) 425 | except subprocess.CalledProcessError as e: 426 | print(e.stderr) 427 | raise e 428 | val2 = pd.read_csv(name2, sep="\t", comment="#", names=cols) 429 | val1 = pd.read_csv(name1, sep="\t", comment="#", names=cols) 430 | val1["loc"] = ( 431 | val1["chr"].astype(str) 432 | + ":" 433 | + val1["start"].astype(str) 434 | + ":" 435 | + val1["alt"].astype(str) 436 | ) 437 | val2["loc"] = ( 438 | val2["chr"].astype(str) 439 | + ":" 440 | + val2["start"].astype(str) 441 | + ":" 442 | + val2["alt"].astype(str) 443 | ) 444 | print("length of vcf1:" + str(len(val1))) 445 | print("length of vcf2:" + str(len(val2))) 446 | print("overlap: " + str(len(set(val1["loc"]).intersection(val2["loc"])))) 447 | return val1, val2 448 | -------------------------------------------------------------------------------- /genepy/terra/README.md: -------------------------------------------------------------------------------- 1 | # terra 2 | 3 | a file containing a set of functions that uses [dalmatian](github.com/broadinstitute/dalmatian) to interact with the [GCP](https://cloud.google.com/storage/docs/gsutil) powered genomics HPC platform: [Terra](www.terra.bio). 4 | They contain a list of additional functions to do more than what is available in dalmatian 5 | 6 | The goal is to improve reproducibility and productionalization of pipelines working with Terra. 7 | 8 | #### Available functions: 9 | 10 | - createManySubmissions: allows you to create many terra jobs in parallel 11 | - waitForSubmission: an await function on Terra jobs 12 | - removeSamples: a function that removes samples on a workspace and takes care of more edge cases (linked sample sets and pair sets..). 13 | - uploadFromFolder: uploads fastq samples from a folder into a Terra workspace with the right namings etc.. 14 | - updateAllSampleSet: updates a sample set with all samples 15 | - addToSampleSet: updates a sample set with some new samples 16 | - addToPairSet: updates a pair set with some new pairs 17 | - saveOmicsOutput: *WIP* 18 | - changeGSlocation: Function to move data around from one workspace to a bucket or to another workspace. 
448 | -------------------------------------------------------------------------------- /genepy/terra/README.md: -------------------------------------------------------------------------------- 1 | # terra 2 | 3 | a set of functions that use [dalmatian](https://github.com/broadinstitute/dalmatian) to interact with the [GCP](https://cloud.google.com/storage/docs/gsutil)-powered genomics platform [Terra](https://www.terra.bio). 4 | They provide additional functionality beyond what is available in dalmatian. 5 | 6 | The goal is to improve reproducibility and productionization of pipelines working with Terra. 7 | 8 | #### Available functions: 9 | 10 | - createManySubmissions: allows you to create many Terra jobs in parallel 11 | - waitForSubmission: an await function on Terra jobs 12 | - removeSamples: removes samples from a workspace and takes care of more edge cases (linked sample sets, pair sets, etc.) 13 | - uploadFromFolder: uploads fastq samples from a folder into a Terra workspace with the right naming, etc. 14 | - updateAllSampleSet: updates a sample set with all samples 15 | - addToSampleSet: updates a sample set with some new samples 16 | - addToPairSet: updates a pair set with some new pairs 17 | - saveOmicsOutput: *WIP* 18 | - changeGSlocation: moves data around from one workspace to a bucket or to another workspace; can also work on dataframes containing lists of paths 19 | - renametsvs: *WIP* 20 | - findBackErasedDuplicaBamteFromTerraBucket: recovers bam files that were erased in GCP while their bai files are still present, when the bams are stored elsewhere and their locations are listed in a Terra workspace; finds them back by matching bai sizes and copies them back to their original locations 21 | - shareTerraBams: shares some files from GCP with a set of users, using Terra as the metadata repo. Only works with files that are listed on a Terra workspace tsv but actually point to a regular Google bucket and not a Terra bucket 22 | - shareCCLEbams: same as shareTerraBams but extended to work with CCLE bams from the CCLE sample tracker 23 | - saveConfigs: saves everything about a workspace into a csv and a json file 24 | - cleanWorkspace: removes all processing folders in a Terra workspace easily 25 | - changeToBucket: moves all bam/bai files in a sampleList from Terra to another gs bucket and renames them in the sample list 26 | - delete_job: removes files generated by a job on Terra 27 | - removeFromFailedWorkflows: lists all files from all jobs that have failed and deletes them 28 | - deleteHeavyFiles: deletes all files above a certain size in a workspace (whether used or unused) 29 | - findFilesInWorkspaces: given all your Terra workspaces, finds a given gs filename 30 | 31 | ## Highly recommended 32 | 33 | *This package won't contain anything that overlaps with these tools and may use them for what it does.* 34 | - firecloud-dalmatian (python) 35 | - gsutil 36 | - nextflow (better than terra) 37 | -------------------------------------------------------------------------------- /genepy/terra/map_terra_workflow.py: -------------------------------------------------------------------------------- 1 | # 1. make sure dalmatian is installed 2 | # pip install firecloud-dalmatian 3 | # 2. 
make sure graphviz is installed (this script will run "dot" to generate images) 4 | # brew install graphviz 5 | 6 | import subprocess 7 | import requests 8 | import pandas as pd 9 | import dalmatian 10 | import subprocess 11 | 12 | 13 | def resolve_dot_path(root_entity_type, name): 14 | name = name.replace(" ", "") 15 | parts = name.split(".") 16 | assert parts[0] == "this" 17 | del parts[0] 18 | cur = root_entity_type 19 | while len(parts) > 1: 20 | next_part = parts[0] 21 | if cur == "sample_set" and next_part == "samples": 22 | cur = "sample" 23 | elif cur == "pair" and next_part == "case_sample": 24 | cur = "sample" 25 | elif cur == "pair" and next_part == "control_sample": 26 | cur = "sample" 27 | elif cur == "sample" and next_part == "participant": 28 | cur = "participant" 29 | else: 30 | raise Exception(f"Unknown case: {cur} -> {next_part} (root_entity_type={root_entity_type} name={name})") 31 | del parts[0] 32 | return cur+"."+parts[0] 33 | 34 | def extract_config_summary(workspace_name, workflows=None): 35 | wm = dalmatian.WorkspaceManager(workspace_name) 36 | configs = wm.get_configs() 37 | 38 | config_summaries = [] 39 | for rec in configs.to_records(): 40 | cfgname = rec['namespace']+"/"+rec['name'] 41 | if workflows is not None: 42 | if cfgname not in workflows: 43 | continue 44 | config = wm.get_config(cfgname) 45 | config['inputs'] = {k:v.strip() for k,v in config['inputs'].items()} 46 | config['outputs'] = {k:v.strip() for k,v in config['outputs'].items()} 47 | inputs=[resolve_dot_path(config['rootEntityType'], x) for x in config['inputs'].values() if x.startswith("this.")] 48 | outputs=[resolve_dot_path(config['rootEntityType'], x) for x in config['outputs'].values() if x.startswith("this.")] 49 | config_summaries.append(dict(inputs=inputs, outputs=outputs, entity_type=rec['rootEntityType'], name=cfgname)) 50 | return config_summaries 51 | 52 | 53 | def write_dependency_graph_image(filename, config_summaries): 54 | with open("/tmp/sample.dot", "wt") as fd: 55 | node_names = {} 56 | def nn(name, is_var): 57 | if name in node_names: 58 | return node_names[name] 59 | node_name = "n{}".format(len(node_names)) 60 | node_names[name] = node_name 61 | fd.write("{} [label=\"{}\" {}];\n".format(node_name, name, {True: "shape=oval", False: "shape=box fillcolor=yellow style=filled"}[is_var])) 62 | return node_names[name] 63 | 64 | fd.write("digraph { rankdir=LR;\n") 65 | for config in config_summaries: 66 | for name in config['inputs']: 67 | fd.write("{} -> {};\n".format(nn(name, True), nn(config['name'], False))) 68 | 69 | for name in config['outputs']: 70 | fd.write("{} -> {};\n".format(nn(config['name'], False), nn(name, True))) 71 | fd.write("}\n") 72 | 73 | 74 | subprocess.check_call(["dot", "/tmp/sample.dot", "-Tpng", "-o", filename]) 75 | 76 | def write_config_summary_table(filename, config_summaries): 77 | from collections import defaultdict 78 | variables = defaultdict(lambda: dict(used_by=[], produced_by=[])) 79 | for config in config_summaries: 80 | for name in config['inputs']: 81 | variables[name]['used_by'].append(config['name']) 82 | 83 | for name in config['outputs']: 84 | variables[name]['produced_by'].append(config['name']) 85 | 86 | var_col = [] 87 | used_by = [] 88 | produced_by = [] 89 | inputs = [] 90 | for k, v in variables.items(): 91 | if len(v["produced_by"]) == 0: 92 | inputs.append(k) 93 | var_col.append(k) 94 | used_by.append(", ".join(v["used_by"])) 95 | produced_by.append(", ".join(v["produced_by"])) 96 | df = pd.DataFrame(dict(variable=var_col, 
used_by=used_by, produced_by=produced_by)) 97 | df.to_csv(filename) 98 | 99 | def map_workspace_diagram(workspace_name, output_path='terra-workflows', workflows=None): 100 | """ 101 | -- adapted from scripts written by Philip Montgomery -- 102 | this function creates a graph of the workflows within a Terra workspace 103 | 104 | inputs: 105 | workspace_name (str): name of the workspace 106 | output_path (str): path where outputs will be saved 107 | workflows (list[str]): list of workflows to consider in the graph. If None is provided (default), 108 | all the workflows in the workspace will be used. 109 | 110 | example code: 111 | workflows = ['stewart/pipette_wgs_SV', 'stewart/manta', 'stewart/SvABA_xtramem', 112 | 'stewart/svaba_snowmanvcf2dRangerForBP', 'stewart/mantavcf2dRangerForBP', 113 | 'stewart/extract_dRanger_intermediates','stewart/pcawg_snowmanvcf2dRangerForBP', 114 | 'stewart/SV_cluster_forBP', 'stewart/breakpointer', 'stewart/Breakpointer_fix_sample', 115 | 'stewart/REBC_SV_consensus_filter_v3'] 116 | 117 | workspace_name = 'broad-firecloud-ccle/REBC_methods_only-tmp' 118 | print(workspace_name) 119 | config = mtw.map_workspace_diagram(workspace_name, workflows=workflows) 120 | """ 121 | configs = extract_config_summary(workspace_name, workflows=workflows) 122 | write_dependency_graph_image(output_path+'/'+workspace_name.replace("/", " ")+".png", configs) 123 | write_config_summary_table(output_path+'/'+workspace_name.replace("/", " ")+".csv", configs) 124 | return configs 125 | -------------------------------------------------------------------------------- /genepy/utils/Datanalytics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def getDFinfo(df): 5 | val = df 6 | print("sums over cell lines! ------mean, var, totmin, meanmin, idxmin, totmax, meanmax, idxmax") 7 | print(val.sum(1).mean(), val.sum(1).var(), val.sum(1).min(), val.mean(1).min(), val.sum(1).idxmin(), val.sum(1).max(), val.mean(1).max(), val.sum(1).idxmax()) 8 | print("sums over features! ------mean, var, totmin, meanmin, idxmin, totmax, meanmax, idxmax") 9 | print(val.sum(0).mean(), val.sum(0).var(), val.sum(0).min(), val.mean(0).min(), val.sum(0).idxmin(), val.sum(0).max(), val.mean(0).max(), val.sum(0).idxmax()) 10 | print("nans!") 11 | print(np.count_nonzero(np.isnan(val))) 12 | 13 | 14 | def compare(df1, df2): 15 | df = pd.concat([df1, df2]) 16 | df = df.reset_index(drop=True) 17 | df_gpby = df.groupby(list(df.columns)) 18 | idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1] 19 | df = df.reindex(idx) 20 | return df 21 | 22 | 23 | 
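# --- Illustrative usage sketch (not part of the original module) ---
# The dataframes below are made up; any numeric pandas DataFrame works.
#
#   a = pd.DataFrame({"g1": [1.0, 2.0], "g2": [3.0, 4.0]}, index=["lineA", "lineB"])
#   b = pd.DataFrame({"g1": [1.0, 2.0], "g2": [3.0, 5.0]}, index=["lineA", "lineB"])
#   getDFinfo(a)          # prints row/column summary statistics and the NaN count
#   diff = compare(a, b)  # keeps only the rows that appear in exactly one of the two dataframes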
-------------------------------------------------------------------------------- /genepy/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | 1. helper functions to save data, generate random strings, run tasks in parallel, etc. 4 | 2. a set of plotting tools based on [matplotlib](https://matplotlib.org) and [bokeh](https://bokeh.org) 5 | 6 | ## Contains: 7 | 8 | _in ./helper.py_ 9 | 10 | - fileToList: converts a txt with a list of values to a python list 11 | - listToFile: converts a list of values to a txt 12 | - dictToFile: converts a dict to a json 13 | - fileToDict: converts a json to a dict 14 | - batchMove: moves a lot of files in batch (can provide different locations) 15 | - batchRename: renames a bunch of files in batch 16 | - createFoldersFor: makes the required folders for a given filepath 17 | - grouped: to use in a for loop to group values in batches 18 | - overlap: given two tuples, returns the overlap 19 | - union: given two tuples, returns the union 20 | - nans: gets nans from a pandas df 21 | - randomString: generates a random string for naming 22 | - parrun: runs a list of commands in parallel 23 | - askif: asks the user a question and returns the y/n answer 24 | - inttodate: converts an int to a string date 25 | - datetoint: converts a date to an int 26 | - showcount: pretty print of i/size%, to put in a for loop 27 | - combin: outputs the number of combinations of n objects taken k at a time 28 | - dups: shows the duplicates in a list 29 | - makeCombinations: produces the probability of X events happening at the same time; will compute it given binomial probabilities of each event occurring and the number of trials 30 | - closest: returns the index of the value closest to K in a list 31 | - compareDfs: compares df1 to df2. Shows column differences, index differences, NaN & 0 differences 32 | 33 | _in ./plot.py_ 34 | 35 | - scatter: makes a hoverable/zoomable bokeh scatter plot 36 | - bigScatter: 37 | - CNV_Map: makes a hoverable Copy Number plot using bokeh 38 | - volcano: makes a searchable volcano plot for a differential expression experiment using bokeh 39 | - correlationMatrix: makes a hoverable bokeh correlation matrix; works with annotations, pvalues, clusters, etc. 40 | - venn: makes a venn diagram from a list of sets 41 | - mergeImages: merges multiple pngs/pdfs together into one 42 | - addTextToImage: adds text to an image at a specific location 43 | - SOMPlot: a tool that takes simpSOM's package output (which produces self-organizing maps) and plots it in an interactive fashion 44 | 45 | ## Other necessary tools 46 | 47 | _I am not creating anything that overlaps with these / I am using these tools_ 48 | 49 | - os (python) 50 | - subprocess (python) 51 | - seaborn (python) 52 | - bokeh (python) 53 | -------------------------------------------------------------------------------- /genepy/utils/RScript.R: -------------------------------------------------------------------------------- 1 | MofaRun <- function(valueList){ 2 | library(reticulate) 3 | MOFAobject <- createMOFAobject(valueList) 4 | DataOptions <- getDefaultDataOptions() 5 | ModelOptions <- getDefaultModelOptions(MOFAobject) 6 | TrainOptions <- getDefaultTrainOptions() 7 | ModelOptions$numFactors <- 200 8 | TrainOptions$DropFactorThreshold <- 0.02 9 | MOFAobject <- prepareMOFA( 10 | MOFAobject, 11 | DataOptions = DataOptions, 12 | ModelOptions = ModelOptions, 13 | TrainOptions = TrainOptions 14 | ) 15 | MOFAobject <- runMOFA(MOFAobject) 16 | return(MOFAobject) 17 | } 18 | 
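# --- Illustrative usage sketch (not part of the original script) ---
# MofaRun() passes valueList straight to MOFA's createMOFAobject(), so it is
# expected to be a named list of omics matrices (one view per entry); the
# object names below are made up.
#
#   valueList <- list(rna = rna_matrix, methylation = meth_matrix)
#   MOFAobject <- MofaRun(valueList)
#   # the trained MOFAobject can then be explored with MOFA's downstream plotting functions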
19 | # Function that converts the segmented data to be continuous (so chromosomes can be plotted in 1, 2, 3, 4... order) 20 | generate_chromosome_cutoffs_list <- function(cyto_band_file="data/hg38_cytoband.gz") { 21 | # Have to edit the chr values to 22 | chr_bp_cutoffs <- read_tsv(cyto_band_file, col_names = F) 23 | cutoffs <- chr_bp_cutoffs %>% 24 | group_by(X1) %>% 25 | dplyr::summarize(pos=max(X3)) %>% 26 | mutate(X1=gsub('chr', '', X1)) %$% 27 | setNames(pos, ifelse(X1 %in% seq(1,21), paste0('chr', as.integer(X1) + 1), ifelse(X1==22, 'chrX', ifelse(X1=='X', 'chrY', 'chrZ')))) 28 | 29 | cutoffs_final <- cutoffs[paste0('chr',c(seq(2, 22), 'X', 'Y'))] %>% cumsum() 30 | cutoffs_final['chr1'] = 0 31 | 32 | return(cutoffs_final) 33 | } -------------------------------------------------------------------------------- /genepy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/utils/__init__.py -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: genepy 2 | theme: readthedocs 3 | plugins: 4 | - search 5 | - mkdocstrings -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | # These requirements are for development and testing only, not for production. 2 | pytest 3 | coverage 4 | flake8 5 | black 6 | isort 7 | pytest-cov 8 | codecov 9 | mypy 10 | gitchangelog 11 | mkdocs 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh>=2.2 2 | colorcet 3 | firecloud_dalmatian>=0.0.17 4 | gseapy==0.9.18 5 | gsheets==0.4.1 6 | gspread==3.6.0 7 | matplotlib 8 | oauth2client>=4.1.3 9 | pandas 10 | pybedtools 11 | pyBigWig 12 | pysam 13 | pytest 14 | requests>=2.24.0 15 | scikit_learn 16 | scipy>=1.0.0 17 | seaborn 18 | statsmodels 19 | taigapy>=2.12 20 | venn 21 | biomart -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import sys 3 | import os 4 | import io 5 | import subprocess 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 5: 8 | raise ValueError("genepy is only compatible with Python 3.5 and above") 9 | if sys.version_info.minor < 8: 10 | import warnings 11 | 12 | warnings.warn("genepy may not function properly on Python < 3.8") 13 | 14 | os.system("git submodule init && git submodule sync") 15 | 16 | with open("README.md", "r") as f: 17 | long_description = f.read() 18 | 19 | print("trying to install R packages") 20 | try: 21 | subprocess.run( 22 | 'R -e \'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install(c("GSEABase", "erccdashboard", "GSVA", "DESeq2"));\'', 23 | shell=True, 24 | check=True, 25 | stdout=subprocess.PIPE, 26 | stderr=subprocess.PIPE, 27 | ) 28 | subprocess.run("pip install rpy2", shell=True, check=True) 29 | except Exception: 30 | print("R packages not 
installed") 31 | print("if it did not work. please install R or check your R installation") 32 | print( 33 | "once R is installed you need to install erccdashboard, GSEABase GSVA, DESeq2 to have access to all the functions" 34 | ) 35 | 36 | 37 | def read(*paths, **kwargs): 38 | """Read the contents of a text file safely. 39 | >>> read("genepy", "VERSION") 40 | '0.1.0' 41 | >>> read("README.md") 42 | ... 43 | """ 44 | 45 | content = "" 46 | with io.open( 47 | os.path.join(os.path.dirname(__file__), *paths), 48 | encoding=kwargs.get("encoding", "utf8"), 49 | ) as open_file: 50 | content = open_file.read().strip() 51 | return content 52 | 53 | 54 | def read_requirements(path): 55 | return [ 56 | line.strip() 57 | for line in read(path).split("\n") 58 | if not line.startswith(('"', "#", "-", "git+")) 59 | ] 60 | 61 | 62 | setup( 63 | name="Broad-genepy", 64 | version=read("genepy", "VERSION"), 65 | description="A useful module for any CompBio", 66 | long_description=long_description, 67 | long_description_content_type="text/markdown", 68 | author="Jeremie Kalfon", 69 | author_email="jkobject@gmail.com", 70 | url="https://github.com/BroadInstitute/genepy", 71 | packages=[ 72 | "genepy/cell_line_mapping-master/python/cell_line_mapper", 73 | "genepy/epigenetics", 74 | "genepy/mutations", 75 | "genepy/google", 76 | "genepy/sequencing/", 77 | "genepy/terra", 78 | "genepy/rna", 79 | "genepy/utils", 80 | ], 81 | package_data={"genepy": ["data/*"]}, 82 | python_requires=">=3.5", 83 | install_requires=read_requirements("requirements.txt"), 84 | classifiers=[ 85 | "Programming Language :: Python :: 3", 86 | "Intended Audience :: Science/Research", 87 | "Topic :: Scientific/Engineering :: Bio-Informatics", 88 | ], 89 | ) 90 | 91 | 92 | print( 93 | "You might want to install Bowtie2, samtools, bwa and R to be able to use all functions of this package:\n\ 94 | http://bowtie-bio.sourceforge.net/bowtie2/index.shtml\n\ 95 | http://www.htslib.org/\n\ 96 | https://github.com/lh3/bwa\n" 97 | ) 98 | 99 | print("Finished!") 100 | --------------------------------------------------------------------------------