├── .github ├── FUNDING.yml ├── release_message.sh └── workflows │ ├── main.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── .vscode └── settings.json ├── CONTRIBUTING.md ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── data ├── Annotations.RData └── variantFilter │ └── snp_indels_rescue_list.txt ├── docs ├── genepy.md └── index.md ├── documentation └── genome.jpg ├── examples └── rna_diff_expr.ipynb ├── genepy ├── VERSION ├── __init__.py ├── cell_line_mapping-master │ ├── .gitignore │ ├── .travis.yml │ ├── README.md │ ├── celllinemapr │ │ ├── .Rbuildignore │ │ ├── .celllinemapr.Rds │ │ ├── DESCRIPTION │ │ ├── NAMESPACE │ │ ├── R │ │ │ └── cell_line_mapping.R │ │ ├── SOP.Rmd │ │ ├── celllinemapr.Rproj │ │ ├── man │ │ │ ├── arxspan.to.ccle.Rd │ │ │ ├── ccle.to.arxspan.Rd │ │ │ └── ccle.to.latest.Rd │ │ └── naming.csv │ └── python │ │ ├── cell_line_mapper │ │ ├── __init__.py │ │ └── test_mapper.py │ │ ├── name_mapping.csv │ │ ├── requirements.txt │ │ ├── setup.py │ │ └── test-data.csv ├── epigenetics │ ├── CREME.md │ ├── CREME.py │ ├── README.md │ ├── __init__.py │ ├── chipseq.py │ ├── docsCREME │ │ ├── MED1_before_pairplot.png │ │ ├── MED1_before_venn_venn.png │ │ ├── MED1_new_found_peaks_kdeplot.png │ │ └── igv-app-MED1-zoom.png │ └── plot.py ├── google │ ├── README.md │ ├── __init__.py │ ├── gcp.py │ ├── good-retention.json │ ├── google_sheet.py │ └── gsheet_upload.py ├── imaging │ └── fish.py ├── mutations │ ├── README.md │ └── __init__.py ├── rna │ ├── README.md │ ├── __init__.py │ ├── pyDESeq2.py │ └── ssGSEA.R ├── sequencing │ ├── README.md │ └── __init__.py ├── terra │ ├── README.md │ ├── __init__.py │ └── map_terra_workflow.py └── utils │ ├── Datanalytics.py │ ├── README.md │ ├── RScript.R │ ├── __init__.py │ ├── helper.py │ └── plot.py ├── mkdocs.yml ├── requirements-test.txt ├── requirements.txt ├── setup.cfg └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [rochacbruno] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/release_message.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | previous_tag=$(git tag --sort=-creatordate | sed -n 2p) 3 | git shortlog "${previous_tag}.." 
| sed 's/^./ &/' 4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | # Controls when the workflow will run 6 | on: 7 | # Triggers the workflow on push or pull request events but only for the main branch 8 | push: 9 | branches: [ main ] 10 | pull_request: 11 | branches: [ main ] 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | jobs: 17 | linter: 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | python-version: [3.9] 22 | os: [ubuntu-latest] 23 | runs-on: ${{ matrix.os }} 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install project 30 | run: make install 31 | - name: Run linter 32 | run: make lint 33 | 34 | tests_linux: 35 | needs: linter 36 | strategy: 37 | fail-fast: false 38 | matrix: 39 | python-version: [3.9] 40 | os: [ubuntu-latest] 41 | runs-on: ${{ matrix.os }} 42 | steps: 43 | - uses: actions/checkout@v2 44 | - uses: actions/setup-python@v2 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install project 48 | run: make install 49 | - name: Run tests 50 | run: make test 51 | - name: "Upload coverage to Codecov" 52 | uses: codecov/codecov-action@v1 53 | # with: 54 | # fail_ci_if_error: true 55 | 56 | tests_mac: 57 | needs: linter 58 | strategy: 59 | fail-fast: false 60 | matrix: 61 | python-version: [3.9] 62 | os: [macos-latest] 63 | runs-on: ${{ matrix.os }} 64 | steps: 65 | - uses: actions/checkout@v2 66 | - uses: actions/setup-python@v2 67 | with: 68 | python-version: ${{ matrix.python-version }} 69 | - name: Install project 70 | run: make install 71 | - name: Run tests 72 | run: make test 73 | 74 | tests_win: 75 | needs: linter 76 | strategy: 77 | fail-fast: false 78 | matrix: 79 | python-version: [3.9] 80 | os: [windows-latest] 81 | runs-on: ${{ matrix.os }} 82 | steps: 83 | - uses: actions/checkout@v2 84 | - uses: actions/setup-python@v2 85 | with: 86 | python-version: ${{ matrix.python-version }} 87 | - name: Install Pip 88 | run: pip install --user --upgrade pip 89 | - name: Install project 90 | run: pip install -e .[test] 91 | - name: run tests 92 | run: pytest -s -vvvv -l --tb=long tests 93 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/tags 6 | tags: 7 | - '*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | jobs: 13 | release: 14 | name: Create Release 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | with: 19 | # by default, it uses a depth of 1 20 | # this fetches all history so that we can read each commit 21 | fetch-depth: 0 22 | - name: Generate Changelog 23 | run: .github/release_message.sh > release_message.md 24 | - name: Release 25 | uses: softprops/action-gh-release@v1 26 | with: 27 | body_path: release_message.md 28 | 29 | deploy: 30 | needs: release 31 | runs-on: ubuntu-latest 32 | steps: 33 | - uses: actions/checkout@v1 34 | - name: Set up Python 35 | uses: actions/setup-python@v1 36 | with: 37 | python-version: '3.x' 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install setuptools wheel twine 42 | - name: Build and publish 43 | env: 44 | TWINE_USERNAME: jkobject 45 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 46 | run: | 47 | python setup.py sdist bdist_wheel 48 | twine upload dist/* 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/r,macos,python,sublimetext 2 | # Edit at https://www.gitignore.io/?templates=r,macos,python,sublimetext 3 | ### PERSO ### 4 | data/* 5 | 6 | ### macOS ### 7 | # General 8 | .DS_Store 9 | .AppleDouble 10 | .LSOverride 11 | .biomart.csv 12 | 13 | # Icon must end with two \r 14 | Icon 15 | 16 | # Thumbnails 17 | ._* 18 | temp/* 19 | 20 | # Files that might appear in the root of a volume 21 | .DocumentRevisions-V100 22 | .fseventsd 23 | .Spotlight-V100 24 | .TemporaryItems 25 | .Trashes 26 | .VolumeIcon.icns 27 | .com.apple.timemachine.donotpresent 28 | 29 | # Directories potentially created on remote AFP share 30 | .AppleDB 31 | .AppleDesktop 32 | Network Trash Folder 33 | Temporary Items 34 | .apdisk 35 | 36 | ### Python ### 37 | # Byte-compiled / optimized / DLL files 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | 42 | # C extensions 43 | *.so 44 | 45 | # Distribution / packaging 46 | .Python 47 | build/ 48 | develop-eggs/ 49 | dist/ 50 | downloads/ 51 | eggs/ 52 | .eggs/ 53 | lib/ 54 | lib64/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | wheels/ 59 | pip-wheel-metadata/ 60 | share/python-wheels/ 61 | *.egg-info/ 62 | .installed.cfg 63 | *.egg 64 | MANIFEST 65 | 66 | # PyInstaller 67 | # Usually these files are written by a python script from a template 68 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
69 | *.manifest 70 | *.spec 71 | 72 | # Installer logs 73 | pip-log.txt 74 | pip-delete-this-directory.txt 75 | 76 | # Unit test / coverage reports 77 | htmlcov/ 78 | .tox/ 79 | .nox/ 80 | .coverage 81 | .coverage.* 82 | .cache 83 | nosetests.xml 84 | coverage.xml 85 | *.cover 86 | .hypothesis/ 87 | .pytest_cache/ 88 | 89 | # Translations 90 | *.mo 91 | *.pot 92 | 93 | # Django stuff: 94 | *.log 95 | local_settings.py 96 | db.sqlite3 97 | 98 | # Flask stuff: 99 | instance/ 100 | .webassets-cache 101 | 102 | # Scrapy stuff: 103 | .scrapy 104 | 105 | # Sphinx documentation 106 | docs/_build/ 107 | 108 | # PyBuilder 109 | target/ 110 | 111 | # Jupyter Notebook 112 | .ipynb_checkpoints 113 | 114 | # IPython 115 | profile_default/ 116 | ipython_config.py 117 | 118 | # pyenv 119 | .python-version 120 | 121 | # pipenv 122 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 123 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 124 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 125 | # install all needed dependencies. 126 | #Pipfile.lock 127 | 128 | # celery beat schedule file 129 | celerybeat-schedule 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | ### R ### 161 | # History files 162 | .Rhistory 163 | .Rapp.history 164 | 165 | # Session Data files 166 | .RData 167 | 168 | # User-specific files 169 | .Ruserdata 170 | 171 | # Example code in package build process 172 | *-Ex.R 173 | 174 | # Output files from R CMD build 175 | /*.tar.gz 176 | 177 | # Output files from R CMD check 178 | /*.Rcheck/ 179 | 180 | # RStudio files 181 | .Rproj.user/ 182 | 183 | # produced vignettes 184 | vignettes/*.html 185 | vignettes/*.pdf 186 | 187 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 188 | .httr-oauth 189 | 190 | # knitr and R markdown default cache directories 191 | /*_cache/ 192 | /cache/ 193 | 194 | # Temporary files created by R markdown 195 | *.utf8.md 196 | *.knit.md 197 | 198 | ### R.Bookdown Stack ### 199 | # R package: bookdown caching files 200 | /*_files/ 201 | 202 | ### SublimeText ### 203 | # Cache files for Sublime Text 204 | *.tmlanguage.cache 205 | *.tmPreferences.cache 206 | *.stTheme.cache 207 | 208 | # Workspace files are user-specific 209 | *.sublime-workspace 210 | 211 | # Project files should be checked into the repository, unless a significant 212 | # proportion of contributors will probably not be using Sublime Text 213 | # *.sublime-project 214 | 215 | # SFTP configuration file 216 | sftp-config.json 217 | 218 | # Package control specific files 219 | Package Control.last-run 220 | Package Control.ca-list 221 | Package Control.ca-bundle 222 | Package Control.system-ca-bundle 223 | Package Control.cache/ 224 | Package Control.ca-certs/ 225 | Package Control.merged-ca-bundle 226 | Package Control.user-ca-bundle 227 | oscrypto-ca-bundle.crt 228 | bh_unicode_properties.cache 229 | 230 | # Sublime-github package stores a github token in this file 231 | # https://packagecontrol.io/packages/sublime-github 232 | 
GitHub.sublime-settings
233 |
234 | # tmp files
235 | tmp.py
236 |
237 | # End of https://www.gitignore.io/api/r,macos,python,sublimetext
238 | -------------------------------------------------------------------------------- /.gitmodules: --------------------------------------------------------------------------------
1 | [submodule "genepy/epigenetics/rose"]
2 | path = genepy/epigenetics/rose
3 | url = https://github.com/jkobject/rose.git
4 | -------------------------------------------------------------------------------- /.vscode/settings.json: --------------------------------------------------------------------------------
1 | {
2 | "python.linting.flake8Enabled": true,
3 | "python.linting.enabled": true,
4 | "python.formatting.provider": "black",
5 | "editor.tabSize": 4,
6 | "editor.detectIndentation": false,
7 | "editor.formatOnSave": true,
8 | "editor.formatOnPaste": true,
9 | "editor.formatOnType": false,
10 | "python.linting.flake8Args": [
11 | "--max-line-length=120",
12 | "--ignore=F403, E501, E226",
13 | ],
14 | "editor.autoClosingBrackets": true,
15 | "editor.autoClosingQuotes": true,
16 | "editor.autoSurround": true,
17 | "editor.autoIndent": "full",
18 | "editor.insertSpaces": true
19 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # How to develop on this project
2 |
3 | genepy welcomes contributions from the community.
4 |
5 | **You need PYTHON3!**
6 |
7 | These instructions are for Unix-like systems (Linux, macOS, BSD, etc.).
8 | ## Setting up your own fork of this repo.
9 |
10 | - On the GitHub interface, click on the `Fork` button.
11 | - Clone your fork of this repo: `git clone git@github.com:YOUR_GIT_USERNAME/genepy.git`
12 | - Enter the directory: `cd genepy`
13 | - Add the upstream repo: `git remote add upstream https://github.com/broadinstitute/genepy`
14 |
15 | ## Setting up your own virtual environment
16 |
17 | Run `make virtualenv` to create a virtual environment,
18 | then activate it with `source .venv/bin/activate`.
19 |
20 | ## Install the project in develop mode
21 |
22 | Run `make install` to install the project in develop mode.
23 |
24 | ## Run the tests to ensure everything is working
25 |
26 | Run `make test` to run the tests.
27 |
28 | ## Create a new branch to work on your contribution
29 |
30 | Run `git checkout -b my_contribution`
31 |
32 | ## Make your changes
33 |
34 | Edit the files using your preferred editor. (we recommend Vim or VSCode)
35 |
36 | ## Format the code
37 |
38 | Run `make fmt` to format the code.
39 |
40 | ## Run the linter
41 |
42 | Run `make lint` to run the linter.
43 |
44 | ## Test your changes
45 |
46 | Run `make test` to run the tests.
47 |
48 | Ensure the code coverage report shows `100%` coverage; if it does not, add tests to your PR.
49 |
50 | ## Build the docs locally
51 |
52 | Run `make docs` to build the docs.
53 |
54 | Ensure your new changes are documented.
55 |
56 | ## Commit your changes
57 |
58 | This project uses [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
59 |
60 | Example: `fix(package): update setup.py arguments 🎉` (emojis are fine too)
61 |
62 | ## Push your changes to your fork
63 |
64 | Run `git push origin my_contribution`
65 |
66 | ## Submit a pull request
67 |
68 | On the GitHub interface, click on the `Pull Request` button.
69 |
70 | Wait for CI to run, and one of the developers will review your PR.
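For convenience, the contribution workflow described above can be condensed into the following shell session (a sketch only; substitute `YOUR_GIT_USERNAME` and the branch name `my_contribution` with your own values):

```bash
# Fork the repo on the GitHub interface first, then:
git clone git@github.com:YOUR_GIT_USERNAME/genepy.git
cd genepy
git remote add upstream https://github.com/broadinstitute/genepy

make virtualenv                   # create the virtual environment
source .venv/bin/activate         # activate it
make install                      # install the project in develop mode

git checkout -b my_contribution   # create a working branch
# ... edit files with your preferred editor ...
make fmt                          # format the code with black & isort
make lint                         # run the linters
make test                         # run the tests and generate the coverage report
make docs                         # build the docs locally

git commit -am "fix(package): describe your change here"
git push origin my_contribution   # then open a Pull Request on GitHub
```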
71 | ## Makefile utilities
72 |
73 | This project comes with a `Makefile` that contains a number of useful utilities.
74 |
75 | ```bash
76 | ❯ make
77 | Usage: make
78 |
79 | Targets:
80 | help: ## Show the help.
81 | install: ## Install the project in dev mode.
82 | fmt: ## Format code using black & isort.
83 | lint: ## Run pep8, black, mypy linters.
84 | test: lint ## Run tests and generate coverage report.
85 | watch: ## Run tests on every change.
86 | clean: ## Clean unused files.
87 | virtualenv: ## Create a virtual environment.
88 | release: ## Create a new tag for release.
89 | docs: ## Build the documentation.
90 | switch-to-poetry: ## Switch to poetry package manager.
91 | init: ## Initialize the project based on an application template.
92 | ```
93 |
94 | ## Making a new release
95 |
96 | This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z`.
97 | Every time a new tag is created and pushed to the remote repo, GitHub Actions will
98 | automatically create a new release on GitHub and trigger a release on PyPI.
99 |
100 | For this to work you need to set up a secret called `PYPI_API_TOKEN` under the project Settings > Secrets;
101 | this token can be generated on [pypi.org](https://pypi.org/account/).
102 |
103 | To trigger a new release, all you need to do is:
104 |
105 | 1. If you have changes to add to the repo:
106 | * Make your changes following the steps described above.
107 | * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
108 | 2. Run the tests to ensure everything is working.
109 | 3. Run `make release` to create a new tag and push it to the remote repo.
110 |
111 | `make release` will ask you for the version number to use for the tag, e.g. type `0.1.1` when asked.
112 |
113 | > **CAUTION**: `make release` will change local changelog files and commit all the unstaged changes you have.
114 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/HISTORY.md -------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include HISTORY.md 3 | include Containerfile 4 | graft tests 5 | graft genepy 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')") 3 | USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes") 4 | 5 | .PHONY: help 6 | help: ## Show the help. 7 | @echo "Usage: make " 8 | @echo "" 9 | @echo "Targets:" 10 | @fgrep "##" Makefile | fgrep -v fgrep 11 | 12 | 13 | .PHONY: show 14 | show: ## Show the current environment. 15 | @echo "Current environment:" 16 | @if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi 17 | @echo "Running using $(ENV_PREFIX)" 18 | @$(ENV_PREFIX)python -V 19 | @$(ENV_PREFIX)python -m site 20 | 21 | .PHONY: install 22 | install: ## Install the project in dev mode. 23 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 24 | @echo "Don't forget to run 'make virtualenv' if you got errors." 25 | $(ENV_PREFIX)pip install -e .[test] 26 | 27 | .PHONY: fmt 28 | fmt: ## Format code using black & isort. 29 | $(ENV_PREFIX)isort genepy/ 30 | $(ENV_PREFIX)black -l 79 genepy/ 31 | $(ENV_PREFIX)black -l 79 tests/ 32 | 33 | .PHONY: lint 34 | lint: ## Run pep8, black, mypy linters. 35 | $(ENV_PREFIX)flake8 genepy/ 36 | $(ENV_PREFIX)black -l 79 --check genepy/ 37 | $(ENV_PREFIX)black -l 79 --check tests/ 38 | $(ENV_PREFIX)mypy --ignore-missing-imports genepy/ 39 | 40 | .PHONY: test 41 | test: lint ## Run tests and generate coverage report. 42 | $(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=genepy -l --tb=short --maxfail=1 tests/ 43 | $(ENV_PREFIX)coverage xml 44 | $(ENV_PREFIX)coverage html 45 | 46 | .PHONY: watch 47 | watch: ## Run tests on every change. 48 | ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/ 49 | 50 | .PHONY: clean 51 | clean: ## Clean unused files. 52 | @find ./ -name '*.pyc' -exec rm -f {} \; 53 | @find ./ -name '__pycache__' -exec rm -rf {} \; 54 | @find ./ -name 'Thumbs.db' -exec rm -f {} \; 55 | @find ./ -name '*~' -exec rm -f {} \; 56 | @rm -rf .cache 57 | @rm -rf .pytest_cache 58 | @rm -rf .mypy_cache 59 | @rm -rf build 60 | @rm -rf dist 61 | @rm -rf *.egg-info 62 | @rm -rf htmlcov 63 | @rm -rf .tox/ 64 | @rm -rf docs/_build 65 | 66 | .PHONY: virtualenv 67 | virtualenv: ## Create a virtual environment. 68 | @if [ "$(USING_POETRY)" ]; then poetry install && exit; fi 69 | @echo "creating virtualenv ..." 70 | @rm -rf .venv 71 | @python3 -m venv .venv 72 | @./.venv/bin/pip install -U pip 73 | @./.venv/bin/pip install -e .[test] 74 | @echo 75 | @echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!" 76 | 77 | .PHONY: release 78 | release: ## Create a new tag for release. 79 | @echo "WARNING: This operation will create s version tag and push to github" 80 | @read -p "Version? 
(provide the next x.y.z semver) : " TAG
81 | @echo "creating git tag : $${TAG}"
82 | @git tag $${TAG}
83 | @echo "$${TAG}" > genepy/VERSION
84 | @$(ENV_PREFIX)gitchangelog > HISTORY.md
85 | @git add genepy/VERSION HISTORY.md
86 | @git commit -m "release: version $${TAG} 🚀"
87 | @git push -u origin HEAD --tags
88 | @echo "Github Actions will detect the new tag and release the new version."
89 |
90 | .PHONY: docs
91 | docs: ## Build the documentation.
92 | @echo "building documentation ..."
93 | @$(ENV_PREFIX)mkdocs gh-deploy
94 | URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL
95 |
96 | .PHONY: switch-to-poetry
97 | switch-to-poetry: ## Switch to poetry package manager.
98 | @echo "Switching to poetry ..."
99 | @if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi
100 | @rm -rf .venv
101 | @poetry init --no-interaction --name=a_flask_test --author=rochacbruno
102 | @echo "" >> pyproject.toml
103 | @echo "[tool.poetry.scripts]" >> pyproject.toml
104 | @echo "genepy = 'genepy.__main__:main'" >> pyproject.toml
105 | @cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
106 | @cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
107 | @poetry install --no-interaction
108 | @mkdir -p .github/backup
109 | @mv requirements* .github/backup
110 | @mv setup.py .github/backup
111 | @echo "You have switched to https://python-poetry.org/ package manager."
112 | @echo "Please run 'poetry shell' or 'poetry run genepy'"
113 |
114 | .PHONY: init
115 | init: ## Initialize the project based on an application template.
116 | @./.github/init.sh
117 |
118 |
119 | # This project has been generated from rochacbruno/python-project-template
120 | # __author__ = 'rochacbruno'
121 | # __repo__ = https://github.com/rochacbruno/python-project-template
122 | # __sponsor__ = https://github.com/sponsors/rochacbruno/
123 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # genepy
2 |
3 | _what is [genepy](https://en.wikipedia.org/wiki/G%C3%A9n%C3%A9pi)?_
4 |
5 | A set of awesome functions & tools for Computational Geneticists
6 |
7 | ![long genome](documentation/genome.jpg)
8 |
9 | ## Content
10 |
11 | - **utils**: where a bunch of helper functions and useful general scripts are stored
12 | - **plots**: a set of plotting tools based on [matplotlib]() and [bokeh]() to make volcano plots / CNV maps etc.
13 | - **helper**: additional helper functions to save data, merge dataframes...
14 | - **terra**: contains a set of functions that use [dalmatian]() to interact with the [GCP]()-powered genomics HPC platform [Terra]().
15 | - **sequencing**: contains a set of functions to work with bed/bam/fastq files...
16 | - **rna**: contains functions to work with RNAseq (and related) data.
17 | - **pyDESeq2**: a Python integration of [deseq2]() (the differential expression analyser) with [rpy2]()
18 | - **mutations**: a set of functions to work with maf files, vcf files, etc.
19 | - **google**: functions and packages linked to Google's APIs
20 | - **google_sheet**: functions to upload a dataframe as a Google Sheet
21 | - **gcp**: a set of functions to interact with Google Cloud Storage (relies on `gsutil`)
22 | - **epigenetics**: where we have things related to epigenomics
23 | - **chipseq**: has functions to read, merge, and denoise ChIP-seq data.
24 | - **plot**: has functions to plot ChIP-seq data.
25 |
26 | ### Helper tools
27 |
28 | _tools that you do not need to use directly as they have binding functions in genepy._
29 |
30 | - **epigenetics/rose**: where an updated version of the ROSE algorithm is stored (as a git submodule)
31 | - **cell_line_mapping-master/python/cell_line_mapper**: a set of functions to map cell line IDs to other cell line IDs based on an up-to-date Google spreadsheet.
32 |
33 |
34 | ## Install
35 |
36 | ### with pip
37 |
38 | `pip install broad-genepy`
39 |
40 | and then use with `from genepy.utils/epigenetics/... import ...`
41 |
42 | Please see the next step to get access to all bindings and tools.
43 |
44 | ### dev mode
45 |
46 | ```bash
47 | git clone git://github.com/BroadInstitute/genepy.git
48 | pip install -e genepy
49 | ```
50 |
51 | Then you can import modules in Python with, e.g.:
52 | ```python
53 | from genepy import terra
54 | from genepy.utils import helper as h
55 | from genepy.google import gcp
56 | from genepy.utils import plot
57 | from genepy.epigenetics import chipseq
58 |
59 | ```
60 |
61 | ## installation: to get access to all bindings and tools
62 |
63 | Install the following tools:
64 | - [gcloud](https://cloud.google.com/sdk/docs/install-sdk)
65 | - [firecloud-dalmatian](https://github.com/getzlab/dalmatian)
66 | - [gsheets](https://github.com/xflr6/gsheets)
67 | - [htslib/samtools](http://www.htslib.org/)
68 | - [bwa](https://github.com/lh3/bwa)
69 | and, used just once:
70 | - [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
71 |
72 | Some of these packages, like gsheets, gcloud, and firecloud-dalmatian, will require you to create Google accounts, log in on your machine, or download OAuth files.
73 |
74 | Finally, you can install the R packages (GSEABase, erccdashboard, GSVA, DESeq2):
75 |
76 | ```bash
77 | R -e 'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager")};BiocManager::install(c("GSEABase", "erccdashboard", "GSVA", "DESeq2"));'
78 | ```
79 |
80 | ## data:
81 |
82 | hg38 genome sizes: from https://github.com/igvteam/igv/blob/master/genomes/sizes/hg38.chrom.sizes
83 |
84 | ## About
85 |
86 | Please do contribute; we do not have time to fix all issues or work on feature requests.
87 |
88 | Jeremie Kalfon jkalfon@broadinstitute.org jkobject@gmail.com https://jkobject.com
89 |
90 | Apache license 2.0.
91 | -------------------------------------------------------------------------------- /data/Annotations.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/data/Annotations.RData -------------------------------------------------------------------------------- /data/variantFilter/snp_indels_rescue_list.txt: -------------------------------------------------------------------------------- 1 | gene Chromosome Start_position end type classification ref_allele newbase Protein_Change patient 2 | RPL22 1 6257785 6257785 Frame_Shift_Del DEL T - p.K16fs fh_22RV1_PROSTATE-Tumor 3 | SF3B2 11 65819899 65819900 In_Frame_Ins INS - GCC p.21_22insP fh_HEC6_ENDOMETRIUM-Tumor 4 | -------------------------------------------------------------------------------- /docs/genepy.md: -------------------------------------------------------------------------------- 1 | # Reference 2 | 3 | ::: genepy.utils.helper 4 | 5 | ::: genepy.utils.plot 6 | 7 | ::: genepy.epigenetics.chipseq -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](https://www.mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs -h` - Print help message and exit. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 
18 | -------------------------------------------------------------------------------- /documentation/genome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/documentation/genome.jpg -------------------------------------------------------------------------------- /genepy/VERSION: --------------------------------------------------------------------------------
1 | 1.2.7
2 | -------------------------------------------------------------------------------- /genepy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/__init__.py -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/.gitignore: --------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | *~
6 | celllinemapr/SOP.html
7 | __pycache__
8 | python/.cache
9 | *.egg-info
10 | *.pyc
11 | python/dist
12 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/.travis.yml: --------------------------------------------------------------------------------
1 | language: python
2 | cache:
3 | directories:
4 | - $HOME/.cache/pip
5 |
6 | python:
7 | - 3.5
8 |
9 | install:
10 | - pip install -r requirements.txt
11 |
12 | script:
13 | - set -e && pytest && set +e -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/README.md: --------------------------------------------------------------------------------
1 | # cell_line_mapping
2 | Code for mapping between different CCLE/DepMap cell line identifiers
3 |
4 | ## Installation
5 | ### R
6 | ```
7 | options(repos = c(
8 | "https://iwww.broadinstitute.org/~datasci/R-packages",
9 | "https://cran.cnr.berkeley.edu"))
10 | install.packages('celllinemapr')
11 | ```
12 | As one may not have access to the intranet to download the name mapping, the name mapping file (`naming.csv`) is provided directly and can be put in place by executing this command:
13 | `mkdir ~/.celllinemapr && mkdir ~/.celllinemapr/data && cp naming.csv ~/.celllinemapr/data`
14 |
15 | ### Python
16 | ```
17 | pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-latest.tar.gz
18 | ```
19 |
20 | ## Usage
21 | See [here](https://github.com/broadinstitute/cell_line_mapping/blob/master/celllinemapr/SOP.Rmd) for examples of functions for the R package.
22 | Function names for the Python package are analogous to those for R, replacing `.` with `_`.
For instance, 23 | 24 | R: `ccle.to.arxspan` 25 | 26 | Python: `ccle_to_arxspan` 27 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/.celllinemapr.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/cell_line_mapping-master/celllinemapr/.celllinemapr.Rds -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: celllinemapr 2 | Title: Functions for mapping between cell line IDs 3 | Version: 0.1 4 | Authors@R: c(person("Philip", "Montgomery", email="pmontgom@broadinstitute.org", role = c("aut", "cre"))) 5 | Description: Streamline mapping between cell line IDs using the mapping which is periodically generated from ArxSpan and stored at https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv 6 | Depends: R (>= 3.3.0) 7 | License: CC0 8 | Encoding: UTF-8 9 | LazyData: true 10 | RoxygenNote: 6.0.1 11 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(arxspan.to.ccle) 4 | export(ccle.to.arxspan) 5 | export(ccle.to.latest) 6 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/R/cell_line_mapping.R: -------------------------------------------------------------------------------- 1 | find.non.unique <- function(x) { 2 | b <- table(x) > 1 3 | names(b)[b] 4 | } 5 | 6 | make.mapping <- function(full.map, input.type, output.type, id.subset, check.unique.mapping) { 7 | x <- as.character(full.map[[output.type]]) 8 | n <- as.character(full.map[[input.type]]) 9 | 10 | mask <- n %in% id.subset 11 | 12 | # collapse duplicate rows 13 | df <- data.frame(x=x[mask], n=n[mask], stringsAsFactors =F) 14 | df <- unique(df) 15 | 16 | x <- df$x 17 | n <- df$n 18 | 19 | if(check.unique.mapping) { 20 | non.unique.inputs <- find.non.unique(n) 21 | non.unique.outputs <- find.non.unique(x) 22 | if(length(non.unique.inputs) > 0) { 23 | stop(paste0("The following had nonunique values: ", paste0(non.unique.inputs, collapse=", "))) 24 | } 25 | 26 | if(length(non.unique.outputs) > 0) { 27 | stop(paste0("The following had nonunique values: ", paste0(non.unique.outputs, collapse=", "))) 28 | } 29 | } 30 | 31 | names(x) <- n 32 | x 33 | } 34 | 35 | read.mapping <- (function() { 36 | # save value from previous read to avoid fetching from url every time 37 | cell.line.mapping.cache <- NULL 38 | mapping.url <- getOption("celllinemapr.url", "../naming.csv") 39 | cache.path <- getOption("celllinemapr.cache.path", "../.celllinemapr.Rds") 40 | 41 | function(force=F) { 42 | if(is.null(cell.line.mapping.cache) || force) { 43 | mapping <- try(read.csv("../naming.csv")) 44 | if(class(mapping) == "try-error") { 45 | # if we got an error, then warn user that this failed and 
try loading from cache file. 46 | warning(paste0("Could not fetch mapping from ", mapping.url, ", attempting to read most recent cached mapping from ", cache.path)) 47 | mapping <- readRDS(cache.path) 48 | } else { 49 | stopifnot(is.data.frame(mapping)) 50 | saveRDS(mapping, file=cache.path) 51 | } 52 | stopifnot(is.data.frame(mapping)) 53 | cell.line.mapping.cache <<- mapping 54 | } 55 | cell.line.mapping.cache 56 | } 57 | })() 58 | 59 | 60 | name.mapper <- function(input.type, input.names, output.type, ignore.problems, check.unique.mapping, read.mapping.fn) { 61 | full.mapping <- read.mapping.fn() 62 | mapping <- make.mapping(full.mapping, input.type, output.type, input.names, check.unique.mapping) 63 | result <- mapping[input.names] 64 | if(!ignore.problems) { 65 | bad.names <- input.names[is.na(result)] 66 | if(length(bad.names) > 5) { 67 | bad.names <- c(bad.names[1:5], "...") 68 | } 69 | if(length(bad.names) > 0) { 70 | stop(paste0("Could not find cell lines (searching by ", input.type, ") for ", paste(bad.names, collapse=", "))) 71 | } 72 | } 73 | result 74 | } 75 | 76 | # returns a function to get cell line mapping. Returns the default function if 77 | pick.mapping.fn <- function(mapping) { 78 | if(!is.null(mapping)) { 79 | stopifnot(is.data.frame(mapping)) 80 | return (function() { 81 | return (mapping) 82 | } ) 83 | } else { 84 | return(read.mapping) 85 | } 86 | } 87 | 88 | #' Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names 89 | #' 90 | #' @param arxspan.ids A vector of arxspan ids. These are always of the form "ACH-XXXXXX" 91 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 92 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same CCLE name (which could cause issues downstream) 93 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 94 | #' @examples 95 | #' ccle_names <- arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 96 | #' @export arxspan.to.ccle 97 | arxspan.to.ccle <- function(arxspan.ids, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 98 | name.mapper('broad_id', arxspan.ids, 'canonical_ccle_name', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 99 | } 100 | 101 | #' Map ccle names to Broad ID (aka ArxSpan IDs) 102 | #' 103 | #' @param ccle.names A vector of CCLE names 104 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 105 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same arxspan id (which could cause issues downstream) 106 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 107 | #' @examples 108 | #' broad_ids <- ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 109 | #' @export ccle.to.arxspan 110 | ccle.to.arxspan <- function(ccle.names, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 111 | name.mapper('ccle_name', ccle.names, 'broad_id', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 112 | } 113 | 114 | #' Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed. 
115 | #' 116 | #' @param ccle.names A vector of CCLE names 117 | #' @param ignore.problems if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead. 118 | #' @param check.unique.mapping if set, will throw an error if it discovers two different IDs which map to the same ccle name (which could cause issues downstream) 119 | #' @param mapping if set, will use this dataframe for the mapping instead of fetching the latest 120 | #' @examples 121 | #' ccle_names <- ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 122 | #' @export ccle.to.latest 123 | ccle.to.latest <- function(arxspan.ids, ignore.problems=F, check.unique.mapping=T, mapping=NULL) { 124 | name.mapper('ccle_name', arxspan.ids, 'canonical_ccle_name', ignore.problems, check.unique.mapping, pick.mapping.fn(mapping)) 125 | } 126 | 127 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/SOP.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'SOP: Mapping cell line IDs' 3 | author: "Philip montgomery" 4 | date: "7/11/2018" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## R Markdown 13 | 14 | This SOP describes how to use the R library for mapping cell line identifiers based on the situation. The authorative source for all cell line names and mappings is recorded in ArxSpan. However, it is difficult to query ArxSpan so every day, we export the mapping from arxspan and publish it internally as a CSV file at https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv 15 | 16 | The R package celllinemapr pulls directly from this URL automatically. 17 | 18 | ### Loading legacy data which contains CCLE names 19 | 20 | Moving forward, we are tracking cell lines by their Broad IDs which get assigned when the line is registered into 21 | ArxSpan. (As a result, we sometimes refer to these as ArxSpan IDs and are always of the form "ACH-XXXXXX") 22 | 23 | In order to join old data which has CCLE names to a dataset which use Broad IDs, you will need to remap the CCLE names to the Broad IDs. This can be done via ccle.to.arxspan() 24 | 25 | ```{r ccle.to.arxspan} 26 | library(celllinemapr) 27 | ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 28 | ``` 29 | 30 | ### Getting latest CCLE names 31 | 32 | Also, since CCLE names can change, the current name for a line may be different than with the one when a dataset was created. If you wish to get the latest name for a line you can use ccle.to.latest() 33 | 34 | ```{r ccle.to.latest} 35 | ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 36 | ``` 37 | 38 | ### Looking up CCLE names by Broad ID 39 | 40 | The Broad ID are opaque and so often you will want a human readable label when reporting info about cell lines. One 41 | can map back to CCLE name via arxspan.to.ccle(). 42 | 43 | ```{r arxspan.to.ccle} 44 | arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 45 | ``` 46 | 47 | ### Using existing mapping 48 | 49 | The mapping functions only work within the Broad's internal network. You can use these methods and if you cannot reach the internal network, they will use the most recently cached mapping. 50 | 51 | However, if you're running this code on a machine which does not have a cached mapping, you'll need to provide a copy of it yourself. 
You can do this by providing the "mapping" parameter to any of these methods. 52 | 53 | ```{r arxspan.to.ccle.mapping} 54 | map.df = data.frame(ccle_name=c("A101D_SKIN", "LS513_LARGE_INTESTINE"), 55 | canonical_ccle_name=c("A101D_FAKEMAPPING", "LS513_FAKEMAPPING"), 56 | broad_id=c('ACH-000008', "ACH-000007")) 57 | 58 | arxspan.to.ccle(c('ACH-000007', 'ACH-000008'), mapping=map.df) 59 | ``` 60 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/celllinemapr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/arxspan.to.ccle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{arxspan.to.ccle} 4 | \alias{arxspan.to.ccle} 5 | \title{Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names} 6 | \usage{ 7 | arxspan.to.ccle(arxspan.ids, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{arxspan.ids}{A vector of arxspan ids. These are always of the form "ACH-XXXXXX"} 11 | 12 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead.} 13 | 14 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same CCLE name (which could cause issues downstream)} 15 | } 16 | \description{ 17 | Map cell line Broad ID (aka ArxSpan IDs) to the latest CCLE names 18 | } 19 | \examples{ 20 | ccle_names <- arxspan.to.ccle(c('ACH-000007', 'ACH-000008')) 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/ccle.to.arxspan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{ccle.to.arxspan} 4 | \alias{ccle.to.arxspan} 5 | \title{Map ccle names to Broad ID (aka ArxSpan IDs)} 6 | \usage{ 7 | ccle.to.arxspan(ccle.names, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{ccle.names}{A vector of CCLE names} 11 | 12 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. 
If you set to True, then you'll get NA for unknown lines instead.} 13 | 14 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same arxspan id (which could cause issues downstream)} 15 | } 16 | \description{ 17 | Map ccle names to Broad ID (aka ArxSpan IDs) 18 | } 19 | \examples{ 20 | broad_ids <- ccle.to.arxspan(c('HS294T_SKIN','NCIH1581_LUNG')) 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/celllinemapr/man/ccle.to.latest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cell_line_mapping.R 3 | \name{ccle.to.latest} 4 | \alias{ccle.to.latest} 5 | \title{Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed.} 6 | \usage{ 7 | ccle.to.latest(arxspan.ids, ignore.problems = F, check.unique.mapping = T) 8 | } 9 | \arguments{ 10 | \item{ignore.problems}{if not set to True, any unknown cell lines will result in an error being thrown. If you set to True, then you'll get NA for unknown lines instead.} 11 | 12 | \item{check.unique.mapping}{if set, will throw an error if it discovers two different IDs which map to the same ccle name (which could cause issues downstream)} 13 | 14 | \item{ccle.names}{A vector of CCLE names} 15 | } 16 | \description{ 17 | Map any ccle names to the current/latest ccle names. Useful for updating old names and correcting lines which have been renamed. 18 | } 19 | \examples{ 20 | ccle_names <- ccle.to.latest('HEL9217_2013_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE') 21 | } 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/cell_line_mapper/__init__.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from collections import defaultdict, Counter 3 | import pandas as pd 4 | from io import StringIO 5 | 6 | csv_url = 'https://intranet.broadinstitute.org/~datasci/cell_lines/name_mapping.csv' 7 | __version__ = "0.1" 8 | 9 | def read_file_to_dict(key_name, value_name): 10 | return_dict = defaultdict(str) 11 | 12 | response = requests.get(csv_url) 13 | assert response.status_code == 200, "Could not fetch mapping from {}, got, status_code={} reason={}".format(csv_url, response.status_code, response.reason) 14 | mapping_request = StringIO(response.text) 15 | df = pd.read_csv(mapping_request) 16 | 17 | for row in df.to_dict(orient='records'): 18 | if return_dict[row[key_name]] == "": 19 | return_dict[row[key_name]] = row[value_name] 20 | else: 21 | if row[value_name] != return_dict[row[key_name]]: 22 | return_dict[row[key_name]] += str(", " + row[value_name]) 23 | 24 | return return_dict 25 | 26 | 27 | def check_unique(input_list, result_list): 28 | input_len = len(input_list) 29 | # make sure one key doesn't point to multiple values 30 | is_unique = True 31 | for mapping in result_list: 32 | if mapping is not None and ',' in mapping: 33 | is_unique = False 34 | break 35 | 36 | # make sure two keys don't point to the same value 37 | if is_unique: 38 | num_unique_values = len(Counter(tuple(mapping) for mapping in result_list if mapping is not None)) + int(None in result_list) 39 | 40 | is_unique = (input_len == num_unique_values) 41 | 42 | if not is_unique: 43 | raise RuntimeError('Mappings are not unique') 44 | 45 | 46 | def 
name_mapper(input_type, input_names, output_type, ignore_problems=False, check_unique_mapping=True): 47 | if len(input_names) == 0: 48 | raise RuntimeError("Please input a non-empty list") 49 | 50 | mapping_dict = read_file_to_dict(input_type, output_type) 51 | output_names = [] 52 | 53 | for i_name in input_names: 54 | o_name = mapping_dict.get(i_name) 55 | if not ignore_problems and o_name is None: 56 | raise KeyError(output_type + " " + "could not be found for the following " + input_type + ": " + i_name) 57 | output_names.append(o_name) 58 | 59 | if check_unique_mapping: 60 | check_unique(input_names, output_names) 61 | 62 | assert len(output_names) == len(input_names) 63 | return output_names 64 | 65 | 66 | def arxspan_to_ccle(arxspan_ids, ignore_problems=False, check_unique_mapping=True): 67 | return name_mapper('broad_id', arxspan_ids, 'canonical_ccle_name', ignore_problems, check_unique_mapping) 68 | 69 | 70 | def ccle_to_arxspan(ccle_names, ignore_problems=False, check_unique_mapping=True): 71 | return name_mapper('ccle_name', ccle_names, 'broad_id', ignore_problems, check_unique_mapping) 72 | 73 | 74 | def ccle_to_latest(ccle_names, ignore_problems=False, check_unique_mapping=True): 75 | return name_mapper('ccle_name', ccle_names, 'canonical_ccle_name', ignore_problems, check_unique_mapping) 76 | 77 | # alias for the old name of this function 78 | latest_ccle_names=ccle_to_latest 79 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/cell_line_mapper/test_mapper.py: -------------------------------------------------------------------------------- 1 | from . import name_mapper, arxspan_to_ccle, ccle_to_arxspan, latest_ccle_names, csv_url 2 | import cell_line_mapper 3 | import pytest 4 | import pandas as pd 5 | import requests 6 | from io import StringIO 7 | from collections import defaultdict 8 | import csv 9 | ################################################################################## 10 | ### TEST ARXSPAN_TO_CCLE 11 | #just the first three broadid's in the doc 12 | 13 | def test_real_data_fetch_check(): 14 | return_dict = defaultdict(set) 15 | 16 | mapping_request = StringIO(requests.get(csv_url).text) 17 | df = pd.read_csv(mapping_request) 18 | assert list(df) == ['ccle_name', 'canonical_ccle_name', 'broad_id'] 19 | 20 | 21 | ################################################################################## 22 | 23 | @pytest.fixture() 24 | def fake_mapping_csv(monkeypatch): 25 | def mock_read_file_to_dict(key_name, value_name): 26 | return_dict = defaultdict(str) 27 | 28 | with open('test-data.csv', mode='r') as csvfile: 29 | map_reader = csv.DictReader(csvfile) 30 | for rows in map_reader: 31 | if return_dict[rows[key_name]] == "": 32 | return_dict[rows[key_name]] = rows[value_name] 33 | else: 34 | return_dict[rows[key_name]]+=str(", "+rows[value_name]) 35 | print(return_dict) 36 | return return_dict 37 | 38 | monkeypatch.setattr(cell_line_mapper, 'read_file_to_dict', mock_read_file_to_dict) 39 | 40 | 41 | ################################################################################## 42 | ### TEST ARXSPAN_TO_CCLE 43 | # just the first three broadid's in the doc 44 | def test_arxspan_to_ccle_first_three_rows(fake_mapping_csv): 45 | assert arxspan_to_ccle(["1", "2", "3"]) == ["A", "B", "C"] 46 | 47 | 48 | # 1. 
unmappable ID and ignore_problems == false 49 | def test_arxspan_to_ccle_unmappable_ID_ignore_problems_false(fake_mapping_csv): 50 | with pytest.raises(KeyError) as excinfo: 51 | arxspan_to_ccle(["1", "madeupfakename"]) 52 | assert "canonical_ccle_name could not be found for the following broad_id: madeupfakename" in str(excinfo.value) 53 | 54 | 55 | # 2. unmappable ID and ignore_problems == true 56 | def test_arxspan_to_ccle_unmappable_ID_ignore_problems_true(fake_mapping_csv): 57 | assert arxspan_to_ccle(["1", "madeupfakename"], True) == ["A", None] 58 | 59 | 60 | # 2.5. arxspan id has multiple ccle names 61 | def test_arxspan_to_ccle_one_key_many_values(fake_mapping_csv): 62 | assert arxspan_to_ccle(["7"], True, False) == ["D, F"] 63 | 64 | 65 | # 3. unique elements in arxspan_ids do not map to unique ccle names and check_unique_mapping == false 66 | def test_arxspan_to_ccle_nonunique_mapping_check_unique_mapping_false(fake_mapping_csv): 67 | assert arxspan_to_ccle(["4", "5"], True, False) == ["D", "D"] 68 | 69 | 70 | # 4. unique elements in arxspan_ids do not map to unique ccle names and check_unique_mapping == true 71 | def test_arxspan_to_ccle_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 72 | with pytest.raises(RuntimeError) as excinfo: 73 | arxspan_to_ccle(["4", "5"], True, True) 74 | assert 'Mappings are not unique' in str(excinfo.value) 75 | 76 | 77 | ################################################################################## 78 | ### TEST CCLE_ARXSPAN 79 | # 0. Just the first three rows 80 | def test_ccle_to_arxspan_first_three_rows(fake_mapping_csv): 81 | assert ccle_to_arxspan(["a", "B", "c"]) == ["1", "2", "3"] 82 | 83 | 84 | # 1. unmappable ccle name and ignore_problems == false 85 | def test_ccle_to_arxspan_unmappable_ID_ignore_problems_false(fake_mapping_csv): 86 | with pytest.raises(KeyError) as excinfo: 87 | ccle_to_arxspan(["a", "madeupfakename", "idk"], False) 88 | assert "broad_id could not be found for the following canonical_ccle_name: madeupfakename" in str(excinfo.value) 89 | 90 | 91 | # 2. unmappable ccle name and ignore_problems == true 92 | def test_ccle_to_arxspan_unmappable_ID_ignore_problems_true(fake_mapping_csv): 93 | assert ccle_to_arxspan(["a", "madeupfakename"], True) == ["1", None] 94 | 95 | 96 | # 2.5. ccle name has multiple arxspan_ids 97 | def test_ccle_to_arxspan_one_key_many_values(fake_mapping_csv): 98 | assert ccle_to_arxspan(["d"], True, False) == ["4, 5, 7"] 99 | 100 | 101 | # 3. unique ccle names do not map to unique arxspan_ids and check_unique_mapping == false 102 | def test_ccle_to_arxspan_nonunique_mapping_check_unique_mapping_false(fake_mapping_csv): 103 | assert ccle_to_arxspan(["e"], True, False) == ["6, 7"] 104 | 105 | 106 | # 4. unique ccle names do not map to unique arxspan_ids and check_unique_mapping == true 107 | def test_ccle_to_arxspan_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 108 | with pytest.raises(RuntimeError) as excinfo: 109 | a = ccle_to_arxspan(["e"], True, True) 110 | assert 'Mappings are not unique' in str(excinfo.value) 111 | 112 | 113 | ################################################################################## 114 | ### TEST LATEST_CCLE_NAMES 115 | 116 | # 0. Just a few samples that have different names 117 | def test_latest_ccle_names_first_three_rows(fake_mapping_csv): 118 | assert latest_ccle_names(["a", "B", "c"]) == ["A", "B", "C"] 119 | 120 | 121 | # 1. 
unmappable ccle name and ignore_problems == false 122 | def test_latest_ccle_names_unmappable_ignore_problems_false(fake_mapping_csv): 123 | with pytest.raises(KeyError) as excinfo: 124 | latest_ccle_names(["madeupfakename"], False) 125 | assert "canonical_ccle_name could not be found for the following ccle_name: madeupfakename" in str(excinfo.value) 126 | 127 | 128 | # 2. unmappable ccle name and ignore_problems == true 129 | def test_latest_ccle_names_unmappable_ignore_problems_true(fake_mapping_csv): 130 | assert latest_ccle_names(["madeupfakename", "a"], True) == [None, "A"] 131 | 132 | 133 | # 2.5 old ccle name maps to multiple latest names 134 | def test_latest_ccle_names_one_key_many_values(fake_mapping_csv): 135 | assert latest_ccle_names(["e"], True, False) == ["E, F"] 136 | 137 | def test_latest_ccle_names_one_key_many_values_check_unique_mapping_true(fake_mapping_csv): 138 | with pytest.raises(RuntimeError) as excinfo: 139 | latest_ccle_names(["e"], True, True) 140 | assert 'Mappings are not unique' in str(excinfo.value) 141 | 142 | # 4. unique ccle names do not map to unique latest names and check_unique_mapping == true 143 | def test_latest_ccle_names_nonunique_mapping_check_unique_mapping_true(fake_mapping_csv): 144 | with pytest.raises(RuntimeError) as excinfo: 145 | latest_ccle_names(["d", "dd", "ddd"], True, True) 146 | assert 'Mappings are not unique' in str(excinfo.value) 147 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.23.1 2 | pytest==3.6.1 3 | requests==2.20.0 -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import ast 4 | import re 5 | from setuptools import setup, find_packages 6 | 7 | _version_re = re.compile(r'__version__\s*=\s*(.*)') 8 | 9 | with open('cell_line_mapper/__init__.py', 'rt') as f: 10 | version = str(ast.literal_eval(_version_re.search( 11 | f.read()).group(1))) 12 | 13 | setup(name='cell_line_mapper', 14 | version=version, 15 | description='Functions for mapping between cell line identifiers', 16 | author='Phoebe Moh', 17 | author_email='pmoh@broadinstitute.org', 18 | install_requires=['pandas', 'requests'], 19 | packages=find_packages() 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /genepy/cell_line_mapping-master/python/test-data.csv: -------------------------------------------------------------------------------- 1 | ccle_name,canonical_ccle_name,broad_id 2 | a,A,1 3 | B,B,2 4 | c,C,3 5 | d,D,4 6 | d,D,5 7 | d,D,7 8 | e,E,6 9 | e,F,7 10 | -------------------------------------------------------------------------------- /genepy/epigenetics/CREME.md: -------------------------------------------------------------------------------- 1 | # genepy/CREME: ChIP REplicate MErger 2 | 3 | CREME is part of the [genepy](https://github.com/broadinstitute/GenePy) package. 4 | 5 | For Introduction we will link to the [article](https://ro-che.info/articles/2018-07-11-chip-seq-consensus) by Roman Cheplyaka on the subject. 6 | 7 | We built this tool noticing the lack of publicly available simple Chip Merging tool working for [MACS2](https://github.com/macs3-project/MACS)'s output, with replicates of broadly different quality. 
We wanted a one-function tool that would work in Python. 8 | 9 | We will nonetheless note tools such as: 10 | - [PePr](https://pubmed.ncbi.nlm.nih.gov/24894502/) [code](https://github.com/shawnzhangyx/PePr), which can substitute for MACS2 by calling peaks on multiple bam files at the same time. It works by counting reads and looking at the peak shape. 11 | - [multiGPS](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003501), [code](https://github.com/seqcode/multigps), which is mostly for differential-binding ChIP-seq but can work with replicates; it works in Java + R. 12 | - [MSPC](https://academic.oup.com/bioinformatics/article/31/17/2761/183989), [code](https://github.com/Genometric/MSPC) in .NET, which is very well documented, simple, and provides some QC to the user. 13 | 14 | - [genoGAM](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2238-7) in R, [code](https://github.com/gstricker/GenoGAM), which calls peaks by itself as well and seems to handle replicates. 15 | 16 | - [sierra Platinum](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5025614/), [code](https://github.com/sierraplatinum/sierra), which does not seem to be maintained. 17 | 18 | ## Our tool 19 | 20 | The goal of CREME is to be a simple, one-function tool. It works with one to many sets of replicates for each pulled-down protein/mark. 21 | 22 | CREME takes as input a pandas dataframe. This dataframe is the concatenation of each replicate's bed files and can be loaded from a set of MACS2 bedfiles using genepy's loadPeaks function. 23 | 24 | CREME will output, among other things, a dataframe representing a concatenation of bedfiles of merged replicates. 25 | 26 | ## Process 27 | 28 | ### Selection: Finding the best replicate 29 | 30 | A first goal of CREME is to find the best replicate. To do so, it can take manual annotations of _BAD_ (bad/lower-quality) replicates. These can be provided by visual inspection of bigwig tracks + bed files on IGV, or by thresholding on QC results such as FRiP scores. 31 | 32 | ![plot igv](docsCREME/igv-app-MED1-zoom.png) 33 | 34 | Given all available replicates, CREME will compute a consensus, considering any peak at most 150 bp from another peak to be in overlap. We have noticed that changing this parameter from 0 to 150 decreased the total number of peaks found by only 8%. 35 | 36 | Non-overlapping peaks are kept in the consensus. When we have an overlap, we take the mean of signals and the product of p-values across overlapping replicates. 37 | 38 | ![plot venn](docsCREME/MED1_before_venn_venn.png) 39 | 40 | Then, CREME will look at the replicates' overlaps and select the one that has the best overlap score: 41 | 42 | $O_{score}(A) = \sum_{i=0}^{m} \sum_{K \in comb(i, G)} i \cdot \sum_{j=0}^{n} AND(A[j], K_1[j], \dots, K_i[j])$ 43 | 44 | Where: 45 | - $G$ is a binary matrix of size (rows $\times$ columns) $m \times n$, for $m$ replicates and $n$ consensus peaks, with a value of 1 if replicate $m_i$ has a peak on consensus peak $n_j$. 46 | - $comb(i, G)$ is a list of all possible matrices made by taking $i$ elements (rows) from matrix $G$ without replacement. 47 | - $AND$ is a binary operation returning 1 if all passed elements are 1, else 0. 48 | 49 | The non-bad-quality replicate with the best score is selected as the __main replicate__. 50 | 51 | In addition to the Venn diagram, the correlation between each replicate's peak signals is computed and displayed to the user.
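To make the scoring above concrete, here is a minimal sketch of how this overlap score can be computed; it mirrors the logic of `findBestPeak` in `CREME.py`, and the toy `presence` values below are made up for illustration.

```python
# Sketch of the overlap score O_score described above (mirrors findBestPeak in CREME.py).
# presence[r] is the set of consensus-peak indices called in replicate r.
from itertools import combinations

def overlap_score(presence, ind):
    """Score replicate `ind` by how much it agrees with every combination of the other replicates."""
    others = [s for j, s in enumerate(presence) if j != ind]
    score = len(presence[ind])  # peaks found in the replicate itself
    for i in range(1, len(others) + 1):
        for combo in combinations(others, i):
            shared = set.intersection(presence[ind], *combo)
            score += len(shared) * (i + 1)  # agreements involving more replicates weigh more
    return score

presence = [{0, 1, 2, 3}, {1, 2, 3}, {2, 3, 9}]  # toy example: 3 replicates over 10 consensus peaks
best = max(range(len(presence)), key=lambda r: overlap_score(presence, r))
print(best)  # index of the candidate main replicate
```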
52 | 53 | ![pairplot of replicates](docsCREME/MED1_before_pairplot.png) 54 | 55 | ### Validation: Finding new peaks 56 | 57 | For each additional replicate S, we will now look for new peaks. 58 | First, if we find that the second-best replicate and the best replicate each have less than 30% of their peaks in common, we __discard__ that protein/mark and only return the main replicate. 59 | 60 | Taking peaks that are found in the main replicate, we call peaks using S's bigwig and a lower threshold than what MACS2 uses by default. We then do the same for peaks in S that were not in the main replicate. 61 | 62 | If, after calling new peaks, we still get less than 30% overlap in both replicates, we discard the replicate. 63 | 64 | Otherwise, we finalize the merging of overlapping peaks and update the __main replicate__ with this overlap. 65 | 66 | ### Calling Peaks 67 | 68 | The process of calling peaks is loosely based on MACS2's peak-calling algorithm (see the sketch at the end of this document): 69 | 70 | We compute a distance, the [KL divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence), between two Poisson distributions. One represents the distribution of signal from a bigwig file under a region. The other represents the same signal under the entire chromosome where that region lies. The region here is the peak in the other sample that we want to look for in the current sample. 71 | 72 | If that distance is above a threshold (here, 8), we validate the region as a peak. 73 | 74 | ### Output and QC 75 | 76 | The output of our tool is a dataframe of concatenated merged replicates. The pipeline also outputs a set of bad-quality replicates and bad-quality proteins/marks. 77 | 78 | Additionally, information on the distribution of peak signal across replicates and the number of peaks found is provided to the user. 79 | 80 | ![kdeplot of new found peaks](docsCREME/MED1_new_found_peaks_kdeplot.png) 81 | 82 | ## WIP and current issues 83 | 84 | 1. For now, we are not using the exact same algorithm as MACS2, as we are comparing the peak's read distribution to overall reads in the chromosome using the KL divergence. MACS2, in contrast, compares 4 terms: the distribution in the likely region of the sample BAM, the distribution in the likely region of the INPUT BAM, the distribution in the sample BAM's chromosome, and the distribution in the INPUT BAM's chromosome. Moreover, MACS2 compares them using something like a Fisher's exact test and corrects for FDR using the BH method. 85 | 86 | 2. For now, we are not computing a perfect overall replicate quality ourselves. Our scoring method did not work in 5% of cases. We might want to mitigate this by adding peaks' q-values and the replicate's FRiP score and total read count to our analysis. 87 | 88 | 3. For now, we do not compute a p-value when we call new peaks. 89 | 90 | 4. For now, we do not integrate the p-value/signal of newly found peaks in the consensus merger. 91 | 92 | 5. Longer term: we hope to do something more akin to joint calling across replicates, using graphical models to call peaks.
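As a concrete illustration of the peak-validation distance described in the Calling Peaks section, here is a minimal, self-contained sketch. It uses the same KL formula as `findAdditionalPeaks` in `CREME.py`; the Poisson rates are fitted here simply by the sample mean (the maximum-likelihood estimate for a Poisson rate, where `CREME.py` fits it numerically), and the toy coverage values are made up.

```python
# Sketch of the Poisson/KL peak-validation step (see findAdditionalPeaks in CREME.py).
import numpy as np

def kl_poisson(lamb1, lamb2):
    # KL divergence between two Poisson distributions with rates lamb1 and lamb2
    return lamb1 * np.log(lamb1 / lamb2) + lamb2 - lamb1

def is_peak(region_signal, chrom_signal, min_kl=8):
    """Validate a candidate region by comparing its coverage rate to the chromosome background."""
    lamb_region = np.mean(region_signal)  # Poisson rate under the candidate region
    lamb_chrom = np.mean(chrom_signal)    # Poisson rate under the whole chromosome
    return kl_poisson(lamb_region, lamb_chrom) > min_kl

# toy example: strong local enrichment over a flat background
region = np.array([30.0, 42.0, 55.0, 38.0, 47.0])
background = np.array([2.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0, 1.0])
print(is_peak(region, background))  # True for this made-up signal
```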
93 | -------------------------------------------------------------------------------- /genepy/epigenetics/CREME.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from genepy.utils import helper as h 5 | from genepy.utils import plot 6 | from genepy.epigenetics.chipseq import * 7 | import seaborn as sns 8 | import pyBigWig 9 | import matplotlib.pyplot as plt 10 | from scipy.optimize import minimize 11 | from scipy.special import factorial 12 | import warnings 13 | import itertools 14 | 15 | 16 | def findpeakpath(folder, proteiname): 17 | """ 18 | given a folder of bigwigs and a protein name, finds the right bigwig 19 | """ 20 | res = None 21 | for val in os.listdir(folder): 22 | if str(proteiname) in val: 23 | if res: 24 | raise ValueError('more than 1 bigwig file found') 25 | res = val 26 | if res: 27 | return res 28 | raise ValueError('no bigwig file found') 29 | 30 | 31 | def findBestPeak(presence): 32 | """ 33 | given a list of -sets of peak locations for each replicate- will return the best replicate given a simple metric 34 | """ 35 | tot = [] 36 | for ind, el in enumerate(presence): 37 | val = len(el) 38 | pres = [x for j, x in enumerate(presence) if j != ind] 39 | for jnd in range(1, len(pres)+1): 40 | for comb in itertools.combinations(pres, jnd): 41 | ov = el 42 | for knd in range(jnd): 43 | ov = ov & comb[knd] 44 | val += len(ov)*(jnd+1) 45 | tot.append(val) 46 | return np.argsort(tot)[::-1] 47 | 48 | 49 | def mergeReplicatePeaks(peaks, bigwigfolder, markedasbad=None, window=100, 50 | sampling=3000, mincov=4, doPlot=True, cov={}, minKL=8, use='max', 51 | MINOVERLAP=0.3, lookeverywhere=True, only='', saveloc=''): 52 | """ 53 | /!/ should only be passed peaks with at least one good replicate 54 | for each TFpeaksets, 55 | 1. find the replicate that have the most peaks 56 | 2. correlate peaks and get in highest correlation order with the replicate found in 1 57 | 3. find overlap of both and get size of second replicate 58 | 4. if small(er)-> use only to increase statistics 59 | 1. if a lot of uncalled peaks in replicate 2 at replicate 1 peaks (flag for mergebam) 60 | 5. if similar size -> get only intersect 61 | 2. add to intersect, find uncalled peaks in both replicates which are called in the other 62 | 6. repeat for all replicates 63 | ------------------------- 64 | if full overlap of one of the peak replicate, only use the overlapped one to increase confidence on peak 65 | if >80% average non overlap, 66 | print warning and percentage of overlap 67 | 68 | if <20% average non overlap, 69 | take the overlap and increase confidence and avg logfold 70 | 71 | if one is <20%: 72 | if other <40% average non overlap, 73 | take the overlap and increase confidence and avg logfold 74 | else 75 | take 76 | 77 | gets the max cov at the genomic window and if above some threshold, accepts the peak. 78 | 79 | extend peak by X bp if no TSS 80 | remove TSS from peaks 81 | 82 | 83 | create a new data frame containing merged peak size, reassembled peak data (p value etc..) 
and 84 | a the value for presence of each TF listed in previous df 85 | ------------------------------------ 86 | 87 | args: 88 | ---- 89 | peaks: df[bed-like] all the peaks into the sameBam with a column containing the 'name' 90 | being the id of the sample, the 'replicate' number of this sample, the 'tf' chiped here 91 | bamfolder: str, foldername 92 | avgCov: dict(filename:int) a dict where for each bam filename is given an averageCoverage 93 | if use=='max': 94 | window: 95 | mincov: 96 | 97 | if use=='max': 98 | 99 | 100 | returns: 101 | ------- 102 | mergedpeaks: dict{df-peakslike} 103 | bamtomerge: [[bam1,bam2]] 104 | """ 105 | def col_nan_scatter(x, y, **kwargs): 106 | df = pd.DataFrame({'x': x[:], 'y': y[:]}) 107 | df = df[df.sum(0) != 0] 108 | x = df['x'] 109 | y = df['y'] 110 | plt.gca() 111 | plt.scatter(x, y) 112 | 113 | def col_nan_kde_histo(x, **kwargs): 114 | df = pd.DataFrame({'x': x[:]}) 115 | df = df[df['x'] != 0] 116 | x = df['x'] 117 | plt.gca() 118 | sns.kdeplot(x) 119 | print("/!/ should only be passed peaks with at least one good replicate") 120 | # for a df containing a set of peaks in bed format and an additional column of different TF 121 | tfs = list(set(peaks['tf'])) 122 | totpeaknumber = 0 123 | mergedpeaksdict = {} 124 | remove = [] 125 | tomergebam = [] 126 | ratiosofunique = {} 127 | h.createFoldersFor(saveloc) 128 | f = open(saveloc+'results.txt', 'w') 129 | warnings.simplefilter("ignore") 130 | for tf in tfs: 131 | if only and tf != only: 132 | continue 133 | cpeaks = peaks[peaks.tf == tf] 134 | print('_____________________________________________________') 135 | f.write('_____________________________________________________' + '\n') 136 | if len(set(cpeaks['replicate'])) == 1: 137 | if cpeaks.name.tolist()[0] in markedasbad: 138 | print('the only replicate is considered bad!') 139 | f.write('the only replicate is considered bad!'+"\n") 140 | print('wrong TF: '+tf) 141 | f.write('wrong TF: '+tf+"\n") 142 | mergedpeaksdict.update({tf: cpeaks}) 143 | remove.append(tf) 144 | continue 145 | print("we only have one replicate for " + tf + " .. pass") 146 | f.write("we only have one replicate for " + tf + " .. pass"+"\n") 147 | mergedpeaksdict.update({tf: cpeaks}) 148 | continue 149 | print("merging " + tf + " peaks") 150 | f.write("merging " + tf + " peaks"+"\n") 151 | merged = simpleMergePeaks(cpeaks, window=window, maxp=False) 152 | merged_bed = merged[merged.columns[8:]] 153 | finalpeaks = merged[merged.columns[:8]] 154 | print('--> finish first overlaps lookup') 155 | f.write('--> finish first overlaps lookup'+"\n") 156 | # flag when biggest is <1000 peaks 157 | if len(finalpeaks) < 1000: 158 | print('!TF has less than 1000 PEAKS!') 159 | f.write('!TF has less than 1000 PEAKS!'+"\n") 160 | # for each TF (replicates), compute number of peaks 161 | peakmatrix = merged_bed.values.astype(bool) 162 | 163 | presence = [] 164 | for peakpres in peakmatrix.T: # https://github.com/tctianchi/pyvenn 165 | presence.append(set([i for i, val in enumerate(peakpres) if val == 1])) 166 | # compute overlap matrix (venn?) 
167 | if peakmatrix.shape[1] < 7 and doPlot: 168 | plot.venn(presence, [i+'_BAD' if i.split('-')[0] 169 | in markedasbad else i for i in merged_bed.columns], title=tf+"_before_venn", folder=saveloc) 170 | plt.show() 171 | else: 172 | print('too many replicates for Venn: '+str(peakmatrix.shape[1])) 173 | f.write('too many replicates for Venn: '+str(peakmatrix.shape[1])+"\n") 174 | if doPlot: 175 | fig = sns.pairplot(merged_bed, corner=True, diag_kind="kde", 176 | kind="reg", plot_kws={"scatter_kws": {"alpha": .05}}) 177 | #fig = fig.map_upper(col_nan_scatter) 178 | #fig = fig.map_upper(col_nan_kde_histo) 179 | plt.suptitle("correlation of peaks in each replicate", y=1.08) 180 | if saveloc: 181 | fig.savefig(saveloc+tf+"_before_pairplot.pdf") 182 | plt.show() 183 | for i, val in enumerate(merged_bed): 184 | unique_inval = np.logical_and( 185 | np.delete(peakmatrix, i, axis=1).sum(1).astype(bool) == 0, peakmatrix[:, i]) 186 | sns.kdeplot(merged_bed[val][unique_inval], legend=True).set(xlim=(0, None)) 187 | plt.title("distribution of unique peaks in each replicate") 188 | if saveloc: 189 | plt.savefig(saveloc+tf+"_before_unique_kdeplot.pdf") 190 | plt.show() 191 | 192 | bigwigs = os.listdir(bigwigfolder) 193 | 194 | foundgood = False 195 | sort = findBestPeak(presence) 196 | for ib, sb in enumerate(sort): 197 | if merged_bed.columns[sb].split('-')[0] not in markedasbad: 198 | foundgood = True 199 | break 200 | if not foundgood: 201 | print('no peaks were good enough quality') 202 | f.write('no peaks were good enough quality'+"\n") 203 | print('bad TF: '+tf) 204 | f.write('bad TF: '+tf+"\n") 205 | remove.append(tf) 206 | ib = 0 207 | # distplot 208 | # correlation plot 209 | 210 | biggest_ind = sort[ib] 211 | peakmatrix = peakmatrix.T 212 | biggest = merged_bed.columns[biggest_ind] 213 | print('-> main rep is: '+str(biggest)) 214 | f.write('-> main rep is: '+str(biggest)+'\n') 215 | tot = peakmatrix[biggest_ind].copy().astype(int) 216 | # starts with highest similarity and go descending 217 | j = 0 218 | recovered = 0 219 | additionalpeaksinbig = np.array([]) 220 | for i, val in enumerate(sort): 221 | if i == ib: 222 | continue 223 | j += 1 224 | # if avg non overlap > 60%, and first, and none small flag TF as unreliable. 
225 | overlap = len(presence[biggest_ind] & presence[val] 226 | ) / len(presence[biggest_ind]) 227 | peakname = merged_bed.columns[val] 228 | print('- '+peakname) 229 | f.write('- '+peakname+'\n') 230 | print(' overlap: ' + str(overlap*100)+"%") 231 | f.write(' overlap: ' + str(overlap*100)+"%"+'\n') 232 | if overlap < MINOVERLAP: 233 | smallsupport = len(presence[biggest_ind] & 234 | presence[val]) / len(presence[val]) 235 | print(' --> not enough overlap') 236 | f.write(' --> not enough overlap'+'\n') 237 | if smallsupport < MINOVERLAP: 238 | # if the secondary does not have itself the required support 239 | if j == 1 and merged_bed.columns[val].split('-')[0] not in markedasbad: 240 | print(" Wrong TF: "+tf) 241 | f.write(" Wrong TF: "+tf+'\n') 242 | remove.append(tf) 243 | break 244 | # if not first, throw the other replicate and continue 245 | print(" not using this replicate from the peakmatrix") 246 | f.write(" not using this replicate from the peakmatrix"+'\n') 247 | continue 248 | if lookeverywhere: 249 | tolookfor = peakmatrix[val] == 0 250 | else: 251 | tolookfor = np.logical_and(peakmatrix[biggest_ind], peakmatrix[val] == 0) 252 | # ones that we have in the Primary but not in the secondary 253 | additionalpeaksinsec = findAdditionalPeaks(finalpeaks, tolookfor, bigwigfolder + findpeakpath( 254 | bigwigfolder, peakname), sampling=sampling, mincov=mincov, window=window, minKL=minKL, use=use) 255 | if len(additionalpeaksinsec[additionalpeaksinsec > 0]) > 0: 256 | sns.kdeplot(additionalpeaksinsec[additionalpeaksinsec > 0], 257 | label=peakname, legend=True).set(xlim=(0, None)) 258 | print(' min,max from newly found peaks: ' + 259 | str((additionalpeaksinsec[additionalpeaksinsec > 0].min(), additionalpeaksinsec[additionalpeaksinsec > 0].max()))) 260 | f.write(' min,max from newly found peaks: '+str((additionalpeaksinsec[additionalpeaksinsec > 0].min( 261 | ), additionalpeaksinsec[additionalpeaksinsec > 0].max()))+'\n') 262 | # for testing purposes mainly 263 | finalpeaks[additionalpeaksinsec.astype(bool)].to_csv( 264 | 'additionalpeaksinsec_mp'+merged_bed.columns[val]+'.bed', sep='\t', index=None, header=False) 265 | peakmatrix[val] = np.logical_or( 266 | peakmatrix[val], additionalpeaksinsec.astype(bool)) 267 | overlap = np.sum(np.logical_and( 268 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[biggest_ind]) 269 | if overlap < MINOVERLAP: 270 | newsmalloverlap = np.sum(np.logical_and( 271 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[val]) 272 | print(" we did not had enough initial overlap.") 273 | f.write(" we did not had enough initial overlap."+'\n') 274 | if newsmalloverlap < MINOVERLAP: 275 | if merged_bed.columns[val].split('-')[0] in markedasbad: 276 | print(' replicate ' + 277 | merged_bed.columns[val] + ' was too bad and had not enough overlap') 278 | f.write(' replicate ' + 279 | merged_bed.columns[val] + ' was too bad and had not enough overlap'+'\n') 280 | continue 281 | elif h.askif("we have two good quality peaks that don't merge well at all: "+merged_bed.columns[val] + 282 | " and " + merged_bed.columns[biggest_ind] + " can the first one be removed?:\n \ 283 | overlap: "+str(overlap*100)+'%\n smalloverlap: '+str(smalloverlap*100)+'%\n new smalloverlap: '+str(newsmalloverlap*100)+"%"): 284 | continue 285 | else: 286 | print(" enough from small overlaps") 287 | f.write(" enough from small overlaps"+'\n') 288 | print(' --> enough overlap') 289 | f.write(' --> enough overlap'+'\n') 290 | recovered += 
np.sum(additionalpeaksinsec.astype(bool)) 291 | if merged_bed.columns[val].split('-')[0] not in markedasbad: 292 | tot += peakmatrix[val].astype(int) 293 | # ones that we have in the Primary but not in the secondary 294 | if not lookeverywhere or len(additionalpeaksinbig) == 0: 295 | tolookfor = peakmatrix[biggest_ind] == 0 if lookeverywhere else np.logical_and( 296 | peakmatrix[biggest_ind] == 0, peakmatrix[val]) 297 | additionalpeaksinbig = findAdditionalPeaks(finalpeaks, tolookfor, bigwigfolder + findpeakpath( 298 | bigwigfolder, biggest), sampling=sampling, mincov=mincov, window=window, minKL=minKL, use=use) 299 | if len(additionalpeaksinbig[additionalpeaksinbig > 0]) > 0: 300 | sns.kdeplot(additionalpeaksinbig[additionalpeaksinbig > 0], 301 | label=biggest, legend=True).set(xlim=(0, None)) 302 | print(' min,max from newly found peaks: ' + 303 | str((additionalpeaksinbig[additionalpeaksinbig > 0].min(), additionalpeaksinbig[additionalpeaksinbig > 0].max()))) 304 | f.write(' min,max from newly found peaks: '+str((additionalpeaksinbig[additionalpeaksinbig > 0].min( 305 | ), additionalpeaksinbig[additionalpeaksinbig > 0].max()))+'\n') 306 | 307 | peakmatrix[biggest_ind] = np.logical_or( 308 | peakmatrix[biggest_ind], additionalpeaksinbig) 309 | tot += additionalpeaksinbig.astype(bool).astype(int) 310 | recovered += np.sum(additionalpeaksinbig.astype(bool)) 311 | print(' we have recovered ' + str(recovered)+' peaks, equal to ' + str(100*recovered/np.sum(peakmatrix[biggest_ind])) + 312 | '% of the peaks in main replicate') 313 | f.write(' we have recovered ' + str(recovered)+' peaks, equal to ' + str(100*recovered/np.sum(peakmatrix[biggest_ind])) + 314 | '% of the peaks in main replicate'+'\n') 315 | if overlap < (MINOVERLAP+0.2)/1.2: 316 | # we recompute to see if the overlap changed 317 | newoverlap = np.sum(np.logical_and( 318 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[biggest_ind]) 319 | smalloverlap = np.sum(np.logical_and( 320 | peakmatrix[val], peakmatrix[biggest_ind]))/np.sum(peakmatrix[val]) 321 | if newoverlap < (MINOVERLAP+0.2)/1.2: 322 | if smalloverlap < (2+MINOVERLAP)/3: 323 | print(" not enough overlap to advice to merge the bams.\n oldnew overlap: "+str(overlap*100)+'%\n \ 324 | new overlap: '+str(newoverlap*100)+"%") 325 | f.write(" not enough overlap to advice to merge the bams.\n oldnew overlap: "+str(overlap*100)+'%\n \ 326 | new overlap: '+str(newoverlap*100)+"%"+'\n') 327 | continue 328 | else: 329 | print(' enough from small overlap to advice to merge the peaks') 330 | f.write(' enough from small overlap to advice to merge the peaks'+'\n') 331 | tomergebam.append([biggest, peakname]) 332 | #the quality is good enough in the end we can pop from the list if it exists 333 | if tf in remove: 334 | remove.remove(tf) 335 | plt.title('distribution of new found peaks') 336 | if saveloc: 337 | plt.savefig(saveloc+tf+"_new_found_peaks_kdeplot.pdf") 338 | plt.show() 339 | # new distplot 340 | # new correlation plot 341 | ratiosofunique[tf] = len(np.argwhere( 342 | peakmatrix.sum(0) == 1))/peakmatrix.shape[1] 343 | if doPlot: 344 | sns.pairplot(merged_bed, corner=True, diag_kind="kde", 345 | kind="reg", plot_kws={"scatter_kws": {"alpha": .05}}) 346 | #fig = fig.map_upper(col_nan_scatter) 347 | #fig = fig.map_upper(col_nan_kde_histo) 348 | plt.suptitle("correlation and distribution of peaks after recovery", y=1.08) 349 | if saveloc: 350 | plt.savefig(saveloc+tf+"_after_pairplot.pdf") 351 | plt.show() 352 | for i, val in enumerate(merged_bed): 353 | unique_inval 
= np.logical_and( 354 | np.delete(peakmatrix, i, axis=0).sum(0).astype(bool) == 0, peakmatrix[i]) 355 | sns.kdeplot(merged_bed[val][unique_inval], legend=True).set(xlim=(0, None)) 356 | plt.title("distribution of unique peaks in each replicate after recovery") 357 | if saveloc: 358 | plt.savefig(saveloc+tf+"_after_unique_kdeplot.pdf") 359 | plt.show() 360 | if len(peakmatrix.shape) > 1 and doPlot: 361 | if peakmatrix.shape[0] < 7: 362 | presence = [] 363 | for peakpres in peakmatrix: # https://github.com/tctianchi/pyvenn 364 | presence.append(set([i for i, val in enumerate(peakpres) if val == 1])) 365 | title = tf + '_recovered (TOREMOVE)' if tf in remove else tf+'_recovered' 366 | plot.venn(presence, [i+'_BAD' if i.split('-')[0] 367 | in markedasbad else i for i in merged_bed.columns], title=title, folder=saveloc) 368 | plt.show() 369 | else: 370 | print('too many replicates for Venn') 371 | f.write('(too many replicates for Venn)'+'\n') 372 | finalpeaks = finalpeaks[np.logical_or(tot > 1, peakmatrix[biggest_ind])] 373 | finalpeaks['name'] = biggest 374 | finalpeaks['tf'] = tf 375 | mergedpeaksdict.update({tf: finalpeaks}) 376 | print(str((tf, len(finalpeaks)))) 377 | f.write(str((tf, len(finalpeaks)))+'\n') 378 | mergedpeak = pd.concat( 379 | [peaks for _, peaks in mergedpeaksdict.items()]).reset_index(drop=True) 380 | if doPlot: 381 | df = pd.DataFrame(data=ratiosofunique, index=['percentage of unique']) 382 | df['proteins'] = df.index 383 | fig = sns.barplot(data=df) 384 | plt.xticks(rotation=60, ha='right') 385 | plt.title("ratios of unique in replicates across experiments") 386 | if saveloc: 387 | plt.savefig(saveloc+"All_ratios_unique.pdf") 388 | plt.show() 389 | f.close() 390 | mergedpeak['name'] = mergedpeak.tf 391 | return mergedpeak, tomergebam, remove, ratiosofunique 392 | 393 | 394 | def findAdditionalPeaks(peaks, tolookfor, filepath, sampling=1000, mincov=4, 395 | window=100, cov={}, minKL=8, use='max'): 396 | 397 | """ 398 | findAdditionalPeaks: for all peaks in A and/or B find in coverage file if zone has relative cov 399 | of more than thresh then add to peak 400 | if B is small and > 20% of peaks in A are found back, increase confidence and 401 | flag for mergeBams 402 | if < 20% don't flag for merge bam 403 | f B is big and now mean non overlap < 40%, take union and flag for mergeBam else, throw B. 
404 | 405 | Args: 406 | ----- 407 | peaks 408 | tolookfor 409 | filepath 410 | sampling 411 | mincov 412 | window 413 | cov 414 | minKL 415 | use 416 | returns: 417 | ------- 418 | np.array(bool) for each peaks in peakset, returns a binary 419 | """ 420 | # def poisson(k, lamb, scale): return scale * (lamb**k / factorial(k)) * np.exp(-lamb) 421 | 422 | def KLpoisson(lamb1, lamb2): return lamb1 * \ 423 | np.log(lamb1 / lamb2) + lamb2 - lamb1 424 | 425 | def poisson(k, lamb): return (lamb**k/factorial(k)) * np.exp(-lamb) 426 | 427 | def negLogLikelihood(params, data): return - \ 428 | np.sum(np.log(poisson(data, params[0]))) 429 | 430 | def poissonFit(data): return float( 431 | minimize(negLogLikelihood, x0=np.ones(1), args=(data,), method='Powell').x) 432 | bw = pyBigWig.open(filepath) 433 | res = np.zeros(len(peaks)) 434 | prevchrom = '' 435 | lamb = {} 436 | cov = {} 437 | #ignore by message 438 | warnings.filterwarnings("ignore", message="encountered in") 439 | for i, has in enumerate(tolookfor): 440 | if has: 441 | val = peaks.iloc[i] 442 | if val.chrom not in chroms: 443 | continue 444 | if val.chrom != prevchrom: 445 | if val.chrom not in cov: 446 | cov[val.chrom] = bw.stats(str(val.chrom))[0] 447 | prevchrom = val.chrom 448 | if use == 'poisson': 449 | #TODO: compute on INPUT file instead 450 | samples = np.zeros(window * sampling) 451 | sam = np.random.rand(sampling) 452 | sam = sam * (bw.chroms(str(val.chrom))-window) 453 | for j, sample in enumerate(sam.astype(int)): 454 | samples[j*window:(j + 1)*window] = np.nan_to_num( 455 | bw.values(str(val.chrom), sample, sample + window), 0) 456 | scale = np.unique(samples)[1] 457 | samples = (samples/scale).astype(int) 458 | lamb[val.chrom] = (poissonFit(samples), scale) 459 | 460 | start = max([val.start - window, 0]) 461 | end = min(val.end + window, bw.chroms(str(val.chrom))) 462 | zone = np.nan_to_num(bw.values(str(val.chrom), start, end), 0) 463 | if use == 'max': 464 | if max(zone) / cov[val.chrom] > mincov*1.5 or sum(zone) / (cov[val.chrom] * (end - start)) > mincov: 465 | res[i] = max(zone) / cov[val.chrom] 466 | elif use == 'poisson': 467 | #TODO: compute -log10pvalue 468 | la = poissonFit((zone/lamb[val.chrom][1]).astype(int)) 469 | kl = KLpoisson(la, lamb[val.chrom][0]) 470 | if kl > minKL: 471 | res[i] = max(zone) / cov[val.chrom] # foldchange from macs3 472 | 473 | return res 474 | -------------------------------------------------------------------------------- /genepy/epigenetics/README.md: -------------------------------------------------------------------------------- 1 | # epigenomics 2 | 3 | Especially targeted to functions related to the analysis of epigenomics data. It has functions to read, merge, denoise, ChIP seq data. 4 | 5 | ## Available functions: 6 | 7 | ### chipseq.py 8 | 9 | - bigWigFrom: run the bigwig command line for a set of bam files in a folder 10 | - ReadRoseSuperEnhancers: reads ROSE2's output and returns its superenhancer bedfile as a pd dataframe. 11 | - loadPeaks: loads 1 to many peak bedfile into one pandas dataframe. 12 | - simpleMergePeaks: simply merges bedfiles from peak callers. 
providing a concatenated dataframe of bed-like tables 13 | - putInBed: given a consensus bed-like dataframe and another one, will merge the second one into the first 14 | - pairwiseOverlap: compute pairwise overlap, and correlation on this overlap, for a set of peaks mapped to a consensus 15 | - enrichment: compute pairwise enrichment and correlation for a set of peaks mapped to a consensus 16 | - fullDiffPeak: will use macs3 to call differential peak binding from two bam files and their control 17 | - diffPeak: calls MACS2 bdgdiff given some parameters 18 | - MakeSuperEnhancers: Calls super enhancers from H3K27ac with the ROSE algorithm 19 | - runChromHMM: runs the chromHMM algorithm 20 | - loadMEMEmotifs: loads motifs from the output file of MEME after running fimo. 21 | - simpleMergeMotifs: aggregates motifs that overlap into one motif file 22 | - substractPeaksTo: removes all peaks that are not within a given bp distance of a set of loci 23 | 24 | ### CREME.py 25 | 26 | The goal of CREME is to be a simple, one-function tool. It works with one to many sets of replicates for each pulled-down protein/mark. 27 | 28 | CREME takes as input a pandas dataframe. This dataframe is the concatenation of each replicate's bed files and can be loaded from a set of MACS2 bedfiles using genepy's loadPeaks function (see the usage sketch below). 29 | 30 | CREME will output, among other things, a dataframe representing a concatenation of bedfiles of merged replicates. 31 | 32 | Find out more in __CREME.md__. 33 | 34 | ## highly recommended packages 35 | 36 | *This package won't contain anything that overlaps with those, and it might use those packages for what it is doing.* 37 | - Bedtools 38 | - deepTools 39 | - MACS2 40 | - ROSE 41 | - MEME 42 | - ChromHMM 43 | -------------------------------------------------------------------------------- /genepy/epigenetics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/__init__.py -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_before_pairplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_before_pairplot.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_before_venn_venn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_before_venn_venn.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/MED1_new_found_peaks_kdeplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/MED1_new_found_peaks_kdeplot.png -------------------------------------------------------------------------------- /genepy/epigenetics/docsCREME/igv-app-MED1-zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/epigenetics/docsCREME/igv-app-MED1-zoom.png
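To make the README's description of CREME concrete, here is a minimal, hedged usage sketch. The `mergeReplicatePeaks` signature and its four return values are taken from `CREME.py` in this repository; the exact arguments of `loadPeaks`, the folder paths, and the sample names are assumptions for illustration only.

```python
# Hypothetical usage sketch of the CREME workflow (paths, sample names and loadPeaks arguments are assumed).
from genepy.epigenetics import chipseq as chip
from genepy.epigenetics.CREME import mergeReplicatePeaks

# Assumption: loadPeaks concatenates MACS2 bed files into one dataframe carrying,
# at minimum, bed-like columns plus 'name', 'replicate' and 'tf' annotations.
peaks = chip.loadPeaks("results/macs2/")  # exact signature depends on your MACS2 output layout

merged, tomergebam, removed, ratios = mergeReplicatePeaks(
    peaks,
    bigwigfolder="data/bigwigs/",   # folder containing one bigwig per sample
    markedasbad=["MED1_rep2"],      # manually flagged low-quality samples (naming is illustrative)
    saveloc="results/creme/",       # where plots and the results.txt log are written
)
merged.to_csv("results/creme/merged_peaks.bed", sep="\t", index=False)
```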
-------------------------------------------------------------------------------- /genepy/epigenetics/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | from matplotlib import cm 6 | from genepy.epigenetics import chipseq as chip 7 | from genepy.utils import helper as h 8 | 9 | def plotAverageOfSamples(samples, folder="", showAll=False, maxv=None, minv=None): 10 | res = [] 11 | plt.figure() 12 | plt.ylim(minv,maxv) 13 | for sample in samples: 14 | data = pd.read_csv(sample, sep='\t', skiprows=1, header=None, names=['chr', 'start', 'end', 'name', "foldchange","."]+list(range(600))) 15 | r = data[list(range(600))].mean().tolist() 16 | res.append(r) 17 | if showAll: 18 | sns.lineplot(data=np.array(r), color="#BFBFFF") 19 | sns.lineplot(data=np.array(res).mean(0)) 20 | if folder: 21 | plt.savefig(folder+"_averageofsamples.pdf", color="#1F1FFF") 22 | return res 23 | 24 | 25 | def pysam_getPeaksAt(peaks, bams, folder='data/seqs/', window=1000, numpeaks=1000, numthreads=8): 26 | 27 | # get pysam data 28 | # ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 29 | # for each counts, do a rolling average (or a convolving of the data) with numpy 30 | # append to an array 31 | # return array, normalized 32 | loaded = {} 33 | res = {i: np.zeros((len(peaks), window * 2)) for i in bams} 34 | peaks = peaks.sort_values(by="foldchange", ascending=False).iloc[:numpeaks] 35 | peaks.chrom = peaks.chrom.astype(str) 36 | for val in bams: 37 | loaded.update({val: pysam.AlignmentFile( 38 | folder + val, 'rb', threads=numthreads)}) 39 | for k, bam in loaded.items(): 40 | for num, (i, val) in enumerate(peaks.iterrows()): 41 | print(int(num / len(peaks)), end='\r') 42 | center = int((val['start'] + val['end']) / 2) 43 | for pileupcolumn in bam.pileup(val['chrom'], start=center - window, 44 | stop=center + window, truncate=True): 45 | res[k][num][pileupcolumn.pos - (center - window)] = pileupcolumn.n 46 | fig, ax = plt.subplots(1, len(res)) 47 | for i, (k, val) in enumerate(res.items()): 48 | sns.heatmap(val, ax=ax[i]) 49 | ax[i].set_title(k.split('.')[0]) 50 | fig.show() 51 | return res, fig 52 | 53 | 54 | def bedtools_getPeaksAt(peaks, bams, folder='data/seqs/', window=1000, numpeaks=1000, numthreads=8): 55 | """ 56 | get pysam data 57 | ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 58 | for each counts, do a rolling average (or a convolving of the data) with numpy 59 | append to an array 60 | return array, normalized 61 | """ 62 | loaded = {} 63 | center = [int((val['start'] + val['end']) / 2) for k, val in peaks.iterrows()] 64 | peaks['start'] = [c - window for c in center] 65 | peaks['end'] = [c + window - 1 for c in center] 66 | peaks[peaks.columns[:3]].sort_values(by=['chrom', 'start']).to_csv( 67 | 'temp/peaks.bed', sep='\t', index=False, header=False) 68 | bedpeaks = BedTool('temp/peaks.bed') 69 | 70 | fig, ax = plt.subplots(1, len(bams)) 71 | peakset = peaks["foldchange"].values.argsort()[::-1][:numpeaks] 72 | for i, val in enumerate(bams): 73 | coverage = BedTool(folder + val).intersect(bedpeaks).genome_coverage(bga=True, split=True)\ 74 | .intersect(bedpeaks).to_dataframe(names=['chrom', 'start', 'end', 'coverage']) 75 | cov = np.zeros((len(peaks), window * 2), dtype=int) 76 | j = 0 77 | pdb.set_trace() 78 | for i, (k, val) in enumerate(peaks.iterrows()): 79 | 
print(i / len(peaks), end='\r') 80 | while coverage.iloc[j].start > val.start: 81 | j -= 1 82 | while coverage.iloc[j].start < val.end: 83 | cov[i][coverage.iloc[j].start - val.start:coverage.iloc[j].end - val.start] =\ 84 | coverage.iloc[j].coverage 85 | j += 1 86 | sns.heatmap(coverage, ax=ax[i]) 87 | ax[i].set_title(val.split('.')[0]) 88 | fig.show() 89 | return None, fig 90 | 91 | 92 | def makeProfiles(matx=[], folder='', matnames=[], title='', 93 | name='temp/peaksat.pdf', refpoint="TSS", scale=None, 94 | sort=False, withDeeptools=True, cluster=1, vmax=None, vmin=None, overlap=False, 95 | legendLoc=None): 96 | if withDeeptools: 97 | if not (len(matnames) == 2 and len(matx) == 2): 98 | raise ValueError('you need two mat.gz files and two names') 99 | h.createFoldersFor(name) 100 | cmd = 'computeMatrixOperations relabel -m ' 101 | cmd += matx[0] + ' -o '+matx[0]+' --groupLabels '+matnames[0] 102 | cmd += ' && computeMatrixOperations relabel -m ' 103 | cmd += matx[1] + ' -o '+matx[1]+' --groupLabels '+matnames[1] 104 | cmd += ' && computeMatrixOperations rbind -m ' 105 | cmd += matx[0] + ' ' + matx[1] + " -o " + \ 106 | '.'.join(name.split('.')[:-1]) + ".gz" 107 | cmd += ' && plotProfile' 108 | cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz" 109 | cmd += " --outFileName " + name 110 | cmd += " --refPointLabel " + refpoint 111 | if vmax is not None: 112 | cmd += " -max "+str(vmax) 113 | if vmin is not None: 114 | cmd += " -min "+str(vmin) 115 | if cluster > 1: 116 | cmd += " --perGroup --kmeans "+str(cluster) 117 | if legendLoc: 118 | cmd += " --legendLocation "+legendLoc 119 | if title: 120 | cmd += " --plotTitle " + title 121 | data = subprocess.run(cmd, shell=True, capture_output=True) 122 | print(data) 123 | 124 | 125 | def getPeaksAt(peaks, bigwigs, folder='', bigwignames=[], peaknames=[], window=1000, title='', numpeaks=4000, numthreads=8, 126 | width=5, length=10, torecompute=False, name='temp/peaksat.pdf', refpoint="TSS", scale=None, 127 | sort=False, withDeeptools=True, onlyProfile=False, cluster=1, vmax=None, vmin=None, overlap=False, 128 | legendLoc=None): 129 | """ 130 | get pysam data 131 | ask for counts only at specific locus based on windows from center+-size from sorted MYC peaks 132 | for each counts, do a rolling average (or a convolving of the data) with numpy 133 | append to an array 134 | return array, normalized 135 | """ 136 | if withDeeptools: 137 | if isinstance(peaks, pd.DataFrame): 138 | peaks = 'peaks.bed ' 139 | peaks.to_csv('peaks.bed', sep='\t', index=False, header=False) 140 | elif type(peaks) == list: 141 | pe = '' 142 | i = 0 143 | for n, p in enumerate(peaks): 144 | if 20 < int(os.popen('wc -l ' + p).read().split(' ')[0]): 145 | pe += p + ' ' 146 | elif len(peaknames) > 0: 147 | peaknames.pop(n-i) 148 | i += 1 149 | peaks = pe 150 | elif type(peaks) == str: 151 | peaks += ' ' 152 | else: 153 | raise ValueError(' we dont know this filetype') 154 | if type(bigwigs) is list: 155 | pe = '' 156 | for val in bigwigs: 157 | pe += folder + val + ' ' 158 | bigwigs = pe 159 | else: 160 | bigwigs = folder + bigwigs + ' ' 161 | h.createFoldersFor(name) 162 | cmd = '' 163 | if not os.path.exists('.'.join(name.split('.')[:-1]) + ".gz") or torecompute: 164 | cmd += "computeMatrix reference-point -S " 165 | cmd += bigwigs 166 | cmd += " --referencePoint "+refpoint 167 | cmd += " --regionsFileName " + peaks 168 | cmd += " --missingDataAsZero" 169 | cmd += " --outFileName " + '.'.join(name.split('.')[:-1]) + ".gz" 170 | cmd += " --upstream " + str(window) 
+ " --downstream " + str(window) 171 | cmd += " --numberOfProcessors " + str(numthreads) + ' && ' 172 | cmd += "plotHeatmap" if not onlyProfile else 'plotProfile' 173 | if type(name) is list: 174 | if not onlyProfile: 175 | raise ValueError('needs to be set to True, can\'t average heatmaps') 176 | cmd += " --matrixFile " + '.gz '.join(name) + ".gz" 177 | if average: 178 | cmd += "--averageType mean" 179 | else: 180 | cmd += " --matrixFile " + '.'.join(name.split('.')[:-1]) + ".gz" 181 | cmd += " --outFileName " + name 182 | cmd += " --refPointLabel " + refpoint 183 | if vmax is not None: 184 | cmd += " -max "+str(vmax) 185 | if vmin is not None: 186 | cmd += " -min "+str(vmin) 187 | if cluster > 1: 188 | cmd += " --perGroup --kmeans "+str(cluster) 189 | if overlap: 190 | if onlyProfile: 191 | cmd += " --plotType overlapped_lines" 192 | else: 193 | raise ValueError("overlap only works when onlyProfile is set") 194 | if legendLoc: 195 | cmd += " --legendLocation "+legendLoc 196 | 197 | if len(peaknames) > 0: 198 | pe = '' 199 | for i in peaknames: 200 | pe += ' ' + i 201 | cmd += " --regionsLabel" + pe 202 | if type(bigwigs) is list: 203 | if len(bigwignames) > 0: 204 | pe = '' 205 | for i in bigwignames: 206 | pe += ' "' + i + '"' 207 | cmd += " --samplesLabel" + pe 208 | if title: 209 | cmd += " --plotTitle '"+title+"'" 210 | data = subprocess.run(cmd, shell=True, capture_output=True) 211 | print(data) 212 | else: 213 | if 'relative_summit_pos' in peaks.columns: 214 | center = [int((val['start'] + val['relative_summit_pos'])) 215 | for k, val in peaks.iterrows()] 216 | else: 217 | center = [int((val['start'] + val['end']) / 2) 218 | for k, val in peaks.iterrows()] 219 | pd.set_option('mode.chained_assignment', None) 220 | peaks['start'] = [c - window for c in center] 221 | peaks['end'] = [c + window for c in center] 222 | fig, ax = plt.subplots(1, len(bigwigs), figsize=[ 223 | width, length], title=title if title else 'Chip Heatmap') 224 | if sort: 225 | peaks = peaks.sort_values(by=["foldchange"], ascending=False) 226 | if numpeaks > len(peaks): 227 | numpeaks = len(peaks) - 1 228 | cov = {} 229 | maxs = [] 230 | for num, bigwig in enumerate(bigwigs): 231 | bw = pyBigWig.open(folder + bigwig) 232 | co = np.zeros((numpeaks, window * 2), dtype=int) 233 | scale = scale[bigwig] if scale is dict else 1 234 | for i, (k, val) in enumerate(peaks.iloc[:numpeaks].iterrows()): 235 | try: 236 | co[i] = np.nan_to_num(bw.values(str(val.chrom), val.start, val.end), 0) 237 | except RuntimeError as e: 238 | print(str(val.chrom), val.start, val.end) 239 | pass 240 | cov[bigwig] = co 241 | maxs.append(co.max()) 242 | for num, bigwig in enumerate(bigwigs): 243 | sns.heatmap(cov[bigwig] * scale, ax=ax[num], vmax=max(maxs), yticklabels=[], cmap=cmaps[num], 244 | cbar=True) 245 | ax[num].set_title(bigwig.split('.')[0]) 246 | fig.subplots_adjust(wspace=0.1) 247 | fig.show() 248 | fig.savefig(name) 249 | return cov, fig 250 | 251 | 252 | def andrew(groups, merged, annot, enr=None, pvals=None, cols=8, precise=True, title = "sorted clustermap of cobindings clustered", folder="", rangeval=4, okpval=10**-3, size=(20,15),vmax=3, vmin=0): 253 | if enr is None or pvals is None: 254 | enr, pvals = chip.enrichment(merged, groups=groups) 255 | rand = np.random.choice(merged.index,5000) 256 | subgroups = groups[rand] 257 | sorting = np.argsort(subgroups) 258 | redblue = cm.get_cmap('RdBu_r',256) 259 | subenr = enr.iloc[annot-cols:] 260 | subenr[subenr>rangeval]=rangeval 261 | subenr[subenr<-rangeval]=-rangeval 262 | subenr 
= subenr/rangeval 263 | data = [] 264 | #colors = [] 265 | impv = pvals.values 266 | for i in subgroups[sorting]: 267 | #colors.append(viridis(i)) 268 | a = redblue((128+(subenr[i]*128)).astype(int)).tolist() 269 | for j in range(len(a)): 270 | a[j] = [1.,1.,1.,1.] if impv[j,i] > okpval else a[j] 271 | data.append(a) 272 | data = pd.DataFrame(data=data,columns=list(subenr.index),index= rand[sorting]) 273 | #data["clusters"] = colors 274 | 275 | a = np.log2(1.01+merged[merged.columns[cols:annot]].iloc[rand].iloc[sorting].T) 276 | if not precise: 277 | for i in set(groups): 278 | e = a[a.columns[subgroups[sorting]==i]].mean(1) 279 | e = pd.DataFrame([e for i in range((subgroups[sorting]==i).sum())]).T 280 | a[a.columns[subgroups[sorting]==i]] = e 281 | 282 | fig = sns.clustermap(a, vmin=vmin, vmax=vmax, figsize=size, z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False) 283 | fig.ax_col_dendrogram.set_visible(False) 284 | fig.fig.suptitle(title) 285 | fig.savefig(folder + str(len(set(groups))) + '_clustermap_cobinding_enrichment_andrewplot.pdf') 286 | plt.show() 287 | -------------------------------------------------------------------------------- /genepy/google/README.md: -------------------------------------------------------------------------------- 1 | # google 2 | 3 | ## contains 4 | 5 | _in ./gcp.py_ 6 | 7 | - mvFiles: move files to a folder. 8 | - lsFiles: list all files. 9 | - cpFiles: copy many files to a foler. 10 | - catFiles: get data in many files. 11 | - rmFiles: remove many files. 12 | - recoverFiles: if bucket has versioning enabled, retrieve list of files that have been deleted. 13 | - patternRN: following a renaminig dict, rename a bunch of files in a set of locations. 14 | - get_all_sizes: get file sizes in a folder. 15 | - exists: given a list of file paths, get if files exist or not. 16 | - extractSize: extract file size from ls command. 17 | - extractPath: extract file path from ls command. 18 | - extractHash: extract file hash from ls command. 19 | 20 | _in google\_sheet.py_ 21 | 22 | - dfToSheet: uploads a given dataframe to a given googleSheet location 23 | 24 | GSheet (class) *WIP* 25 | - get_last_modified_date 26 | - get_size 27 | - read_sheet 28 | - read_row 29 | - read_column 30 | - write_column 31 | 32 | # highly recommended: 33 | 34 | - gsutil 35 | - pygsheet -------------------------------------------------------------------------------- /genepy/google/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/google/__init__.py -------------------------------------------------------------------------------- /genepy/google/gcp.py: -------------------------------------------------------------------------------- 1 | # GCPFunction.py 2 | 3 | from google.cloud import storage 4 | import os 5 | import subprocess 6 | import re 7 | from genepy.utils import helper as h 8 | import signal 9 | 10 | 11 | def list_blobs_with_prefix(bucket_name, prefix, delimiter=None): 12 | """Lists all the blobs in the bucket that begin with the prefix. 13 | 14 | This can be used to list all blobs in a "folder", e.g. "public/". 15 | 16 | The delimiter argument can be used to restrict the results to only the 17 | "files" in the given "folder". Without the delimiter, the entire tree under 18 | the prefix is returned. 
For example, given these blobs: 19 | 20 | /a/1.txt 21 | /a/b/2.txt 22 | 23 | If you just specify prefix = '/a', you'll get back: 24 | 25 | /a/1.txt 26 | /a/b/2.txt 27 | 28 | However, if you specify prefix='/a' and delimiter='/', you'll get back: 29 | 30 | /a/1.txt 31 | 32 | """ 33 | storage_client = storage.Client() 34 | bucket = storage_client.get_bucket(bucket_name) 35 | ret = [] 36 | blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter) 37 | for blob in blobs: 38 | ret.append(blob.name) 39 | return ret 40 | 41 | 42 | def mvFiles(files, location, group=50, listen_to_errors=False): 43 | """ 44 | move a set of files in parallel (when the set is huge) 45 | 46 | Args: 47 | ---- 48 | files: gs paths 49 | location: to move the files to 50 | group: files to do in parallel 51 | """ 52 | by = len(files) if len(files) < group else group 53 | for sfiles in h.grouped(files, by): 54 | a = "" 55 | for val in sfiles: 56 | a += val + " " 57 | code = os.system("gsutil -m mv " + a + location) 58 | if code != 0 and listen_to_errors: 59 | print("pressed ctrl+c or command failed") 60 | break 61 | 62 | 63 | def lsFiles(files, add="", group=50): 64 | """ 65 | list a set of files in parallel (when the set is huge) 66 | 67 | Args: 68 | ---- 69 | files: gs paths 70 | add: additional params to add 71 | group: files to do in parallel 72 | """ 73 | print("listing files in gs") 74 | by = len(files) if len(files) < group else group 75 | res = [] 76 | for sfiles in h.grouped(files, by): 77 | a = "" 78 | for val in sfiles: 79 | a += val + " " 80 | data = subprocess.run( 81 | "gsutil -m ls " + add + " '" + a + "'", capture_output=True, shell=True 82 | ) 83 | if data.returncode != 0: 84 | if "One or more URLs matched no objects" not in str(data.stderr): 85 | raise ValueError("issue with the command: " + str(data.stderr)) 86 | if len(str(data.stdout)) < 4: 87 | return [] 88 | res += ( 89 | str(data.stdout)[2:-1].split("\\n")[:-1] 90 | if "L" not in add 91 | else ["gs://" + i for i in str(data.stdout).split("\\ngs://")] 92 | ) 93 | if "TOTAL:" in res[-1] and "L" not in add: 94 | res = res[:-1] 95 | return res 96 | 97 | 98 | def cpFiles(files, location, group=50): 99 | """ 100 | copy a set of files in parallel (when the set is huge) 101 | 102 | Args: 103 | ---- 104 | files: gs paths 105 | location to copy 106 | group: files to do in parallel 107 | """ 108 | by = len(files) if len(files) < group else group 109 | for sfiles in h.grouped(files, by): 110 | a = "" 111 | for val in sfiles: 112 | a += val + " " 113 | code = os.system("gsutil -m cp " + a + location) 114 | if code != 0: 115 | print("pressed ctrl+c or command failed") 116 | break 117 | 118 | 119 | def catFiles(files, group=50, split=False, cut=False): 120 | """ 121 | copy a set of files in parallel (when the set is huge) 122 | 123 | Args: 124 | ---- 125 | files: gs paths 126 | location to copy 127 | group: files to do in parallel 128 | cut: split all lines into chunks of size cut 129 | split: split lines by split e.g. 
\\n 130 | """ 131 | by = len(files) if len(files) < group else group 132 | res = [] 133 | for i, sfiles in enumerate(h.grouped(files, by)): 134 | print(i / (len(files) / by)) 135 | a = "" 136 | for val in sfiles: 137 | a += val + " " 138 | data = subprocess.run("gsutil -m cat " + a, capture_output=True, shell=True) 139 | if data.returncode != 0: 140 | if "One or more URLs matched no objects" not in str(data.stderr): 141 | print(ValueError("issue with the command: " + str(data.stderr))) 142 | return res 143 | if len(str(data.stdout)) < 4: 144 | return [] 145 | resa = str(data.stdout)[2:-1] 146 | if cut: 147 | res += [resa[i * cut : (i + 1) * cut] for i in range(int(len(resa) / cut))] 148 | elif split: 149 | res += resa.split(split) 150 | else: 151 | res += [resa] 152 | return res 153 | 154 | 155 | def rmFiles(files, group=50, add="", dryrun=True): 156 | """ 157 | remove a set of files in parallel (when the set is huge) 158 | 159 | Args: 160 | ---- 161 | files: gs paths 162 | group: number to do in parallel 163 | add: additional gsutil cp params 164 | """ 165 | by = len(files) if len(files) < group else group 166 | for sfiles in h.grouped(files, by): 167 | a = "" 168 | for val in sfiles: 169 | a += " " + val 170 | if add: 171 | add = " " + add 172 | if dryrun: 173 | print("gsutil -m rm" + add + a) 174 | else: 175 | code = os.system("gsutil -m rm" + add + a) 176 | if code != 0: 177 | print("pressed ctrl+c or command failed") 178 | break 179 | 180 | 181 | def recoverFiles(files, cores=1): 182 | """ 183 | recover a set of files in parallel that were erased 184 | 185 | files need to have their #id appended found using ls -al file 186 | 187 | Args: 188 | ---- 189 | files: gs paths 190 | location: to move the files to 191 | """ 192 | cmd = ["gsutil mv " + f + " " + f.split("#")[0] for f in files] 193 | h.parrun(cmd, cores=cores) 194 | 195 | 196 | def folderRN(gspath, newpath, cores=1): 197 | """ """ 198 | lis = lsFiles([gspath]) 199 | if lis != 0: 200 | h.parrun(["gsutil -m mv " + val + " " + newpath for val in lis], cores=cores) 201 | else: 202 | raise ValueError("no such folder") 203 | 204 | 205 | def patternRN( 206 | rename_dict, 207 | location, 208 | wildcards, 209 | types=[], 210 | dryrun=True, 211 | check_dependencies=True, 212 | cores=1, 213 | ): 214 | """ 215 | rename/move a bunch of GCP objects found in some specific places 216 | 217 | Args: 218 | ----- 219 | rename_dict: dict(prevName,newName) 220 | location: 221 | wildcards: list[str] can be one of ['**', '.*', '*.','-.*'] if needs to be 222 | ** means any occurence of this file in any folder will change its name 223 | .* means all file unregarding of the suffix, will rename them all a.bam [a]da.bai to b.bam, [b]da.bai 224 | *. means all files with the suffix, will change the suffix of these files from a to b 225 | -.* means all file unregarding of the suffix, will rename them. not just replacing the a part with a to b but the full file name [a]dea.bam to b.bam 226 | types: Nothing yet 227 | test: if test, just shows the command but does not run it 228 | cores: cores tells on how many processor to parallelize the tas#k 229 | """ 230 | val = [] 231 | for k, v in rename_dict.items(): 232 | val.append(v) 233 | if k in val and check_dependencies: 234 | raise ValueError("circular dependency in the rename with key " + k) 235 | for k, v in rename_dict.items(): 236 | loc = location 237 | if "**" in wildcards: 238 | loc += "**/" 239 | if "*." 
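# A hedged sketch of recoverFiles above: it expects the generation-suffixed
# paths printed by `gsutil ls -a` on a versioning-enabled bucket; the paths
# below are made-up placeholders.
def _example_recover_deleted():
    deleted = [
        "gs://my-bucket/results/sample1.bam#1612345678901234",
        "gs://my-bucket/results/sample2.bam#1612345678905678",
    ]
    recoverFiles(deleted, cores=2)  # runs `gsutil mv <path#generation> <path>` in parallel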
in wildcards or "-.*" in wildcards: 240 | loc += "*" 241 | loc += k 242 | if ".*" in wildcards or "-.*" in wildcards: 243 | loc += "*" 244 | res = os.popen("gsutil -m ls " + loc).read().split("\n")[:-1] 245 | print("found " + str(len(res)) + " files to rename") 246 | if "-.*" in wildcards: 247 | cmd = [ 248 | "gsutil mv " 249 | + val 250 | + " " 251 | + "/".join(val.split("/")[:-1]) 252 | + "/" 253 | + v 254 | + "." 255 | + ".".join(val.split("/")[-1].split(".")[1:]) 256 | for val in res 257 | ] 258 | else: 259 | cmd = ["gsutil mv " + val + " " + val.replace(k, v) for val in res] 260 | if dryrun: 261 | print(cmd) 262 | else: 263 | h.parrun(cmd, cores=cores) 264 | 265 | 266 | def get_all_sizes(folder, suffix="*"): 267 | """ 268 | will sort and list all the files by their sizes. 269 | 270 | If some files have the same size, will list them together 271 | 272 | Args: 273 | ---- 274 | folder: gs folder path 275 | suffix: of a specific file type 276 | 277 | Returns: 278 | ------- 279 | dict(sizes:[paths]) 280 | """ 281 | samples = os.popen("gsutil -m ls -al " + folder + "**." + suffix).read().split("\n") 282 | # compute size filepath 283 | sizes = { 284 | "gs://" 285 | + val.split("gs://")[1].split("#")[0]: int( 286 | re.split("\d{4}-\d{2}-\d{2}", val)[0] 287 | ) 288 | for val in samples[:-2] 289 | } 290 | names = {} 291 | for k, val in sizes.items(): 292 | if val in names: 293 | names[val].append(k) 294 | else: 295 | names[val] = [k] 296 | if names == {}: 297 | # we didn't find any valid file paths 298 | print("We didn't find any valid file paths in folder: " + str(folder)) 299 | return names 300 | 301 | 302 | def exists(val): 303 | """ 304 | tells if a gcp path exists 305 | """ 306 | if type(val) is str: 307 | return os.popen("gsutil ls " + val).read().split("\n")[0] == val 308 | elif type(val) is list: 309 | rest = set(val) - set(lsFiles(val)) 310 | return len(rest) == 0, rest 311 | 312 | 313 | def extractSize(val): 314 | """ 315 | extract the size from the string returned by an ls -l|a command 316 | """ 317 | return "gs://" + val.split("gs://")[1].split("#")[0], int( 318 | re.split("\d{4}-\d{2}-\d{2}", val)[0] 319 | ) 320 | 321 | 322 | def extractTime(val): 323 | """ 324 | extract the size from the string returned by an ls -l|a command 325 | """ 326 | return val.split(" ")[1].split("T")[0] 327 | 328 | 329 | def extractPath(val): 330 | """ 331 | extract the path from the string returned by an ls -l|a command 332 | """ 333 | return "gs://" + val.split("gs://")[1].split("#")[0] 334 | 335 | 336 | def extractHash(val, typ="crc32c"): 337 | """ 338 | extract the crc32 from the string returned by an ls -L command 339 | 340 | Args: 341 | ---- 342 | type: flag ['crc32c','md5'] 343 | """ 344 | if " Hash (crc32c):" in val and typ == "crc32c": 345 | return ( 346 | val.split(" Hash (crc32c): ")[-1] 347 | .split("\\\\n")[0] 348 | .split("\\n")[0] 349 | ) 350 | elif " Hash (md5):" in val and typ == "md5": 351 | return ( 352 | val.split(" Hash (md5): ")[-1].split("\\\\n")[0].split("\\n")[0] 353 | ) 354 | else: 355 | return None 356 | 357 | 358 | async def shareFiles(flist, users): 359 | """ 360 | will share a list of files from gcp with a set of users. 
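# A dry-run sketch of the wildcard modes patternRN documents above; the bucket
# and the rename pair are hypothetical. With dryrun=True the generated
# `gsutil mv` commands are only printed, nothing is moved.
def _example_rename_pattern():
    patternRN(
        {"old_sample_A": "new_sample_A"},  # prevName -> newName
        "gs://my-bucket/wes/",
        wildcards=["**", ".*"],            # any subfolder depth, any file suffix
        dryrun=True,
    )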
361 | 362 | Args: 363 | ---- 364 | users: list[str] of users' google accounts 365 | flist: list[str] of google storage path for which you want to share data 366 | 367 | """ 368 | if type(users) is str: 369 | users = [users] 370 | for user in users: 371 | files = "" 372 | for i in flist: 373 | files += " " + i 374 | code = os.system("gsutil -m acl ch -ru " + user + ":R " + files) 375 | if code == signal.SIGINT: 376 | print("Awakened") 377 | break 378 | print("the files are stored here:\n\n") 379 | print(flist) 380 | print("\n\njust install and use gsutil to copy them") 381 | print("https://cloud.google.com/storage/docs/gsutil_install") 382 | print("https://cloud.google.com/storage/docs/gsutil/commands/cp") 383 | 384 | 385 | def deleteOldVersions(path, onlymetagene=None, **kwargs): 386 | """ 387 | given a path to a folder in google cloud storage, will delete all the old versions of the files in the path. 388 | """ 389 | data = subprocess.run("gsutil -m ls -alh " + path, capture_output=True, shell=True) 390 | if data.returncode != 0: 391 | if "One or more URLs matched no objects" not in str(data.stderr): 392 | print(ValueError("issue with the command: " + str(data.stderr))) 393 | return res 394 | if len(str(data.stdout)) < 4: 395 | return [] 396 | resa = str(data.stdout)[2:-1].split("\\n")[:-2] 397 | torm = [] 398 | for i, val in enumerate(resa): 399 | if onlymetagene is not None: 400 | print("torework") 401 | # if "metageneration=" + str(metagene) in val: 402 | # name = "gs://" + val.split(" gs://")[1].split(" ")[0] 403 | # torm.append(name) 404 | # else: 405 | # prevname = torm[-1].split("#")[0] 406 | # name = "gs://" + val.split(" gs://")[1].split("#")[0] 407 | # if prevname != name: 408 | # print(prevname + " is unique and won't be deleted") 409 | # torm.pop() 410 | else: 411 | if "metageneration=" + str(1) not in val: 412 | name = "gs://" + val.split(" gs://")[1].split(" ")[0] 413 | torm.append(name) 414 | if ( 415 | "gs://" + resa[i + 1].split(" gs://")[1].split("#")[0] 416 | != name.split("#")[0] 417 | ): 418 | print(name + " is unique and won't be deleted") 419 | torm.pop() 420 | print(h.dups([val.split("#")[0] for val in torm])) 421 | 422 | return rmFiles(torm, **kwargs) 423 | -------------------------------------------------------------------------------- /genepy/google/good-retention.json: -------------------------------------------------------------------------------- 1 | { 2 | "lifecycle": { 3 | "rule": [ 4 | { 5 | "action": { 6 | "type": "Delete" 7 | }, 8 | "condition": { 9 | "daysSinceNoncurrentTime": 90 10 | } 11 | }, 12 | { 13 | "action": { 14 | "type": "Delete" 15 | }, 16 | "condition": { 17 | "numNewerVersions": 2 18 | } 19 | } 20 | ] 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /genepy/google/google_sheet.py: -------------------------------------------------------------------------------- 1 | import gspread 2 | from oauth2client.service_account import ServiceAccountCredentials 3 | 4 | scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets', 5 | "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"] 6 | 7 | 8 | def dfToSheet(df, sheetid, secret='~/.credentials.json'): 9 | credentials = ServiceAccountCredentials.from_json_keyfile_name(secret, scope) 10 | client = gspread.authorize(credentials) 11 | spreadsheet = client.open(sheetid) 12 | df.to_csv('/tmp/sheet.csv') 13 | with open("/tmp/sheet.csv", 'r') as file_obj: 14 | content = file_obj.read() 15 
| client.import_csv(spreadsheet.id, data=content) 16 | -------------------------------------------------------------------------------- /genepy/google/gsheet_upload.py: -------------------------------------------------------------------------------- 1 | import gspread 2 | from oauth2client.service_account import ServiceAccountCredentials 3 | 4 | scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets', 5 | "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"] 6 | 7 | credentials = ServiceAccountCredentials.from_json_keyfile_name('~/.client_secret.json', scope) 8 | client = gspread.authorize(credentials) 9 | 10 | spreadsheet = client.open('https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE') 11 | 12 | with open(file, 'r') as file_obj: 13 | content = file_obj.read() 14 | client.import_csv(spreadsheet.id, data=content) 15 | -------------------------------------------------------------------------------- /genepy/imaging/fish.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial import distance_matrix 3 | from genepy.utils import helper as h 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pandas as pd 7 | 8 | 9 | # make a plot of averaged binned signal strength by distance from locis 10 | def computeDistsFromClass(dots, seconddots, conds=['DMSO', 'VHL'], groupcol="group", 11 | sclass='green', signal="mean_green", area="area"): 12 | """ 13 | """ 14 | dists= {} 15 | twodists = {} 16 | for val in set(dots.exp): 17 | for e in conds: 18 | d = dots[(dots.exp==val)&(dots.treat==e)] 19 | dist = [] 20 | weight = [] 21 | newdist = [] 22 | ind=[] 23 | m = seconddots[(seconddots.exp==val)&(seconddots.treat==e)] 24 | print(val, e) 25 | for i,(k, v) in enumerate(m.iterrows()): 26 | h.showcount(i, len(m)) 27 | dist.append( 28 | distance_matrix(d[(d['class']==sclass)& 29 | (d[groupcol]==v[groupcol])][['x', "y", "z"]].values, 30 | np.array([v[['x_mean', "y_mean", "z_mean"]]])).T[0].astype(float)) 31 | weight.append(d[(d['class'] == sclass)&(d[groupcol]==v[groupcol])][signal]) 32 | dat = d[(d['class'] == sclass) & 33 | (d[groupcol] == v[groupcol])][['x', "y", "z", signal, area, "m_id"]] 34 | a = dat.values 35 | a[:,:3] = a[:,:3] - v[['x_mean', "y_mean", "z_mean"]].values 36 | newdist.append(a) 37 | ind.extend(dat.index.tolist()) 38 | twodists[val+e] = pd.DataFrame(data=np.vstack(newdist), 39 | columns=['x', 'y', 'z', signal, area, "m_id"], 40 | index=ind) 41 | dists[val+e] = [np.hstack(dist), np.hstack(weight)] 42 | return twodists, dists 43 | 44 | 45 | def drawDots(dists, scenter=False, size=1000, zsize=1000, 46 | folder="", signal="signal", levels=20, 47 | area="area", vmin=None, vmax=None, 48 | norm=None, norm_dots=None, second=None, 49 | color="seagreen", 50 | seccolor=sns.light_palette("orange", as_cmap=True), **kwargs): 51 | """ 52 | """ 53 | sm = [] 54 | m = [] 55 | sca=1.2 56 | if second is not None: 57 | for _, a in dists.items(): 58 | sm.append(a[second(a)][signal].max()) 59 | for _, a in dists.items(): 60 | m.append(a[signal].mean()) 61 | for i, (k,a) in enumerate(dists.items()): 62 | a = a.copy() 63 | a[area] = ((a[area]/(3.14))**(1/2)).astype(float) 64 | 65 | a = a[(abs(a.x) 0).T[0].tolist() 119 | # we get all its connections 120 | con_val = gdot.iloc[con] 121 | ids = list(set(con_val[mergedidcol]) - set([None])) 122 | # if connections are already connected we use this id 123 | if len(ids) > 0: 
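# A small usage sketch for dfToSheet from genepy/google/google_sheet.py above;
# the sheet title and the credential path are placeholders, and the service
# account behind the credentials needs edit access to that spreadsheet.
def _example_upload_tracker():
    import pandas as pd
    from genepy.google.google_sheet import dfToSheet
    tracker = pd.DataFrame({"sample": ["A", "B"], "qc_pass": [True, False]})
    dfToSheet(tracker, "my tracking sheet", secret="~/.credentials.json")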
124 | def_id = ids[0] 125 | # for each connection, if have another id, 126 | # replace this id with the current one 127 | for i in ids: 128 | con.extend(np.argwhere( 129 | (gdot[mergedidcol] == i).values).T[0].tolist()) 130 | con = list(set(con)) 131 | # if none we create a new id 132 | else: 133 | idcount+=1 134 | def_id = "id_"+str(idcount) 135 | gdot.loc[gdot.iloc[con].index.tolist(), mergedidcol] = def_id 136 | #except: 137 | # pdb.set_trace() 138 | merged.loc[gdot.index.tolist(), mergedidcol] = gdot[mergedidcol].tolist() 139 | return merged 140 | 141 | def mergeAnnotated(annot, minzstack=2, groupdefault={}, todrop=[], coltocount="image", 142 | id="m_id", colocName="cobinding"): 143 | """ 144 | """ 145 | annot = annot.drop(columns=todrop) 146 | grouping = {i: "mean" for i in annot.columns} 147 | if groupdefault: 148 | grouping.update(groupdefault) 149 | grouping.pop(id) 150 | # merge into a same sample 151 | groups = annot.groupby(id) 152 | counts = groups[coltocount].count() 153 | merged = groups.agg(grouping) 154 | merged['counts'] = counts 155 | merged = merged[merged['counts'] >= minzstack] 156 | merged.columns = [i[0] if "first" in i[1] 157 | else '_'.join(i) for i in merged.columns] 158 | #rename colors 159 | merged['class'] = [i[0] if len( 160 | i) == 1 else colocName for i in merged["class_unique"]] 161 | return merged.drop(columns="class_unique") 162 | -------------------------------------------------------------------------------- /genepy/mutations/README.md: -------------------------------------------------------------------------------- 1 | # Mutations 2 | 3 | A set of functions to help process any types of mutations 4 | 5 | 6 | ## contains: 7 | 8 | - vcf_to_df: transforms a vcf file into a dataframe file as best as it can 9 | - mafToMat: turns a maf file into a matrix of mutations x samples (works with multiple sample file) 10 | - mergeAnnotations: merges two maf files, taking carre of duplicate samples and duplicate (works with multiple sample file) 11 | - filterAllelicFraction: filters a MAF file based on allelic fraction (works with multiple sample file) 12 | - filterCoverage: filters a MAF file based on read coverage (works with multiple sample file) 13 | - manageGapsInSegments: extends the ends of segments in a segment file from GATK so as to remove all gaps ove the genome (works with multiple sample file) 14 | - toGeneMatrix: makes a geneXsample matrix from segment level copy number (works with multiple sample file) 15 | - checkAmountOfSegments: will compute the number of segments for each samples from a df of segments from RSEM (works with multiple sample file) 16 | - checkGeneChangeAccrossAll: used to find poor quality genes in CN data (works with multiple sample file) 17 | -------------------------------------------------------------------------------- /genepy/mutations/__init__.py: -------------------------------------------------------------------------------- 1 | # Jeremie Kalfon 2 | # for BroadInsitute 3 | # in 2019 4 | 5 | from __future__ import print_function 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from genepy.utils import helper as h 10 | import gzip 11 | import seaborn as sns 12 | 13 | 14 | def vcf_to_df( 15 | path, 16 | additional_cols=[], 17 | additional_filters=[], 18 | parse_filter=False, 19 | drop_null=False, 20 | force_keep=[], 21 | cols_to_drop=[ 22 | "clinvar_vcf_mc", 23 | "oreganno_build", 24 | "gt", 25 | "ad", 26 | "af", 27 | "dp", 28 | "f1r2", 29 | "f2r1", 30 | "fad", 31 | "sb", 32 | "pid", 33 | ], 34 | **kwargs, 35 | ): 36 | """ 
37 | transforms a vcf file into a dataframe file as best as it can 38 | 39 | Args: 40 | ----- 41 | path: str filepath to the vcf file 42 | additional_filters: list[str] additional values added by the filtering tool looks for PASS, base_qual, 43 | clustered_events, fragment, germline, haplotype, map_qual, multiallelic, 44 | panel_of_normals, position, slippage, strand_bias, weak_evidence 45 | additional_cols: list[str] of additional colnames in the vcf already looks for 'DB', 46 | 'SOMATIC', 'GERMLINE', "OVERLAP", "IN_PON", "STR", "ReverseComplementedAlleles" 47 | parse_filter: bool if true, will parse the filter field and add it to the dataframe 48 | drop_null: bool if a column appears to be fully empty, will drop it 49 | force_keep: list[str] columns to force keep even if they are empty 50 | cols_to_drop: list[str] columns to drop even if they are not empty 51 | 52 | Returns: 53 | -------- 54 | a dataframe fo the vcf 55 | a dict associating each column with its description (gathered from the vcf header) 56 | a list of the columns that have been dropped 57 | """ 58 | uniqueargs = [ 59 | "DB", 60 | "SOMATIC", 61 | "GERMLINE", 62 | "OVERLAP", 63 | "IN_PON", 64 | "STR", 65 | "ReverseComplementedAlleles", 66 | ] + additional_cols 67 | 68 | filters = [ 69 | "PASS", 70 | "base_qual", 71 | "clustered_events", 72 | "fragment", 73 | "germline", 74 | "haplotype", 75 | "map_qual", 76 | "multiallelic", 77 | "panel_of_normals", 78 | "position", 79 | "slippage", 80 | "strand_bias", 81 | "weak_evidence", 82 | ] + additional_filters 83 | 84 | FUNCO_DESC = "Functional annotation from the Funcotator tool." 85 | 86 | dropped_cols = [] 87 | 88 | def read_comments(f): 89 | description = {} 90 | colnames = [] 91 | rows = 0 92 | for l in f: 93 | l = l.decode("utf-8") if type(l) is not str else l 94 | if l.startswith("##"): 95 | rows += 1 96 | if "FORMAT" in l[:20]: 97 | res = l.split("ID=")[1].split(",")[0] 98 | desc = l.split("Description=")[1][:-2] 99 | description.update({res: desc}) 100 | if "INFO" in l[:20]: 101 | res = l.split("ID=")[1].split(",")[0] 102 | if res == "FUNCOTATION": 103 | print("parsing funcotator special") 104 | for val in l.split("Description=")[1][:-2].split("|"): 105 | val = val.split("Funcotation fields are: ")[-1] 106 | description.update({val: FUNCO_DESC}) 107 | else: 108 | desc = l.split("Description=")[1][:-2] 109 | description.update({res: desc}) 110 | elif l.startswith("#"): 111 | colnames = l[1:-1].split("\t") 112 | rows += 1 113 | else: 114 | break 115 | return description, colnames, rows 116 | 117 | if path.endswith(".gz"): 118 | with gzip.open(path, "r") as f: 119 | description, colnames, nrows_toskip = read_comments(f) 120 | else: 121 | with open(path, "r") as f: 122 | description, colnames, nrows_toskip = read_comments(f) 123 | colnames = [i for i in colnames] 124 | csvkwargs = { 125 | "sep": "\t", 126 | "index_col": False, 127 | "header": None, 128 | "names": colnames, 129 | "skiprows": nrows_toskip + kwargs.get("skiprows", 0), 130 | } 131 | data = pd.read_csv(path, **{**kwargs, **csvkwargs}) 132 | print(description) 133 | funco_fields = [k for k, v in description.items() if FUNCO_DESC in v] 134 | fields = {k: [] for k, _ in description.items()} 135 | try: 136 | for j, info in enumerate(data["INFO"].str.split(";").values.tolist()): 137 | res = {} 138 | # show speed 139 | if j % 10_000 == 0: 140 | print(j, end="\r") 141 | for annot in info: 142 | if annot in uniqueargs: 143 | res.update({annot: True}) 144 | elif "=" in annot: 145 | # taking care of the funcotator special 
fields 146 | if "FUNCOTATION" in annot: 147 | # for multi allelic site: 148 | annot = annot.replace("FUNCOTATION=", "")[1:-1] 149 | res.update({name: [] for name in funco_fields}) 150 | for site in annot.split("],["): 151 | if "]#[" in site: 152 | site = site.split("]#[")[0] 153 | site = ( 154 | site.replace("_%7C_", " ") 155 | .replace("_%20_", " ") 156 | .replace("_%2C_", ",") 157 | .replace("_%3D_", "=") 158 | .split("|") 159 | ) 160 | for i, sub_annot in enumerate(site): 161 | res[funco_fields[i]].append(sub_annot) 162 | for k in funco_fields: 163 | res[k] = ",".join(res[k]) 164 | else: 165 | k, annot = annot.split("=") 166 | res.update({k: annot}) 167 | else: 168 | raise ValueError("unknown argument: " + annot) 169 | for k in list(fields.keys()): 170 | fields[k].append(res.get(k, None)) 171 | except ValueError: 172 | print(annot) 173 | raise ValueError("unknown field") 174 | 175 | data = pd.concat( 176 | [data.drop(columns="INFO"), pd.DataFrame(data=fields, index=data.index)], axis=1 177 | ) 178 | if drop_null: 179 | to_drop = [] 180 | for f in funco_fields: 181 | # drop columns that have the same value across all rows 182 | uniq = data[f].unique() 183 | if len(uniq) == 1 and f.lower() not in force_keep: 184 | to_drop.append(f) 185 | continue 186 | elif len(uniq) < 10: 187 | # checking multi allelic stuff 188 | multi = [] 189 | for v in uniq: 190 | multi += v.split(",") 191 | if len(set(multi)) == 1 and f.lower() not in force_keep: 192 | to_drop.append(f) 193 | print("dropping uninformative columns:", to_drop) 194 | data = data.drop(columns=to_drop) 195 | dropped_cols += to_drop 196 | data.columns = [i.lower() for i in data.columns] 197 | samples = [i.lower() for i in colnames[9:]] 198 | print("\nthe samples are:", samples) 199 | sorting = data["format"][0].split(":") 200 | for sample in samples: 201 | res = data[sample].str.split(":").values.tolist() 202 | maxcols = max([len(v) for v in res]) 203 | if maxcols - len(sorting) > 0: 204 | for i in range(maxcols - len(sorting)): 205 | sorting.append(sorting[-1] + "_" + str(i + 1)) 206 | if len(samples) > 1: 207 | sorting = [sample + "_" + v for v in sorting] 208 | data = pd.concat( 209 | [ 210 | data.drop(columns=sample), 211 | pd.DataFrame(data=res, columns=sorting, index=data.index), 212 | ], 213 | axis=1, 214 | ) 215 | 216 | # subsetting filters 217 | if parse_filter: 218 | data[filters] = False 219 | for f in filters: 220 | data.loc[data["filter"].str.contains(f), f] = True 221 | data = data.drop(columns="filter") 222 | dropped_cols.append("filter") 223 | 224 | # cleaning empty cols 225 | data = data.drop(columns="format") 226 | dropped_cols.append("format") 227 | 228 | todrop = [] 229 | for val in cols_to_drop: 230 | if val in data.columns.tolist(): 231 | todrop.append(val) 232 | data = data.drop(columns=todrop) 233 | 234 | if drop_null: 235 | empty = data.columns[data.isna().sum() == len(data)].tolist() 236 | empty = list(set(empty) - set(force_keep)) 237 | print("dropping empty columns:", empty) 238 | data = data.drop(columns=empty) 239 | dropped_cols += empty 240 | 241 | # weird bug sometimes 242 | if "SB_1" in data.columns.tolist(): 243 | loc = ~data.SB_1.isna() 244 | data.loc[loc, "PGT"] = data.loc[loc, "SB"] 245 | data.loc[loc, "SB"] = data.loc[loc, "SB_1_2_3"] 246 | data = data.drop(columns=["SB_1", "SB_1_2_3"]) 247 | data = data.rename(columns={"SB_1_2": "PS", "SB_1": "PID"}) 248 | else: 249 | loc = data.SB.isna() 250 | data.loc[loc, "SB"] = data.loc[loc, "PGT"] 251 | data.loc[loc, "PGT"] = "" 252 | # sorting out issue with 
253 | return data, description, dropped_cols 254 | 255 | 256 | def mafToMat( 257 | maf, 258 | mode="bool", 259 | freqcol="tumor_f", 260 | samplesCol="DepMap_ID", 261 | mutNameCol="Hugo_Symbol", 262 | minfreqtocall=0.2, 263 | ): 264 | """ 265 | turns a maf file into a matrix of mutations x samples (works with multiple sample file) 266 | 267 | Args: 268 | ----- 269 | maf: dataframe of the maf file 270 | sample_col: str colname for samples 271 | mode: flag "bool" to convert the matrix into a boolean (mut/no mut) 272 | "float" to keep the allele frequencies as is (0.x) 273 | "genotype" to have either 1, 0.5 or 0 274 | freqcol: str colname where ref/alt frequencies are stored 275 | mutNameCol: str colname where mutation names are stored, will merge things over that column name 276 | 277 | Returns: 278 | -------- 279 | the dataframe matrix 280 | """ 281 | samples = set(maf[samplesCol]) 282 | maf = maf[maf[freqcol] >= minfreqtocall] 283 | maf = maf.sort_values(by=mutNameCol) 284 | mut = pd.DataFrame( 285 | data=np.zeros((len(set(maf[mutNameCol])), 1)), 286 | columns=["fake"], 287 | index=set(maf[mutNameCol]), 288 | ).astype(float) 289 | for i, val in enumerate(samples): 290 | h.showcount(i, len(samples)) 291 | if mode == "genotype": 292 | mut = mut.join( 293 | maf[maf[samplesCol] == val] 294 | .set_index(mutNameCol)[freqcol] 295 | .groupby(mutNameCol) 296 | .agg("sum") 297 | .rename(val) 298 | ) 299 | else: 300 | mut = mut.join( 301 | maf[maf[samplesCol] == val] 302 | .drop_duplicates(mutNameCol) 303 | .set_index(mutNameCol)[freqcol] 304 | .rename(val) 305 | ) 306 | mut = mut.fillna(0).astype(bool if mode == "bool" else float).drop(columns=["fake"]) 307 | if mode == "genotype": 308 | mut[(mut > 1.3)] = 3 309 | mut[(mut >= 0.7) & (mut <= 1.3)] = 2 310 | mut[(mut > 0.3) & (mut < 0.7)] = 1 311 | mut[mut <= 0.3] = 0 312 | return mut 313 | 314 | 315 | def mergeAnnotations( 316 | firstmaf, 317 | additionalmaf, 318 | mutcol="mutation", 319 | Genome_Change="Genome_Change", 320 | Start_position="Start_position", 321 | Chromosome="Chromosome", 322 | samplename="DepMap_ID", 323 | useSecondForConflict=True, 324 | dry_run=False, 325 | ): 326 | """ 327 | merges two maf files, taking carre of duplicate samples and duplicate (works with multiple sample file) 328 | 329 | Args: 330 | ----- 331 | firstmaf: dataframe the first maf file 332 | additionalmaf: dataframe the second maf file (need to contain same colnames) 333 | Genome_Change: str colnames of the Genome_Change column 334 | Start_position: str colnames of the Start_position column 335 | Chromosome: str colnames of the Chromosome column 336 | samplename: str colnames of the samplename column (for multiple samples, even if one, needs to have this column) 337 | useSecondForConflict: bool if false use the first df as reference else use the second one 338 | dry_run: if true, will just output conflict regions and not merge the dataframes 339 | 340 | Returns: 341 | ------- 342 | dataframe of the maf file if not dryrun, else an np array of the merge issues 343 | """ 344 | mutations = firstmaf.copy() 345 | mutations["ind"] = mutations[samplename] + "_" + mutations[Genome_Change] 346 | mutations["loci"] = ( 347 | mutations[samplename] 348 | + "_" 349 | + mutations[Chromosome] 350 | + "_" 351 | + mutations[Start_position].astype(str) 352 | ) 353 | additionalmaf["ind"] = ( 354 | additionalmaf[samplename] + "_" + additionalmaf[Genome_Change] 355 | ) 356 | additionalmaf["loci"] = ( 357 | additionalmaf[samplename] 358 | + "_" 359 | + additionalmaf[Chromosome] 360 | + "_" 
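# A minimal sketch of vcf_to_df and mafToMat above. The VCF path is a
# placeholder; the toy maf frame only carries the three columns mafToMat
# reads by default (DepMap_ID, Hugo_Symbol, tumor_f).
def _example_mutation_matrix():
    variants, descriptions, dropped = vcf_to_df(
        "my_sample.vcf.gz", parse_filter=True, drop_null=True
    )
    maf = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000001", "ACH-000002"],
            "Hugo_Symbol": ["TP53", "KRAS", "TP53"],
            "tumor_f": [0.45, 0.12, 0.80],
        }
    )
    # boolean mutation x sample matrix, ignoring calls below 20% allelic fraction
    mut_matrix = mafToMat(maf, mode="bool", minfreqtocall=0.2)
    return variants, descriptions, dropped, mut_matrix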
361 | + additionalmaf[Start_position].astype(str) 362 | ) 363 | inboth = set(additionalmaf["loci"]) & set(mutations["loci"]) 364 | notineach = set(additionalmaf["ind"]) ^ set(mutations["ind"]) 365 | submut = mutations[mutations.loci.isin(inboth) & mutations.ind.isin(notineach)] 366 | subother = additionalmaf[ 367 | additionalmaf.loci.isin(inboth) & additionalmaf.ind.isin(notineach) 368 | ] 369 | issues = None 370 | if len(submut) > 0: 371 | print("found " + str(len(submut)) + " nonmatching mutations") 372 | issues = np.vstack( 373 | [ 374 | submut.sort_values(by="loci")[Genome_Change].values, 375 | subother.sort_values(by="loci")[Genome_Change].values, 376 | ] 377 | ).T 378 | if dry_run: 379 | print(issues) 380 | if not dry_run: 381 | if issues is not None: 382 | if useSecondForConflict: 383 | mutations = mutations[~mutations.ind.isin(set(submut.ind))] 384 | else: 385 | additionalmaf = additionalmaf[ 386 | ~additionalmaf.ind.isin(set(subother.ind)) 387 | ] 388 | mutations = mutations.append( 389 | additionalmaf[ 390 | additionalmaf["ind"].isin( 391 | set(additionalmaf["ind"]) - set(mutations["ind"]) 392 | ) 393 | ] 394 | ) 395 | subother = additionalmaf[ 396 | additionalmaf.loci.isin(inboth) & ~additionalmaf.ind.isin(notineach) 397 | ].set_index("ind") 398 | mutations = mutations.set_index("ind") 399 | mutations.loc[subother.index.tolist(), mutcol] = subother[mutcol].tolist() 400 | return ( 401 | mutations.drop(columns=["loci"]) 402 | .sort_values(by=[samplename, Chromosome, Start_position]) 403 | .reset_index(drop=True) 404 | ) 405 | else: 406 | return issues 407 | 408 | 409 | def filterAllelicFraction(maf, loc=["CGA_WES_AC"], sep=":", frac=0.1): 410 | """ 411 | filters a MAF file based on allelic fraction (works with multiple sample file) 412 | 413 | Args: 414 | ----- 415 | maf: dataframe of the maf file 416 | loc: list[str] colnames with the ref:alt 417 | sep: str separato between ref:alt 418 | frac: float min fraction 419 | 420 | Returns: 421 | ------- 422 | dataframe of the maf file 423 | """ 424 | muts = np.zeros((len(maf), 2)) 425 | for val in loc: 426 | muts += np.array( 427 | [ 428 | [v[0], 0] if "NA" in v else v 429 | for v in maf[val] 430 | .fillna("0" + sep + "0") 431 | .astype(str) 432 | .str.split(sep) 433 | .tolist() 434 | ] 435 | ).astype(int) 436 | muts = muts[:, 0] / (muts[:, 0] + muts[:, 1]) 437 | return maf[muts >= frac] 438 | 439 | 440 | def filterCoverage(maf, loc=["CGA_WES_AC"], sep=":", cov=4, altloc=0): 441 | """ 442 | filters a MAF file based on read coverage (works with multiple sample file) 443 | 444 | Args: 445 | ----- 446 | maf: dataframe of the maf file 447 | loc: list[str] colnames with the ref:alt 448 | sep: str separato between ref:alt 449 | cov: min coverage 450 | altloc: 0 to filter on alt and 1 to filter on ref 451 | 452 | Returns: 453 | ------- 454 | dataframe of the maf file 455 | """ 456 | muts = np.zeros((len(maf), 2)) 457 | for val in loc: 458 | muts += np.array( 459 | [ 460 | [v[0], 0] if "NA" in v else v 461 | for v in maf[val] 462 | .fillna("0" + sep + "0") 463 | .astype(str) 464 | .str.split(sep) 465 | .tolist() 466 | ] 467 | ).astype(int) 468 | return maf[muts[:, altloc] >= cov] 469 | 470 | 471 | def manageGapsInSegments( 472 | segtocp, Chromosome="Chromosome", End="End", Start="Start", cyto=None 473 | ): 474 | """ 475 | extends the ends of segments in a segment file from GATK so as to remove all gaps ove the genome (works with multiple sample file) 476 | 477 | Args: 478 | ---- 479 | segtocp: dataframe of segments from GATK CN pipeline 480 | 
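# A toy filtering sketch for filterCoverage and filterAllelicFraction above;
# CGA_WES_AC holds colon-separated read counts as a string, following the
# loc/sep convention of both functions.
def _example_maf_filters():
    maf = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000002"],
            "Hugo_Symbol": ["TP53", "KRAS"],
            "CGA_WES_AC": ["12:88", "1:150"],
        }
    )
    maf = filterCoverage(maf, loc=["CGA_WES_AC"], cov=4, altloc=0)  # keep rows whose first count is >= 4
    maf = filterAllelicFraction(maf, loc=["CGA_WES_AC"], frac=0.1)  # keep rows with first/(first+second) >= 0.1
    return maf  # here only the TP53 row survives both filters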
Chromosome: str the value for the Chromosome columns 481 | End: str the value for the End columns 482 | Start: str the value for the Start columns 483 | cyto: dataframe with chrom;end; columns giving the size of each chromosome (else puts last segment to 1000000000) 484 | """ 485 | prevchr = "" 486 | prevend = 0 487 | count = 0 488 | l = [] 489 | segments = segtocp.copy() 490 | le = len(segments) 491 | for k, val in segments.iterrows(): 492 | h.showcount(count, le) 493 | count += 1 494 | if val[Chromosome] != prevchr: # we changed chromosome 495 | # we extend the previous segment (last of the prev chrom) to.. way enough 496 | if len(l) > 0: 497 | l[-1][2] = ( 498 | 1000000000 499 | if cyto is None 500 | else cyto[cyto["chrom"] == prevchr]["end"].values[-1] 501 | ) 502 | # we extend the first segment to 0 503 | l.append([val[Chromosome], 0, val[End]]) 504 | else: 505 | if val[Start] > prevend + 1: # we have a gap in the same chrom 506 | sizeofgap = val[Start] - prevend 507 | # we add to the previous one half of the gap 508 | l[-1][2] += ( 509 | int(sizeofgap / 2) if sizeofgap % 2 == 0 else int(sizeofgap / 2) + 1 510 | ) 511 | # the rest to the other 512 | l.append([val[Chromosome], val[Start] - int(sizeofgap / 2), val[End]]) 513 | elif val[Start] < prevend: # this should never happen 514 | # import pdb; pdb.set_trace() 515 | raise ValueError("start comes after end") 516 | else: 517 | l.append([val[Chromosome], val[Start], val[End]]) 518 | prevchr = val[Chromosome] 519 | prevend = val[End] 520 | # we extend the last one 521 | l[-1][2] = ( 522 | 1000000000 if cyto is None else cyto[cyto["chrom"] == prevchr]["end"].values[-1] 523 | ) 524 | segments[[Chromosome, Start, End]] = l 525 | return segments.reset_index(drop=True) 526 | 527 | 528 | def toGeneMatrix( 529 | segments, 530 | gene_mapping, 531 | style="weighted", 532 | missingchrom=["Y"], 533 | gene_names_col="gene_name", 534 | ): 535 | """ 536 | makes a geneXsample matrix from segment level copy number (works with multiple sample file) 537 | 538 | Args: 539 | ---- 540 | style: str one of "weighted","mean","closest" 541 | segments: dataframe of segments containing: [Chromosome, Segment_Mean, Chromosome, start, end] columns 542 | gene_mapping: dataframe with symbol, ensembl_id columns for each gene 543 | missingchrom: list[str] chromosomes not to look into 544 | 545 | Returns: 546 | ------- 547 | pd.dataframe: the matrix 548 | """ 549 | samples = list(set(segments.DepMap_ID)) 550 | data = np.zeros((len(samples), len(gene_mapping))) 551 | for i, sample in enumerate(samples): 552 | segs = segments[segments.DepMap_ID == sample][ 553 | ["Chromosome", "Start", "End", "Segment_Mean"] 554 | ].values 555 | hasmissing = set(missingchrom) - set(segs[:, 0]) 556 | j = 0 557 | h.showcount(i, len(samples)) 558 | for k, gene in enumerate(gene_mapping[["Chromosome", "start", "end"]].values): 559 | # print(i,j) 560 | if gene[0] in hasmissing: 561 | data[i, k] = np.nan 562 | continue 563 | try: 564 | while gene[0] != segs[j][0] or gene[1] >= segs[j][2]: 565 | # print("went beyong",gene, segs[j]) 566 | j += 1 567 | # some genes are within other genes, we need to go back in the list of segment in that case 568 | except: 569 | raise ValueError("forgot to sort one of the DF?") 570 | while gene[1] < segs[j][1]: 571 | j -= 1 572 | # print("decrease gene",gene) 573 | # we are entirely within the segment 574 | c = 1 575 | if gene[2] <= segs[j][2]: 576 | data[i, k] = segs[j][3] 577 | else: 578 | # how much of the gene is covered by the segment 579 | coef = (segs[j][2] 
- gene[1]) / (gene[2] - gene[1]) 580 | # print('coef',coef) 581 | val = segs[j][3] * coef if style == "weighted" else segs[j][3] 582 | end = segs[j][2] 583 | # until the end of a segments goes beyond the end of the gene (say if we have X segments within the gene) 584 | while end < gene[2]: 585 | # pdb.set_trace() 586 | j += 1 587 | c += 1 588 | nextend = segs[j][2] if segs[j][2] < gene[2] else gene[2] 589 | # here, end (of prevsegment) is the next segment's start 590 | ncoef = (nextend - end) / (gene[2] - gene[1]) 591 | # print('multi',gene, ncoef) 592 | if style == "closest": 593 | if ncoef > coef: 594 | val = segs[j][3] 595 | else: 596 | # we switch it back (see line 894) 597 | ncoef = coef 598 | else: 599 | val += segs[j][3] * ncoef if style == "weighted" else segs[j][3] 600 | end = segs[j][2] 601 | coef = ncoef 602 | data[i, k] = val if style == "weighted" else val / c 603 | return pd.DataFrame(data=data, index=samples, columns=gene_mapping[gene_names_col]) 604 | 605 | 606 | def checkAmountOfSegments(segmentcn, thresh=850, samplecol="DepMap_ID"): 607 | """ 608 | if there is too many segments, something might be wrong (works with multiple sample file) 609 | 610 | will compute the number of segments for each samples from a df of segments from RSEM 611 | 612 | Args: 613 | ---- 614 | segmentcn: segment dataframe 615 | thresh: max ok amount 616 | """ 617 | failed = [] 618 | celllines = set(segmentcn[samplecol].tolist()) 619 | amounts = [] 620 | for cellline in celllines: 621 | val = segmentcn[segmentcn[samplecol] == cellline].shape[0] 622 | amounts.append(val) 623 | if val > thresh: 624 | failed.append(cellline) 625 | print(cellline, val) 626 | sns.kdeplot(amounts) 627 | return failed 628 | 629 | 630 | def checkGeneChangeAccrossAll(genecn, thresh=0.2): 631 | """ 632 | used to find poor quality genes in CN data (works with multiple sample file) 633 | 634 | compute given a df of gene x sample CN counts, how much change there is accross samples for 635 | a same gene and returns ones that are below the threshold 636 | 637 | Args: 638 | ----- 639 | genecn: gene cn data frame 640 | thresh: threshold in logfold change accross all of them 641 | """ 642 | return genecn.columns[genecn.var() < thresh].tolist() 643 | 644 | 645 | def renameColumns(df): 646 | """ 647 | rename some of the main columns names from RSEM, GATK.. 
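# An end-to-end toy sketch for the copy-number helpers above: close the gaps
# in a per-sample segment table, then project segments onto genes. Column
# names follow the defaults of manageGapsInSegments and toGeneMatrix; every
# value is made up.
def _example_segments_to_genes():
    segments = pd.DataFrame(
        {
            "DepMap_ID": ["ACH-000001", "ACH-000001"],
            "Chromosome": ["1", "1"],
            "Start": [10_000, 1_200_000],
            "End": [1_000_000, 2_000_000],
            "Segment_Mean": [1.0, 1.5],
        }
    )
    segments = manageGapsInSegments(segments)  # extend segment ends so they tile the chromosome
    gene_mapping = pd.DataFrame(
        {
            "gene_name": ["GENE_A", "GENE_B"],
            "Chromosome": ["1", "1"],
            "start": [100_000, 900_000],
            "end": [200_000, 1_400_000],
        }
    )
    # weighted average of overlapping segments per gene; one row per sample, one column per gene
    return toGeneMatrix(segments, gene_mapping, style="weighted")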
to more readable column names 648 | Args: 649 | ----- 650 | df: the df to rename 651 | Returns: 652 | ------ 653 | df the renamed df 654 | """ 655 | return df.rename( 656 | columns={ 657 | "Sample": "DepMap_ID", 658 | "CONTIG": "Chromosome", 659 | "START": "Start", 660 | "END": "End", 661 | "seqnames": "Chromosome", 662 | "start": "Start", 663 | "end": "End", 664 | } 665 | ) 666 | -------------------------------------------------------------------------------- /genepy/rna/README.md: -------------------------------------------------------------------------------- 1 | # RNA 2 | 3 | A set of functions to work with RNAseq (and related) data type 4 | 5 | ## contains 6 | 7 | 8 | - filterProteinCoding: removes all non protein coding genes from a list (you need taiga access) 9 | - convertGenes: converts genes from a naming to another (you need taiga access) 10 | - getSpikeInControlScales: extracts the spike in control values from a set of bam files 11 | - GSEAonExperiments: perform GSEA to compare a bunch of conditions at once 12 | - runERCC: creates an ERCC dashboard and extract the RNA spike ins from it (need rpy2 and ipython and R's ERCCdashboard installed) 13 | 14 | ## recommended tools 15 | 16 | - ERCCdashboard (R) 17 | - DESeq2 (R) 18 | - slamdunk 19 | - GSVA (R) 20 | - gseapy (python) -------------------------------------------------------------------------------- /genepy/rna/pyDESeq2.py: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # 3 | # PYDESEQ 4 | # 5 | ################################################################## 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | import rpy2.robjects as robjects 10 | from rpy2.robjects import pandas2ri, Formula, numpy2ri 11 | pandas2ri.activate() 12 | import rpy2 13 | from rpy2.robjects.packages import importr 14 | deseq = importr('DESeq2') 15 | from rpy2.robjects.conversion import localconverter 16 | import rpy2.robjects as ro 17 | import sys 18 | ''' 19 | Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 20 | ''' 21 | 22 | to_dataframe = robjects.r('function(x) data.frame(x)') 23 | 24 | 25 | class pyDESeq2: 26 | ''' 27 | DESeq2 object through rpy2 28 | input: 29 | count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id 30 | example: 31 | id sampleA sampleB 32 | geneA 5 1 33 | geneB 4 5 34 | geneC 1 2 35 | design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames 36 | treatment 37 | sampleA1 A 38 | sampleA2 A 39 | sampleB1 B 40 | sampleB2 B 41 | design_formula: see DESeq2 manual, example: "~ treatment"" 42 | gene_column: column name of gene id columns, exmplae "id" 43 | ''' 44 | 45 | def __init__(self, count_matrix, design_matrix, design_formula, gene_column='gene_id'): 46 | print("you need to have R installed with the DESeq2 library installed") 47 | try: 48 | assert gene_column == count_matrix.columns[0], 'no $gene_column name in 1st column\'s name' 49 | gene_id = count_matrix[gene_column] 50 | except AttributeError: 51 | sys.exit('Wrong Pandas dataframe?') 52 | print(rpy2.__version__) 53 | self.deseq_result = None 54 | self.resLFC = None 55 | self.comparison = None 56 | self.normalized_count_matrix = None 57 | self.gene_column = gene_column 58 | self.gene_id = count_matrix[self.gene_column] 59 | with localconverter(ro.default_converter + pandas2ri.converter): 60 | self.count_matrix = 
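# A hedged sketch of how this wrapper can be driven (needs R with DESeq2
# installed plus rpy2); the counts and design below are toy values and the
# gene id column must come first.
def _example_run_deseq2():
    import pandas as pd
    counts = pd.DataFrame(
        {
            "gene_id": ["geneA", "geneB", "geneC"],
            "ctrl_1": [5, 4, 1],
            "ctrl_2": [6, 3, 2],
            "treat_1": [20, 4, 1],
            "treat_2": [25, 5, 0],
        }
    )
    design = pd.DataFrame(
        {"treatment": [False, False, True, True]},
        index=["ctrl_1", "ctrl_2", "treat_1", "treat_2"],
    )
    dds = pyDESeq2(counts, design, "~ treatment", gene_column="gene_id")
    dds.run_deseq()
    dds.get_deseq_result()
    return dds.deseq_result  # pandas dataframe with log2FoldChange, pvalue, padj and gene_id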
pandas2ri.py2rpy(count_matrix.drop(gene_column, axis=1).astype(int)) 61 | self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool)) 62 | self.design_formula = Formula(design_formula) 63 | self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix, 64 | colData=self.design_matrix, 65 | design=self.design_formula) 66 | 67 | def run_estimate_size_factors(self, **kwargs): # OPTIONAL 68 | """ 69 | args: 70 | geoMeans: cond*gene matrix 71 | """ 72 | self.dds = deseq.estimateSizeFactors_DESeqDataSet(self.dds, **kwargs) 73 | 74 | def run_deseq(self, **kwargs): 75 | self.dds = deseq.DESeq(self.dds, **kwargs) 76 | 77 | def getSizeFactors(self): 78 | return deseq.sizeFactors_DESeqDataSet(self.dds) 79 | 80 | def setSizeFactors(self, factors): 81 | val = self.dds.do_slot('colData').do_slot('listData') 82 | val[2] = ro.vectors.FloatVector(np.array(factors)) 83 | self.dds.do_slot('colData').do_slot_assign('listData', val) 84 | 85 | def get_deseq_result(self, **kwargs): 86 | 87 | self.comparison = deseq.resultsNames(self.dds) 88 | 89 | self.deseq_result = deseq.results(self.dds, **kwargs) 90 | self.deseq_result = to_dataframe(self.deseq_result) 91 | with localconverter(ro.default_converter + pandas2ri.converter): 92 | self.deseq_result = ro.conversion.rpy2py(self.deseq_result) # back to pandas dataframe 93 | self.deseq_result[self.gene_column] = self.gene_id.values 94 | -------------------------------------------------------------------------------- /genepy/rna/ssGSEA.R: -------------------------------------------------------------------------------- 1 | args<-commandArgs(TRUE) 2 | 3 | countfile <- args[1]; 4 | gmtfile <- args[2]; 5 | method <- args[3] 6 | 7 | library(GSEABase) 8 | library(GSVA) 9 | counts <- read.csv(countfile, row.names=1) 10 | mat <- data.matrix(counts, rownames.force = T) 11 | colnames(mat) <- colnames(counts) 12 | gsc_obj <- GSEABase::getGmt(gmtfile, 13 | collectionType = GSEABase::BroadCollection(), 14 | geneIdType = GSEABase::EntrezIdentifier()) 15 | gsea <- GSVA::gsva(mat, gsc_obj, method = method) 16 | write.table(gsea, file = "/tmp/res_genepy_ssGSEA.tsv", sep = '\t', quote = F) 17 | -------------------------------------------------------------------------------- /genepy/sequencing/README.md: -------------------------------------------------------------------------------- 1 | # Sequencing 2 | 3 | A set of function to help work with sequencing data (bed files, bam files, fastq files etc...) 
4 | 5 | ## Contains 6 | 7 | - fromGTF2BED: transforms a GTF file to a BED file, only works for some GTFs for now 8 | getBamDate: parses a bam file header to try to compute when it was generated (as best as it can, if it has had many modification done to it across a long span of time, you will receive the average of that) 9 | - getBamDate 10 | - indexBams 11 | - dropWeirdChromosomes 12 | - extractPairedSingleEndFrom 13 | - findReplicates 14 | - singleEnd 15 | - pairedEnd 16 | - mergeBams 17 | 18 | ## Other very recommended tools 19 | 20 | _I am not building anything that overlaps with these tools_ 21 | 22 | - Bedtools 23 | - samtools 24 | - pyBedtools 25 | - pysam -------------------------------------------------------------------------------- /genepy/sequencing/__init__.py: -------------------------------------------------------------------------------- 1 | # Jeremie Kalfon 2 | # for BroadInsitute 3 | # in 2019 4 | 5 | from __future__ import print_function 6 | from multiprocessing.sharedctypes import Value 7 | import os 8 | import signal 9 | import re 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | from genepy.google import gcp 15 | from genepy.utils import helper as h 16 | from tqdm import tqdm 17 | 18 | size = {"GRCh37": 2864785220, "GRCh38": 2913022398} 19 | 20 | cmaps = [ 21 | "Greys", 22 | "Purples", 23 | "Blues", 24 | "Greens", 25 | "Oranges", 26 | "Reds", 27 | "YlOrBr", 28 | "YlOrRd", 29 | "OrRd", 30 | "PuRd", 31 | "RdPu", 32 | "BuPu", 33 | "GnBu", 34 | "PuBu", 35 | "YlGnBu", 36 | "PuBuGn", 37 | "BuGn", 38 | "YlGn", 39 | ] 40 | 41 | chroms = { 42 | "chr1", 43 | "chr10", 44 | "chr11", 45 | "chr12", 46 | "chr13", 47 | "chr14", 48 | "chr15", 49 | "chr16", 50 | "chr17", 51 | "chr18", 52 | "chr19", 53 | "chr2", 54 | "chr20", 55 | "chr21", 56 | "chr22", 57 | "chr3", 58 | "chr4", 59 | "chr5", 60 | "chr6", 61 | "chr7", 62 | "chr8", 63 | "chr9", 64 | "chrX", 65 | "chrY", 66 | "1", 67 | "10", 68 | "11", 69 | "12", 70 | "13", 71 | "14", 72 | "15", 73 | "16", 74 | "17", 75 | "18", 76 | "19", 77 | "2", 78 | "20", 79 | "21", 80 | "22", 81 | "3", 82 | "4", 83 | "5", 84 | "6", 85 | "7", 86 | "8", 87 | "9", 88 | "X", 89 | "Y", 90 | } 91 | 92 | 93 | def fromGTF2BED(gtfname, bedname, gtftype="geneAnnot"): 94 | """ 95 | transforms a gtf file into a bed file 96 | 97 | Args: 98 | ---- 99 | gtfname: filepath to gtf file 100 | bedname: filepath to beddfile 101 | gtftype: only geneAnnot for now 102 | 103 | Returns: 104 | -------- 105 | newbed: the bedfile as a pandas.df 106 | 107 | """ 108 | if gtftype == "geneAnnot": 109 | gtf = pd.read_csv( 110 | gtfname, 111 | sep="\t", 112 | header=0, 113 | names=[ 114 | "chr", 115 | "val", 116 | "type", 117 | "start", 118 | "stop", 119 | "dot", 120 | "strand", 121 | "loc", 122 | "name", 123 | ], 124 | ) 125 | gtf["name"] = [ 126 | i.split('gene_id "')[-1].split('"; trans')[0] for i in gtf["name"] 127 | ] 128 | prevname = "" 129 | newbed = {"chr": [], "start": [], "end": [], "gene": []} 130 | for i, val in gtf.iterrows(): 131 | h.showcount(i, len(gtf)) 132 | if val["name"] == prevname: 133 | newbed["end"][-1] = val["stop"] 134 | else: 135 | newbed["chr"].append(val["chr"]) 136 | newbed["start"].append(val["start"]) 137 | newbed["end"].append(val["stop"]) 138 | newbed["gene"].append(val["name"]) 139 | prevname = val["name"] 140 | newbed = pd.DataFrame(newbed) 141 | newbed = newbed[~newbed.chr.str.contains("_fix")] 142 | newbed.to_csv(bedname + ".bed", sep="\t", index=None) 143 | newbed.to_csv(bedname + "_genes.bed", sep="\t", index=None) 144 | return 
newbed 145 | 146 | 147 | def getBamDate(bams, split="-", order="des", unknown="U"): 148 | """ 149 | from bam files (could be in a google bucket) returns their likely sequencing date if available in the header 150 | 151 | Args: 152 | ----- 153 | bams: the bams file|bucket paths 154 | split: the splitter in the output date 155 | unknown: maybe the some dates can't be found the program will output unknown for them 156 | order: if 'asc', do d,m,y else do y,m,d 157 | 158 | Returns: 159 | ------- 160 | a list of likely dates or [unknown]s 161 | """ 162 | DTs = [] 163 | for i, bam in enumerate(tqdm(bams)): 164 | data = os.popen( 165 | "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`\ 166 | && samtools view -H " 167 | + bam 168 | + ' | grep "^@RG"' 169 | ) 170 | if data == signal.SIGINT: 171 | print("Awakened") 172 | break 173 | else: 174 | res = data.read() 175 | dt = re.findall("(?<=\tDT:).+?\t", res) 176 | if len(dt) > 1: 177 | arr = np.array(dt[0].split("T")[0].split(split)).astype(int) 178 | for val in dt[1:]: 179 | arr = np.vstack( 180 | (arr, np.array(val.split("T")[0].split(split)).astype(int)) 181 | ) 182 | arr = arr.T 183 | i = ( 184 | arr[0] * 365 + arr[1] * 31 + arr[2] 185 | if order == "asc" 186 | else arr[2] * 365 + arr[1] * 31 + arr[0] 187 | ) 188 | DTs.append(dt[np.argsort(i)[0]].split("T")[0]) 189 | elif len(dt) == 1: 190 | DTs.append(dt[0].split("T")[0]) 191 | else: 192 | DTs.append(unknown) 193 | return DTs 194 | 195 | 196 | async def indexBams(bams=None, bucketpath=None, cores=4): 197 | """ 198 | given a bucket path, will index all .bam files without an associated index and return their paths 199 | """ 200 | if bams is None: 201 | if bucketpath is None: 202 | raise ValueError("need one of bams or bucketpath") 203 | files = gcp.lsFiles([bucketpath]) 204 | bams = [val for val in files if ".bam" in val[-4:]] 205 | unindexed = [ 206 | val 207 | for val in bams 208 | if val[:-4] + ".bai" not in files and val[:4] + ".bam.bai" not in files 209 | ] 210 | print("found " + str(len(unindexed)) + " files to reindex") 211 | else: 212 | unindexed = bams 213 | h.parrun( 214 | [ 215 | "export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` && samtools index " 216 | + val 217 | for val in unindexed 218 | ], 219 | cores, 220 | ) 221 | return {val: val[:-4] + ".bam.bai" for val in unindexed} 222 | 223 | 224 | def dropWeirdChromosomes(bedfile, keep=[], skip=0): 225 | """ 226 | given a bedfile path, removes chromosomes that are not one of chroms 227 | 228 | Args: 229 | ---- 230 | bedfile: str the filepath to the bedfile 231 | keep: list[str] of additional chromosomes to keep 232 | """ 233 | if skip >= 20: 234 | raise ValueError("too many header lines!") 235 | try: 236 | bed = pd.read_csv(bedfile, sep="\t", header=None, skiprows=skip) 237 | except ParserError: 238 | dropWeirdChromosomes(bedfile, keep, skip + 1) 239 | return 240 | except EmptyDataError: 241 | print("empty bed") 242 | return 243 | initlen = len(bed) 244 | if initlen == 0: 245 | print("empty bed") 246 | return 247 | bed = bed[bed[0].isin(chroms | set(keep))] 248 | if len(bed) < skip and skip > 5: 249 | raise ValueError("too many header lines!") 250 | print("found " + str(skip) + " header line... 
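# Example calls for fromGTF2BED and getBamDate above; the file paths are
# placeholders, and getBamDate shells out to gcloud and samtools, so both
# must be installed and authenticated when the bams sit in a bucket.
def _example_annotation_and_dates():
    bed = fromGTF2BED("gencode.v38.annotation.gtf", "gencode.v38", gtftype="geneAnnot")
    dates = getBamDate(
        ["gs://my-bucket/sample1.bam", "gs://my-bucket/sample2.bam"], order="des"
    )
    return bed, dates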
removing") 251 | if len(bed) != initlen: 252 | print("removed " + str(initlen - len(bed)) + " lines") 253 | bed.to_csv(bedfile, sep="\t", header=None, index=None) 254 | 255 | 256 | def extractPairedSingleEndFrom(folder, sep="-", namepos=2): 257 | """ 258 | given a folder, find fastq files and sorts paired and single end based on the R1/R2 patterns 259 | 260 | Args: 261 | ----- 262 | folder: the folder where the fastqs are 263 | sep: the separator in filename 264 | namepos: the location of the name in this separated list of name from filepath 265 | 266 | Returns: 267 | ------- 268 | list of filepath to single end files 269 | df with R1 and R2 filepath 270 | """ 271 | single = [] 272 | paired = {} 273 | for val in os.listdir(folder): 274 | if ".fastq" in val or ".fq" in val: 275 | if "R1" in val: 276 | name = val.split(sep)[namepos] 277 | paired[name] = {"R1": val} 278 | elif "R2" in val: 279 | name = val.split(sep)[namepos] 280 | paired[name].update({"R2": val}) 281 | else: 282 | single.append(val) 283 | return single, pd.DataFrame(paired) 284 | 285 | 286 | def findReplicatesBams(folder, sep="-", namings="-r([0-9])", namepos=2): 287 | """ 288 | creates a dict of name and replicate files given a regexp namging scheme 289 | """ 290 | rep = {} 291 | for val in os.listdir(folder): 292 | if val[-4:] == ".bam": 293 | match = re.search(namings, val) 294 | if match: 295 | name = val.split(sep)[namepos] 296 | if name in rep: 297 | rep[name].append(val) 298 | else: 299 | rep[name] = [val] 300 | 301 | return rep 302 | 303 | 304 | def singleEnd( 305 | singlend, 306 | folder="data/seqs/", 307 | numthreads=8, 308 | peaksFolder="peaks/", 309 | ismapped=False, 310 | mappedFolder="mapped/", 311 | refFolder="data/reference/index", 312 | ): 313 | """ 314 | run the singleEnd pipeline 315 | for alignment etc, one can use pysam ready made implementation of samtools 316 | """ 317 | print( 318 | "you need to have bowtie2 installed: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml" 319 | ) 320 | for val in singlend: 321 | out1 = folder + mappedFolder + val.split(".")[0] + ".mapped.sam" 322 | if not ismapped: 323 | in1 = folder + val 324 | os.system( 325 | "bowtie2 -x " 326 | + refFolder 327 | + " --threads " 328 | + str(numthreads) 329 | + " -t -k 1 --very-sensitive -U " 330 | + in1 331 | + " -S " 332 | + out1 333 | ) 334 | out2 = folder + peaksFolder + val.split(".")[0] 335 | print(out1) 336 | os.system("macs2 callpeak -f SAM -t " + out1 + " --outdir " + out2) 337 | # it can take many TB so better delete 338 | 339 | 340 | def pairedEnd( 341 | pairedend, 342 | folder="", 343 | numthreads=8, 344 | peaksFolder="peaks/", 345 | ismapped=False, 346 | mappedFolder="mapped/", 347 | refFolder="data/reference/index", 348 | ): 349 | """ 350 | # run the paired end pipeline 351 | """ 352 | print( 353 | "you need to have bowtie2 installed: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml" 354 | ) 355 | for _, val in pairedend.items(): 356 | out1 = folder + mappedFolder + val[0].split(".")[0] + ".mapped.sam" 357 | in1 = folder + val[0] 358 | in2 = folder + val[1] 359 | os.system( 360 | "bowtie2 -x " 361 | + refFolder 362 | + " --threads " 363 | + str(numthreads) 364 | + " -t -k 1 \ 365 | --very-sensitive -1 " 366 | + in1 367 | + " -2 " 368 | + in2 369 | + " - S " 370 | + out1 371 | ) 372 | out2 = folder + peaksFolder + val[0].split(".")[0] 373 | print(out1) 374 | changefrom = out1 375 | changeto = out1[:-4] + ".bam" 376 | os.system("samtools view -b " + changefrom + " -o " + changeto) 377 | os.system( 378 | "macs2 
callpeak --format 'BAMPE' --treatment " 379 | + changeto 380 | + " --outdir " 381 | + out2 382 | ) 383 | # it can take many TB so better delete 384 | 385 | 386 | async def mergeBams(rep): 387 | """ 388 | uses samtools to merge a set of replicates considered into one file 389 | """ 390 | in1 = "" 391 | for i, val in rep.items(): 392 | out1 = i + ".merged.bam" 393 | for bam in val: 394 | in1 += " " + bam 395 | os.system("samtools merge " + out1 + in1) 396 | 397 | 398 | def compare_gcloud_vcfs_overlap_methods(vcfs_met1_path, vcfs_met2_path): 399 | for i, j in zip(vcfs_met1_path, vcfs_met2_path): 400 | compare_gcloud_vcf_overlap(i, j) 401 | 402 | 403 | def compare_gcloud_vcf_overlap(vcf1, vcf2, cols=["chr", "start", ".", "ref", "alt"]): 404 | import subprocess 405 | 406 | name1 = vcf1.split("/")[-1].split(".")[0] + "_1" + ".tsv" 407 | cmd1 = "gsutil cat " + vcf1 + " | gunzip | cut -f -5 > " + name1 408 | name2 = vcf2.split("/")[-1].split(".")[0] + "_2" + ".tsv" 409 | cmd2 = "gsutil cat " + vcf2 + " | gunzip | cut -f -5 > " + name2 410 | try: 411 | subprocess.run( 412 | cmd1, 413 | shell=True, 414 | check=True, 415 | stdout=subprocess.PIPE, 416 | stderr=subprocess.PIPE, 417 | ) 418 | subprocess.run( 419 | cmd2, 420 | shell=True, 421 | check=True, 422 | stdout=subprocess.PIPE, 423 | stderr=subprocess.PIPE, 424 | ) 425 | except subprocess.CalledProcessError as e: 426 | print(e.stderr) 427 | raise e 428 | val2 = pd.read_csv(name2, sep="\t", comment="#", names=cols) 429 | val1 = pd.read_csv(name1, sep="\t", comment="#", names=cols) 430 | val1["loc"] = ( 431 | val1["chr"].astype(str) 432 | + ":" 433 | + val1["start"].astype(str) 434 | + ":" 435 | + val1["alt"].astype(str) 436 | ) 437 | val2["loc"] = ( 438 | val2["chr"].astype(str) 439 | + ":" 440 | + val2["start"].astype(str) 441 | + ":" 442 | + val2["alt"].astype(str) 443 | ) 444 | print("length of vcf1:" + str(len(val1))) 445 | print("length of vcf2:" + str(len(val2))) 446 | print("overlap: " + str(len(set(val1["loc"]).intersection(val2["loc"])))) 447 | return val1, val2 448 | -------------------------------------------------------------------------------- /genepy/terra/README.md: -------------------------------------------------------------------------------- 1 | # terra 2 | 3 | a file containing a set of functions that uses [dalmatian](github.com/broadinstitute/dalmatian) to interact with the [GCP](https://cloud.google.com/storage/docs/gsutil) powered genomics HPC platform: [Terra](www.terra.bio). 4 | They contain a list of additional functions to do more than what is available in dalmatian 5 | 6 | The goal is to improve reproducibility and productionalization of pipelines working with Terra. 7 | 8 | #### Available functions: 9 | 10 | - createManySubmissions: allows you to create many terra jobs in parallel 11 | - waitForSubmission: an await function on Terra jobs 12 | - removeSamples: a function that removes samples on a workspace and takes care of more edge cases (linked sample sets and pair sets..). 13 | - uploadFromFolder: uploads fastq samples from a folder into a Terra workspace with the right namings etc.. 14 | - updateAllSampleSet: updates a sample set with all samples 15 | - addToSampleSet: updates a sample set with some new samples 16 | - addToPairSet: updates a pair set with some new pairs 17 | - saveOmicsOutput: *WIP* 18 | - changeGSlocation: Function to move data around from one workspace to a bucket or to another workspace. 
448 | -------------------------------------------------------------------------------- /genepy/terra/README.md: -------------------------------------------------------------------------------- 1 | # terra 2 | 3 | a set of functions that use [dalmatian](https://github.com/broadinstitute/dalmatian) to interact with the [GCP](https://cloud.google.com/storage/docs/gsutil)-powered genomics platform [Terra](https://www.terra.bio). 4 | They provide additional functionality beyond what is available in dalmatian. 5 | 6 | The goal is to improve reproducibility and productionization of pipelines working with Terra. 7 | 8 | #### Available functions: 9 | 10 | - createManySubmissions: allows you to create many Terra jobs in parallel 11 | - waitForSubmission: an await function on Terra jobs 12 | - removeSamples: removes samples from a workspace and takes care of more edge cases (linked sample sets, pair sets, etc.) 13 | - uploadFromFolder: uploads fastq samples from a folder into a Terra workspace with the right naming, etc. 14 | - updateAllSampleSet: updates a sample set with all samples 15 | - addToSampleSet: updates a sample set with some new samples 16 | - addToPairSet: updates a pair set with some new pairs 17 | - saveOmicsOutput: *WIP* 18 | - changeGSlocation: moves data around from one workspace to a bucket or to another workspace; can also work on dataframes containing lists of paths 19 | - renametsvs: *WIP* 20 | - findBackErasedDuplicaBamteFromTerraBucket: recovers bam files that were erased in GCP while their bai files are still present, when the bams are stored elsewhere and their locations are listed in a Terra workspace; finds them back by matching bai sizes and copies them back to their original locations 21 | - shareTerraBams: shares some files from GCP with a set of users, using Terra as the metadata repo. Only works with files that are listed on a Terra workspace tsv but actually point to a regular Google bucket and not a Terra bucket 22 | - shareCCLEbams: same as shareTerraBams but extended to work with CCLE bams from the CCLE sample tracker 23 | - saveConfigs: saves everything about a workspace into a csv and a json file 24 | - cleanWorkspace: removes all processing folders in a Terra workspace easily 25 | - changeToBucket: moves all bam/bai files in a sampleList from Terra to another gs bucket and renames them in the sample list 26 | - delete_job: removes files generated by a job on Terra 27 | - removeFromFailedWorkflows: lists all files from all jobs that have failed and deletes them 28 | - deleteHeavyFiles: deletes all files above a certain size in a workspace (whether used or unused) 29 | - findFilesInWorkspaces: given all your Terra workspaces, finds a given gs filename 30 | 31 | ## Highly recommended 32 | 33 | *This package won't contain anything that overlaps with these tools and may use them for what it does.* 34 | - firecloud-dalmatian (python) 35 | - gsutil 36 | - nextflow (better than terra) 37 | -------------------------------------------------------------------------------- /genepy/terra/map_terra_workflow.py: -------------------------------------------------------------------------------- 1 | # 1. make sure dalmatian is installed 2 | # pip install firecloud-dalmatian 3 | # 2. 
make sure graphviz is installed (this script will run "dot" to generate images) 4 | # brew install graphviz 5 | 6 | import subprocess 7 | import requests 8 | import pandas as pd 9 | import dalmatian 10 | import subprocess 11 | 12 | 13 | def resolve_dot_path(root_entity_type, name): 14 | name = name.replace(" ", "") 15 | parts = name.split(".") 16 | assert parts[0] == "this" 17 | del parts[0] 18 | cur = root_entity_type 19 | while len(parts) > 1: 20 | next_part = parts[0] 21 | if cur == "sample_set" and next_part == "samples": 22 | cur = "sample" 23 | elif cur == "pair" and next_part == "case_sample": 24 | cur = "sample" 25 | elif cur == "pair" and next_part == "control_sample": 26 | cur = "sample" 27 | elif cur == "sample" and next_part == "participant": 28 | cur = "participant" 29 | else: 30 | raise Exception(f"Unknown case: {cur} -> {next_part} (root_entity_type={root_entity_type} name={name})") 31 | del parts[0] 32 | return cur+"."+parts[0] 33 | 34 | def extract_config_summary(workspace_name, workflows=None): 35 | wm = dalmatian.WorkspaceManager(workspace_name) 36 | configs = wm.get_configs() 37 | 38 | config_summaries = [] 39 | for rec in configs.to_records(): 40 | cfgname = rec['namespace']+"/"+rec['name'] 41 | if workflows is not None: 42 | if cfgname not in workflows: 43 | continue 44 | config = wm.get_config(cfgname) 45 | config['inputs'] = {k:v.strip() for k,v in config['inputs'].items()} 46 | config['outputs'] = {k:v.strip() for k,v in config['outputs'].items()} 47 | inputs=[resolve_dot_path(config['rootEntityType'], x) for x in config['inputs'].values() if x.startswith("this.")] 48 | outputs=[resolve_dot_path(config['rootEntityType'], x) for x in config['outputs'].values() if x.startswith("this.")] 49 | config_summaries.append(dict(inputs=inputs, outputs=outputs, entity_type=rec['rootEntityType'], name=cfgname)) 50 | return config_summaries 51 | 52 | 53 | def write_dependency_graph_image(filename, config_summaries): 54 | with open("/tmp/sample.dot", "wt") as fd: 55 | node_names = {} 56 | def nn(name, is_var): 57 | if name in node_names: 58 | return node_names[name] 59 | node_name = "n{}".format(len(node_names)) 60 | node_names[name] = node_name 61 | fd.write("{} [label=\"{}\" {}];\n".format(node_name, name, {True: "shape=oval", False: "shape=box fillcolor=yellow style=filled"}[is_var])) 62 | return node_names[name] 63 | 64 | fd.write("digraph { rankdir=LR;\n") 65 | for config in config_summaries: 66 | for name in config['inputs']: 67 | fd.write("{} -> {};\n".format(nn(name, True), nn(config['name'], False))) 68 | 69 | for name in config['outputs']: 70 | fd.write("{} -> {};\n".format(nn(config['name'], False), nn(name, True))) 71 | fd.write("}\n") 72 | 73 | 74 | subprocess.check_call(["dot", "/tmp/sample.dot", "-Tpng", "-o", filename]) 75 | 76 | def write_config_summary_table(filename, config_summaries): 77 | from collections import defaultdict 78 | variables = defaultdict(lambda: dict(used_by=[], produced_by=[])) 79 | for config in config_summaries: 80 | for name in config['inputs']: 81 | variables[name]['used_by'].append(config['name']) 82 | 83 | for name in config['outputs']: 84 | variables[name]['produced_by'].append(config['name']) 85 | 86 | var_col = [] 87 | used_by = [] 88 | produced_by = [] 89 | inputs = [] 90 | for k, v in variables.items(): 91 | if len(v["produced_by"]) == 0: 92 | inputs.append(k) 93 | var_col.append(k) 94 | used_by.append(", ".join(v["used_by"])) 95 | produced_by.append(", ".join(v["produced_by"])) 96 | df = pd.DataFrame(dict(variable=var_col, 
used_by=used_by, produced_by=produced_by)) 97 | df.to_csv(filename) 98 | 99 | def map_workspace_diagram(workspace_name, output_path='terra-workflows', workflows=None): 100 | """ 101 | -- adapted from scripts written by Philip Montgomery -- 102 | this function creates a graph of the workflows within a Terra workspace 103 | 104 | inputs: 105 | workspace_name (str): name of the workspace 106 | output_path (str): path where outputs will be saved 107 | workflows (list[str]): list of workflows to consider in the graph. If None is provided (default), 108 | all the workflows in the workspace will be used. 109 | 110 | example code: 111 | workflows = ['stewart/pipette_wgs_SV', 'stewart/manta', 'stewart/SvABA_xtramem', 112 | 'stewart/svaba_snowmanvcf2dRangerForBP', 'stewart/mantavcf2dRangerForBP', 113 | 'stewart/extract_dRanger_intermediates','stewart/pcawg_snowmanvcf2dRangerForBP', 114 | 'stewart/SV_cluster_forBP', 'stewart/breakpointer', 'stewart/Breakpointer_fix_sample', 115 | 'stewart/REBC_SV_consensus_filter_v3'] 116 | 117 | workspace_name = 'broad-firecloud-ccle/REBC_methods_only-tmp' 118 | print(workspace_name) 119 | config = mtw.map_workspace_diagram(workspace_name, workflows=workflows) 120 | """ 121 | configs = extract_config_summary(workspace_name, workflows=workflows) 122 | write_dependency_graph_image(output_path+'/'+workspace_name.replace("/", " ")+".png", configs) 123 | write_config_summary_table(output_path+'/'+workspace_name.replace("/", " ")+".csv", configs) 124 | return configs 125 | -------------------------------------------------------------------------------- /genepy/utils/Datanalytics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def getDFinfo(df): 5 | val = df 6 | print("sums over cell lines! ------mean, var, totmin, meanmin, idxmin, totmax, meanmax, idxmax") 7 | print(val.sum(1).mean(), val.sum(1).var(), val.sum(1).min(), val.mean(1).min(), val.sum(1).idxmin(), val.sum(1).max(), val.mean(1).max(), val.sum(1).idxmax()) 8 | print("sums over features! ------mean, var, totmin, meanmin, idxmin, totmax, meanmax, idxmax") 9 | print(val.sum(0).mean(), val.sum(0).var(), val.sum(0).min(), val.mean(0).min(), val.sum(0).idxmin(), val.sum(0).max(), val.mean(0).max(), val.sum(0).idxmax()) 10 | print("nans!") 11 | print(np.count_nonzero(np.isnan(val))) 12 | 13 | 14 | def compare(df1, df2): 15 | df = pd.concat([df1, df2]) 16 | df = df.reset_index(drop=True) 17 | df_gpby = df.groupby(list(df.columns)) 18 | idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1] 19 | df = df.reindex(idx) 20 | return df 21 | 22 | 23 | 
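# --- Illustrative usage sketch (not part of the original module) ---
# The dataframes below are made up; any numeric pandas DataFrame works.
#
#   a = pd.DataFrame({"g1": [1.0, 2.0], "g2": [3.0, 4.0]}, index=["lineA", "lineB"])
#   b = pd.DataFrame({"g1": [1.0, 2.0], "g2": [3.0, 5.0]}, index=["lineA", "lineB"])
#   getDFinfo(a)          # prints row/column summary statistics and the NaN count
#   diff = compare(a, b)  # keeps only the rows that appear in exactly one of the two dataframes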
-------------------------------------------------------------------------------- /genepy/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | 1. helper functions to save data, generate random strings, run tasks in parallel, etc. 4 | 2. a set of plotting tools based on [matplotlib](https://matplotlib.org) and [bokeh](https://bokeh.org) 5 | 6 | ## Contains: 7 | 8 | _in ./helper.py_ 9 | 10 | - fileToList: converts a txt with a list of values to a python list 11 | - listToFile: converts a list of values to a txt 12 | - dictToFile: converts a dict to a json 13 | - fileToDict: converts a json to a dict 14 | - batchMove: moves a lot of files in batch (can provide different locations) 15 | - batchRename: renames a bunch of files in batch 16 | - createFoldersFor: makes the required folders for a given filepath 17 | - grouped: to use in a for loop to group values in batches 18 | - overlap: given two tuples, returns the overlap 19 | - union: given two tuples, returns the union 20 | - nans: gets nans from a pandas df 21 | - randomString: generates a random string for naming 22 | - parrun: runs a list of commands in parallel 23 | - askif: asks the user a question and returns the y/n answer 24 | - inttodate: converts an int to a string date 25 | - datetoint: converts a date to an int 26 | - showcount: pretty print of i/size%, to put in a for loop 27 | - combin: outputs the number of combinations of n objects taken k at a time 28 | - dups: shows the duplicates in a list 29 | - makeCombinations: produces the probability of X events happening at the same time; will compute it given binomial probabilities of each event occurring and the number of trials 30 | - closest: returns the index of the value closest to K in a list 31 | - compareDfs: compares df1 to df2. Shows column differences, index differences, NaN & 0 differences 32 | 33 | _in ./plot.py_ 34 | 35 | - scatter: makes a hoverable/zoomable bokeh scatter plot 36 | - bigScatter: 37 | - CNV_Map: makes a hoverable Copy Number plot using bokeh 38 | - volcano: makes a searchable volcano plot for a differential expression experiment using bokeh 39 | - correlationMatrix: makes a hoverable bokeh correlation matrix; works with annotations, pvalues, clusters, etc. 40 | - venn: makes a venn diagram from a list of sets 41 | - mergeImages: merges multiple pngs/pdfs together into one 42 | - addTextToImage: adds text to an image at a specific location 43 | - SOMPlot: a tool that takes simpSOM's package output (which produces self-organizing maps) and plots it in an interactive fashion 44 | 45 | ## Other necessary tools 46 | 47 | _I am not creating anything that overlaps with these / I am using these tools_ 48 | 49 | - os (python) 50 | - subprocess (python) 51 | - seaborn (python) 52 | - bokeh (python) 53 | -------------------------------------------------------------------------------- /genepy/utils/RScript.R: -------------------------------------------------------------------------------- 1 | MofaRun <- function(valueList){ 2 | library(reticulate) 3 | MOFAobject <- createMOFAobject(valueList) 4 | DataOptions <- getDefaultDataOptions() 5 | ModelOptions <- getDefaultModelOptions(MOFAobject) 6 | TrainOptions <- getDefaultTrainOptions() 7 | ModelOptions$numFactors <- 200 8 | TrainOptions$DropFactorThreshold <- 0.02 9 | MOFAobject <- prepareMOFA( 10 | MOFAobject, 11 | DataOptions = DataOptions, 12 | ModelOptions = ModelOptions, 13 | TrainOptions = TrainOptions 14 | ) 15 | MOFAobject <- runMOFA(MOFAobject) 16 | return(MOFAobject) 17 | } 18 | 
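# --- Illustrative usage sketch (not part of the original script) ---
# MofaRun() passes valueList straight to MOFA's createMOFAobject(), so it is
# expected to be a named list of omics matrices (one view per entry); the
# object names below are made up.
#
#   valueList <- list(rna = rna_matrix, methylation = meth_matrix)
#   MOFAobject <- MofaRun(valueList)
#   # the trained MOFAobject can then be explored with MOFA's downstream plotting functions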
19 | # Function that converts the segmented data to be continuous (so chromosomes can be plotted in 1, 2, 3, 4... order) 20 | generate_chromosome_cutoffs_list <- function(cyto_band_file="data/hg38_cytoband.gz") { 21 | # Have to edit the chr values to 22 | chr_bp_cutoffs <- read_tsv(cyto_band_file, col_names = F) 23 | cutoffs <- chr_bp_cutoffs %>% 24 | group_by(X1) %>% 25 | dplyr::summarize(pos=max(X3)) %>% 26 | mutate(X1=gsub('chr', '', X1)) %$% 27 | setNames(pos, ifelse(X1 %in% seq(1,21), paste0('chr', as.integer(X1) + 1), ifelse(X1==22, 'chrX', ifelse(X1=='X', 'chrY', 'chrZ')))) 28 | 29 | cutoffs_final <- cutoffs[paste0('chr',c(seq(2, 22), 'X', 'Y'))] %>% cumsum() 30 | cutoffs_final['chr1'] = 0 31 | 32 | return(cutoffs_final) 33 | } -------------------------------------------------------------------------------- /genepy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/genepy/f3a8d65551d726a92341b3cb96d756c349816514/genepy/utils/__init__.py -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: genepy 2 | theme: readthedocs 3 | plugins: 4 | - search 5 | - mkdocstrings -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | # These requirements are for development and testing only, not for production. 2 | pytest 3 | coverage 4 | flake8 5 | black 6 | isort 7 | pytest-cov 8 | codecov 9 | mypy 10 | gitchangelog 11 | mkdocs 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bokeh>=2.2 2 | colorcet 3 | firecloud_dalmatian>=0.0.17 4 | gseapy==0.9.18 5 | gsheets==0.4.1 6 | gspread==3.6.0 7 | matplotlib 8 | oauth2client>=4.1.3 9 | pandas 10 | pybedtools 11 | pyBigWig 12 | pysam 13 | pytest 14 | requests>=2.24.0 15 | scikit_learn 16 | scipy>=1.0.0 17 | seaborn 18 | statsmodels 19 | taigapy>=2.12 20 | venn 21 | biomart -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import sys 3 | import os 4 | import io 5 | import subprocess 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 5: 8 | raise ValueError("genepy is only compatible with Python 3.5 and above") 9 | if sys.version_info.minor < 8: 10 | import warnings 11 | 12 | warnings.warn("genepy may not function properly on Python < 3.8") 13 | 14 | os.system("git submodule init && git submodule sync") 15 | 16 | with open("README.md", "r") as f: 17 | long_description = f.read() 18 | 19 | print("trying to install R packages") 20 | try: 21 | subprocess.run( 22 | 'R -e \'if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install(c("GSEABase", "erccdashboard", "GSVA", "DESeq2"));\'', 23 | shell=True, 24 | check=True, 25 | stdout=subprocess.PIPE, 26 | stderr=subprocess.PIPE, 27 | ) 28 | subprocess.run("pip install rpy2", shell=True, check=True) 29 | except Exception: 30 | print("R packages not 
installed") 31 | print("if it did not work. please install R or check your R installation") 32 | print( 33 | "once R is installed you need to install erccdashboard, GSEABase GSVA, DESeq2 to have access to all the functions" 34 | ) 35 | 36 | 37 | def read(*paths, **kwargs): 38 | """Read the contents of a text file safely. 39 | >>> read("genepy", "VERSION") 40 | '0.1.0' 41 | >>> read("README.md") 42 | ... 43 | """ 44 | 45 | content = "" 46 | with io.open( 47 | os.path.join(os.path.dirname(__file__), *paths), 48 | encoding=kwargs.get("encoding", "utf8"), 49 | ) as open_file: 50 | content = open_file.read().strip() 51 | return content 52 | 53 | 54 | def read_requirements(path): 55 | return [ 56 | line.strip() 57 | for line in read(path).split("\n") 58 | if not line.startswith(('"', "#", "-", "git+")) 59 | ] 60 | 61 | 62 | setup( 63 | name="Broad-genepy", 64 | version=read("genepy", "VERSION"), 65 | description="A useful module for any CompBio", 66 | long_description=long_description, 67 | long_description_content_type="text/markdown", 68 | author="Jeremie Kalfon", 69 | author_email="jkobject@gmail.com", 70 | url="https://github.com/BroadInstitute/genepy", 71 | packages=[ 72 | "genepy/cell_line_mapping-master/python/cell_line_mapper", 73 | "genepy/epigenetics", 74 | "genepy/mutations", 75 | "genepy/google", 76 | "genepy/sequencing/", 77 | "genepy/terra", 78 | "genepy/rna", 79 | "genepy/utils", 80 | ], 81 | package_data={"genepy": ["data/*"]}, 82 | python_requires=">=3.5", 83 | install_requires=read_requirements("requirements.txt"), 84 | classifiers=[ 85 | "Programming Language :: Python :: 3", 86 | "Intended Audience :: Science/Research", 87 | "Topic :: Scientific/Engineering :: Bio-Informatics", 88 | ], 89 | ) 90 | 91 | 92 | print( 93 | "You might want to install Bowtie2, samtools, bwa and R to be able to use all functions of this package:\n\ 94 | http://bowtie-bio.sourceforge.net/bowtie2/index.shtml\n\ 95 | http://www.htslib.org/\n\ 96 | https://github.com/lh3/bwa\n" 97 | ) 98 | 99 | print("Finished!") 100 | --------------------------------------------------------------------------------