├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── cmapPy ├── __init__.py ├── clue_api_client │ ├── __init__.py │ ├── cell_queries.py │ ├── clue_api_client.py │ ├── gene_queries.py │ ├── macchiato_queries.py │ ├── mock_clue_api_client.py │ ├── pert_queries.py │ ├── setup_logger.py │ └── tests │ │ ├── test_cell_queries.py │ │ ├── test_clue_api_client.py │ │ ├── test_gene_queries.py │ │ ├── test_macchiato_queries.py │ │ ├── test_mock_clue_api_client.py │ │ └── test_pert_queries.py ├── example_cmapPy_config_file.cfg ├── math │ ├── __init__.py │ ├── agg_wt_avg.py │ ├── fast_corr.py │ ├── fast_cov.py │ ├── robust_zscore.py │ └── tests │ │ ├── __init__.py │ │ ├── test_agg_wt_avg.py │ │ ├── test_fast_corr.py │ │ ├── test_fast_cov.py │ │ └── test_robust_zscore.py ├── pandasGEXpress │ ├── GCToo.py │ ├── README.rst │ ├── __init__.py │ ├── concat.py │ ├── concat_gctoo.py │ ├── diff_gctoo.py │ ├── gct2gctx.py │ ├── gctx2gct.py │ ├── mini_gctoo_for_testing.py │ ├── parse.py │ ├── parse_gct.py │ ├── parse_gctx.py │ ├── random_slice.py │ ├── setup_GCToo_logger.py │ ├── simple_GCT_to_GCToo_figure.png │ ├── slice_gct.py │ ├── slice_gctoo.py │ ├── subset.py │ ├── subset_gctoo.py │ ├── tests │ │ ├── __init__.py │ │ ├── functional_tests │ │ │ ├── LJP_row_metadata.txt │ │ │ ├── both_metadata_example_n1476x978.gct │ │ │ ├── both_metadata_example_n1476x978.gctx │ │ │ ├── col_meta_only_example_n355x355.gct │ │ │ ├── col_meta_only_example_n355x355.gctx │ │ │ ├── concated.gctx │ │ │ ├── metadata_writer_test.gctx │ │ │ ├── mini_folder │ │ │ │ ├── both_metadata_example_n1476x978.gctx │ │ │ │ ├── col_meta_only_example_n355x355.gctx │ │ │ │ ├── row_meta_only_example_n2x1203.gctx │ │ │ │ └── tsne_n2x1203.gctx │ │ │ ├── mini_gctoo_data_matrix.gctx │ │ │ ├── mini_gctoo_for_testing.gct │ │ │ ├── mini_gctoo_for_testing.gctx │ │ │ ├── mini_gctoo_for_testing_nometa.gct │ │ │ ├── mini_gctoo_for_testing_nometa.gctx │ │ │ ├── 
mini_gctx_with_metadata_n2x3.gctx │ │ │ ├── older_version_v1_2.gct │ │ │ ├── row_meta_only_example_n2x1203.gct │ │ │ ├── row_meta_only_example_n2x1203.gctx │ │ │ ├── test_colmeta_n6.txt │ │ │ ├── test_concat │ │ │ │ └── test_main │ │ │ │ │ ├── a.gct │ │ │ │ │ └── b.gct │ │ │ ├── test_concat_gctoo_test_main_fake_empty_file.gct │ │ │ ├── test_l1000.gct │ │ │ ├── test_l1000.gctx │ │ │ ├── test_l1000_highprecision.gct │ │ │ ├── test_l1000_highprecision.gctx │ │ │ ├── test_merge_bottom.gct │ │ │ ├── test_merge_left.gct │ │ │ ├── test_merge_right.gct │ │ │ ├── test_merge_top.gct │ │ │ ├── test_merged_left_right.gct │ │ │ ├── test_merged_top_bottom.gct │ │ │ ├── test_missing_colmeta.txt │ │ │ ├── test_missing_rowmeta.txt │ │ │ ├── test_p100.gct │ │ │ ├── test_parse_gct_int_ids.gct │ │ │ ├── test_parse_gctx_rid_entrez_id.gctx │ │ │ ├── test_rowmeta_n6.txt │ │ │ ├── test_subset_expected.gct │ │ │ ├── test_subset_in.gct │ │ │ ├── test_subset_rid.grp │ │ │ ├── test_v1point2_n5x10.gct │ │ │ └── tsne_n2x1203.gctx │ │ ├── python2_tests │ │ │ ├── __init__.py │ │ │ ├── test_GCToo.py │ │ │ ├── test_concat.py │ │ │ ├── test_diff_gctoo.py │ │ │ ├── test_edge_cases.py │ │ │ ├── test_gct2gctx.py │ │ │ ├── test_gctx2gct.py │ │ │ ├── test_parse.py │ │ │ ├── test_parse_gct.py │ │ │ ├── test_parse_gctx.py │ │ │ ├── test_random_slice.py │ │ │ ├── test_subset.py │ │ │ ├── test_subset_gctoo.py │ │ │ ├── test_write_gct.py │ │ │ └── test_write_gctx.py │ │ ├── python3_tests │ │ │ ├── __init__.py │ │ │ ├── test_GCToo.py │ │ │ ├── test_concat.py │ │ │ ├── test_diff_gctoo.py │ │ │ ├── test_edge_cases.py │ │ │ ├── test_gct2gctx.py │ │ │ ├── test_gctx2gct.py │ │ │ ├── test_parse.py │ │ │ ├── test_parse_gct.py │ │ │ ├── test_parse_gctx.py │ │ │ ├── test_random_slice.py │ │ │ ├── test_subset.py │ │ │ ├── test_subset_gctoo.py │ │ │ ├── test_transform_gctoo.py │ │ │ ├── test_write_gct.py │ │ │ └── test_write_gctx.py │ │ └── test_python2_python3_compatibility.py │ ├── transform_gctoo.py │ ├── write_gct.py 
│ └── write_gctx.py ├── set_io │ ├── __init__.py │ ├── gmt.py │ ├── grp.py │ └── tests │ │ ├── __init__.py │ │ ├── functional_tests │ │ ├── test.gmt │ │ ├── test.grp │ │ ├── test_bad.gmt │ │ └── test_bad2.gmt │ │ ├── test_gmt.py │ │ └── test_grp.py └── visualization │ ├── .gitignore │ ├── __init__.py │ ├── cohort_view.py │ ├── scattergram.py │ ├── stratogram.py │ ├── test_cohort_view.py │ ├── test_files │ └── PBRANT_CYCLE1_key_metrics_expanded_sample.txt │ ├── test_scattergram.py │ └── test_stratogram.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── available_modules.rst │ ├── build.rst │ ├── citing.rst │ ├── clue_api_client.rst │ ├── conf.py │ ├── contributing.rst │ ├── faq.rst │ ├── index.rst │ ├── licenses.rst │ ├── pandasGEXpress.rst │ ├── pandasgexpress_fig.png │ └── set_io.rst ├── nginx.conf ├── performance_testing ├── python_parse_timing.py └── python_write_timing.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tutorials ├── GCTX_mockup.png ├── GCT_mockup.png └── cmapPy_pandasGEXpress_tutorial.ipynb /.dockerignore: -------------------------------------------------------------------------------- 1 | cmapPy 2 | performance_testing 3 | tutorials 4 | .travis.yml 5 | LICENSE.txt 6 | MANIFEST.in 7 | README.rst 8 | requirements.txt 9 | setup.cfg 10 | setup.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | cmapPy.egg-info/ 4 | .vscode 5 | .gitignore 6 | .idea 7 | docs/build 8 | .DS_Store 9 | *-checkpoint.ipynb 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # set language 2 | language: python 3 | 4 | # requirements 5 | install: 6 | - pip install -r requirements.txt 7 | - python setup.py develop 8 | 9 | matrix: 10 | include: 11 | # run 
pandasGEXpress python2_tests 12 | - python: "2.7" 13 | script: 14 | - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python2_tests/ 15 | 16 | # run pandasGEXpress python3_tests 17 | - python: "3.6" 18 | script: 19 | - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python3_tests/ 20 | 21 | # run set_io tests for python2 22 | - python: "2.7" 23 | script: 24 | - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/ 25 | 26 | # run set_io tests for python3 27 | - python: "3.6" 28 | script: 29 | - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/ 30 | 31 | # run math tests for python2 32 | - python: "2.7" 33 | script: 34 | - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/ 35 | 36 | # run math tests for python3 37 | - python: "3.6" 38 | script: 39 | - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/ 40 | 41 | # run python2_python3_compatibility tests for python2 42 | - python: "2.7" 43 | script: 44 | - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/ 45 | 46 | # run python2_python3_compatibility tests for python3 47 | - python: "3.6" 48 | script: 49 | - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/ 50 | 51 | # what branches of github to use 52 | branches: 53 | only: 54 | - master -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nginx 2 | RUN mkdir -p /usr/share/nginx/html/cmapPy 3 | COPY docs/build/html /usr/share/nginx/html/cmapPy/ 4 | COPY nginx.conf /etc/nginx/ 5 | EXPOSE 9081 6 | CMD ["nginx", "-g", "daemon off;"] -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 
2017, Connectivity Map (CMap) at the Broad Institute, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include cmapPy *.py 2 | recursive-include *.gct 3 | recursive-include *.gctx 4 | recursive-include *.cfg 5 | include LICENSE.txt 6 | include requirements.txt -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |install with bioconda| 2 | 3 | .. |install with bioconda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square 4 | :target: http://bioconda.github.io/recipes/cmappy/README.html 5 | 6 | .. image:: https://badge.fury.io/py/cmapPy.svg 7 | :target: https://badge.fury.io/py/cmapPy 8 | 9 | .. image:: https://travis-ci.org/cmap/cmapPy.svg?branch=master 10 | :target: https://travis-ci.org/cmap/cmapPy 11 | 12 | .. image:: https://readthedocs.org/projects/cmappy/badge/?version=latest 13 | :target: http://cmappy.readthedocs.io/en/latest/?badge=latest 14 | :alt: Documentation Status 15 | 16 | **cmapPy:** Tools for interacting with .gctx and .gct files, and other Connectivity Map resources 17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 18 | **Connectivity Map, Broad Institute of MIT and Harvard** 19 | 20 | Documentation: `<http://cmappy.readthedocs.io/en/latest/>`_ 21 | 22 | For questions/problems, please add an issue (that includes code/files that reproduce your problem) to the repository. 23 | 24 | Contributing 25 | ==================== 26 | 27 | We welcome contributors! For your pull requests, please include the following: 28 | 29 | * Sample code/file that reproducibly causes the bug/issue 30 | * Documented code providing fix 31 | * Unit tests evaluating added/modified methods. 
resource_name = "cells"


def is_cell_line_in_api(my_clue_api_client, cell_id):
    """Check whether the CLUE API reports exactly one cell line with this ID.

    Args:
        my_clue_api_client: client object exposing ``run_count_query``
        cell_id: cell line identifier to look up, e.g. "A375"

    Returns: True when the count returned by the API equals 1, False otherwise
    """
    where_clause = {"cell_id": cell_id}
    count_response = my_clue_api_client.run_count_query(resource_name, where_clause)
    logger.debug("query_result: {}".format(count_response))

    number_of_matches = count_response["count"]
    return number_of_matches == 1
class ClueApiClient(object):
    """Basic class for running queries against CLUE api

    Every request is sent with the ``user_key`` header built in ``__init__``.
    Any response whose status code is not 200 raises ``AssertionError``.
    """

    # text types accepted verbatim as URL path segments (covers py2 unicode and py3 str)
    _TEXT_TYPES = (str, type(u""))

    def __init__(self, base_url=None, user_key=None):
        """
        Args:
            base_url: specific URL to use for the CLUE api, e.g. https://dev-api.clue.io/api/
            user_key: user key to use for authentication, available from CLUE account

        Returns:
        """
        self.base_url = base_url
        self.headers = {"user_key": user_key}

    def _build_url(self, *path_parts):
        """join base_url and the provided path parts with exactly one "/" between base_url and the path

        Args:
            path_parts: resource name, optionally followed by an id or a sub-resource such as "count";
                non-string parts (e.g. integer ids) are converted to strings

        Returns: str - the full URL

        Raises:
            TypeError: if base_url was never provided
        """
        if self.base_url is None:
            raise TypeError("ClueApiClient base_url is None, cannot build a request URL")

        # a trailing slash on base_url would otherwise produce "...api//genes"
        segments = [self.base_url.rstrip("/")]
        for part in path_parts:
            segments.append(part if isinstance(part, ClueApiClient._TEXT_TYPES) else str(part))
        return "/".join(segments)

    def run_filter_query(self, resource_name, filter_clause):
        """run a query (get) against the CLUE api, using the API and user key fields of self and the filter_clause provided

        Args:
            resource_name: str - name of the resource / collection to query - e.g. genes, perts, cells etc.
            filter_clause: dictionary - contains filter to pass to API to; uses loopback specification

        Returns: list of dictionaries containing the results of the query
        """
        url = self._build_url(resource_name)
        params = {"filter": json.dumps(filter_clause)}

        r = requests.get(url, headers=self.headers, params=params)
        logger.debug("requests.get result r.status_code: {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_count_query(self, resource_name, where_clause):
        """run a query (get) against CLUE api

        Args:
            resource_name: str - name of the resource / collection to query - e.g. genes, perts, cells etc.
            where_clause: dictionary - contains where clause to pass to API to; uses loopback specification

        Returns: dictionary containing the results of the query
        """
        url = self._build_url(resource_name, "count")
        params = {"where": json.dumps(where_clause)}

        r = requests.get(url, headers=self.headers, params=params)
        logger.debug("requests.get result r.status_code: {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_post(self, resource_name, data):
        """create an entry (post) in the given resource

        Args:
            resource_name: str - name of the resource / collection to post to
            data: payload handed to requests.post as its ``data`` argument

        Returns: the decoded JSON body of the response
        """
        url = self._build_url(resource_name)

        r = requests.post(url, data=data, headers=self.headers)
        logger.debug("requests.post result r.status_code: {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_delete(self, resource_name, id):
        """delete the entry with the given id from the given resource

        Args:
            resource_name: str - name of the resource / collection to delete from
            id: identifier of the entry to delete

        Returns: bool - True if the API reported that exactly one entry was deleted
        """
        url = self._build_url(resource_name, id)
        r = requests.delete(url, headers=self.headers)
        logger.debug("requests.delete result r.status_code: {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        did_delete = r.json()["count"] == 1
        return did_delete

    def run_put(self, resource_name, id, data):
        """update (put) the entry with the given id in the given resource

        Args:
            resource_name: str - name of the resource / collection to update
            id: identifier of the entry to update
            data: payload handed to requests.put as its ``data`` argument

        Returns: the decoded JSON body of the response
        """
        url = self._build_url(resource_name, id)

        r = requests.put(url, data=data, headers=self.headers)
        logger.debug("requests.put result r.status_code: {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    @staticmethod
    def _check_request_response(response):
        """raise if the response does not have status code 200

        Raised explicitly instead of via an ``assert`` statement so the check still runs when python is started
        with -O; the exception type (AssertionError) and message are the same as before.

        Raises:
            AssertionError: if response.status_code is not 200
        """
        if response.status_code != 200:
            raise AssertionError(
                "ClueApiClient request failed response.status_code: {} response.reason: {}".format(
                    response.status_code, response.reason))
resource_name = "genes"


def are_genes_in_api(my_clue_api_client, gene_symbols):
    """determine if genes are present in the API

    Args:
        my_clue_api_client: client object exposing ``run_filter_query``
        gene_symbols: collection of gene symbols to query the API with; any iterable is accepted
            (list, set, tuple, generator).  A single string is treated as one gene symbol rather
            than being split into characters, and None is treated as an empty collection.

    Returns: set of the found gene symbols

    """
    if gene_symbols is None:
        query_gene_symbols = []
    elif isinstance(gene_symbols, (str, type(u""))):
        # list("AKT1") would query for "A", "K", "T", "1"
        query_gene_symbols = [gene_symbols] if gene_symbols else []
    else:
        # materialize once so sets and generators become JSON-serializable and can be checked for emptiness
        query_gene_symbols = list(gene_symbols)

    if len(query_gene_symbols) == 0:
        logger.warning("provided gene_symbols was empty, cannot run query")
        return set()

    query_result = my_clue_api_client.run_filter_query(
        resource_name,
        {"where": {"gene_symbol": {"inq": query_gene_symbols}}, "fields": {"gene_symbol": True}})
    logger.debug("query_result: {}".format(query_result))

    return {x["gene_symbol"] for x in query_result}
resource_name = "macchiato"

uploading_status = "UPLOADING"
uploaded_status = "UPLOADED"


def is_brew_prefix_in_api(my_clue_api_client, brew_prefix):
    """determine whether exactly one macchiato entry with the given brew_prefix is present in the API

    Args:
        my_clue_api_client: client object exposing ``run_count_query``
        brew_prefix: brew prefix to look for

    Returns: True if the API reports a count of 1 for the brew_prefix, False otherwise
    """
    my_where_clause = {"brew_prefix": brew_prefix}
    query_result = my_clue_api_client.run_count_query(resource_name, my_where_clause)
    logger.debug("query_result: {}".format(query_result))
    return query_result["count"] == 1


def get_api_id(my_clue_api_client, brew_prefix):
    """look up the API id of the macchiato entry with the given brew_prefix

    Args:
        my_clue_api_client: client object exposing ``run_filter_query``
        brew_prefix: brew prefix of the entry to look up

    Returns: the "id" field of the first entry returned by the API

    Raises:
        IndexError: if the query returns no entries
    """
    my_filter = {"where": {"brew_prefix": brew_prefix}, "fields": {"id": True}}
    id_result = my_clue_api_client.run_filter_query(resource_name, my_filter)
    logger.debug("id_result: {}".format(id_result))
    return id_result[0]["id"]


def change_status(my_clue_api_client, api_id, new_status):
    """set the status of the macchiato entry identified by api_id

    Args:
        my_clue_api_client: client object exposing ``run_put``
        api_id: id of the entry to update
        new_status: status value to store

    Returns: whatever the client's run_put returns
    """
    r = my_clue_api_client.run_put(resource_name, api_id, {"status": new_status})
    return r


def create_brew_prefix_in_api(my_clue_api_client, brew_prefix, status=uploading_status):
    """create a macchiato entry for the given brew_prefix

    Args:
        my_clue_api_client: client object exposing ``run_post``
        brew_prefix: brew prefix of the entry to create
        status: status to store on the new entry; defaults to uploading_status

    Returns: whatever the client's run_post returns
    """
    # use the caller-supplied status; previously the argument was ignored and uploading_status was always sent
    data = {"brew_prefix": brew_prefix, "status": status}
    r = my_clue_api_client.run_post(resource_name, data)
    return r
class MockClueApiClient(clue_api_client.ClueApiClient):
    """Stand-in for ClueApiClient that returns pre-configured results instead of issuing HTTP requests.

    Each run_* method ignores its arguments and returns the result configured for it in __init__.
    """

    def __init__(self, base_url=None, user_key=None, default_return_values=None, filter_query_result=None,
                 count_query_result=None, post_result=None, delete_result=None, put_result=None):
        """
        Args:
            base_url: passed through to ClueApiClient
            user_key: passed through to ClueApiClient
            default_return_values: result returned by every method that has no specific result configured;
                defaults to an empty list
            filter_query_result: result returned by run_filter_query
            count_query_result: result returned by run_count_query
            post_result: result returned by run_post
            delete_result: result returned by run_delete
            put_result: result returned by run_put

        Only None means "not configured": explicitly provided falsy results (e.g. [] for "no matches" or
        False for "nothing deleted") are returned as given rather than being replaced by the default.
        """
        super(MockClueApiClient, self).__init__(base_url=base_url, user_key=user_key)

        self.default_return_values = [] if default_return_values is None else default_return_values

        self.filter_query_result = self._result_or_default(filter_query_result)
        self.count_query_result = self._result_or_default(count_query_result)
        self.post_result = self._result_or_default(post_result)
        self.delete_result = self._result_or_default(delete_result)
        self.put_result = self._result_or_default(put_result)

    def _result_or_default(self, configured_result):
        """return configured_result unless it is None, in which case return the default return values"""
        return self.default_return_values if configured_result is None else configured_result

    def run_filter_query(self, resource_name, filter_clause):
        return self.filter_query_result

    def run_count_query(self, resource_name, where_clause):
        return self.count_query_result

    def run_post(self, resource_name, data):
        return self.post_result

    def run_delete(self, resource_name, id):
        return self.delete_result

    def run_put(self, resource_name, id, data):
        return self.put_result
# --- cmapPy/clue_api_client/pert_queries.py ---

resource_name = "perts"


def retrieve_pert_id_pert_iname_map(pert_ids, my_clue_api_client):
    """build a map from pert_id to pert_iname for the provided pert_ids

    Args:
        pert_ids: iterable of pert_id values to look up (a single string is treated as one pert_id)
        my_clue_api_client: client object exposing ``run_filter_query``

    Returns: dictionary mapping each pert_id returned by the API to its pert_iname
    """
    return _retrieve_pert_id_field_map(pert_ids, my_clue_api_client, "pert_iname")


def retrieve_pert_id_pert_type_map(pert_ids, my_clue_api_client):
    """build a map from pert_id to pert_type for the provided pert_ids

    Args:
        pert_ids: iterable of pert_id values to look up (a single string is treated as one pert_id)
        my_clue_api_client: client object exposing ``run_filter_query``

    Returns: dictionary mapping each pert_id returned by the API to its pert_type
    """
    return _retrieve_pert_id_field_map(pert_ids, my_clue_api_client, "pert_type")


def _retrieve_pert_id_field_map(pert_ids, my_clue_api_client, value_field):
    """query the perts resource for pert_ids and map each returned pert_id to its value_field"""
    if isinstance(pert_ids, (str, type(u""))):
        query_pert_ids = [pert_ids]
    else:
        # the filter is serialized to JSON by the client, so sets / tuples / generators must become a list
        query_pert_ids = list(pert_ids)

    my_filter = {"where": {"pert_id": {"inq": query_pert_ids}}, "fields": {"pert_id": True, value_field: True}}
    query_result = my_clue_api_client.run_filter_query(resource_name, my_filter)
    logger.debug("query_result: {}".format(query_result))

    return _build_map_from_clue_api_result(query_result, "pert_id", value_field)


def _build_map_from_clue_api_result(clue_api_result, key_field, value_field):
    """map key_field to value_field for every entry of clue_api_result; later entries win on duplicate keys"""
    return {car[key_field]: car[value_field] for car in clue_api_result}


# --- cmapPy/clue_api_client/setup_logger.py ---

LOGGER_NAME = "cmap_logger"

_LOG_FORMAT = "%(levelname)s %(asctime)s %(module)s %(funcName)s %(message)s"
_LOG_FILE_MAX_BYTES = 10000000
_LOG_FILE_BACKUP_COUNT = 5


def setup(verbose=False, log_file=None):
    """configure logging for the logger named LOGGER_NAME

    Args:
        verbose: if True log at DEBUG level, otherwise at INFO level
        log_file: if None, logging is configured through logging.basicConfig (root logger);
            otherwise a RotatingFileHandler writing to log_file is added to the LOGGER_NAME logger

    NOTE: each call with a log_file adds another handler to the logger.
    """
    logger = logging.getLogger(LOGGER_NAME)

    level = (logging.DEBUG if verbose else logging.INFO)

    if log_file is None:
        logging.basicConfig(level=level, format=_LOG_FORMAT)
    else:
        logger.setLevel(level)
        handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=_LOG_FILE_MAX_BYTES,
                                                       backupCount=_LOG_FILE_BACKUP_COUNT)
        handler.setFormatter(logging.Formatter(fmt=_LOG_FORMAT))
        logger.addHandler(handler)
cao = None


class TestCellQueries(unittest.TestCase):
    """Tests for cell_queries, run against the API through the module-level ``cao`` client."""

    def test_is_cell_line_in_api(self):
        # a cell line the test expects to find in the API
        known_cell_line_found = cq.is_cell_line_in_api(cao, "A375")
        self.assertTrue(known_cell_line_found)

        # a cell line that should not be present
        fake_cell_line_found = cq.is_cell_line_in_api(cao, "Dave Lahr's fake cell line that never existed")
        self.assertFalse(fake_cell_line_found)


if __name__ == "__main__":
    setup_logger.setup(verbose=True)

    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()
config_filepath = os.path.expanduser("~/.cmapPy.cfg")
config_section = "test"
cao = None

test_brew_prefix = "dlahr brew prefix 001"
test_status = "my fake status"


class TestClueApiClient(unittest.TestCase):
    """Tests for ClueApiClient.

    All tests except test__check_request_response issue requests through the module-level ``cao`` client,
    which is built from the config file when this module is run as a script.
    """

    def test_run_query(self):
        #get one gene
        r = cao.run_filter_query("genes", {"where":{"entrez_id":5720}})
        self.assertIsNotNone(r)
        logger.debug("len(r): {}".format(len(r)))
        logger.debug("r: {}".format(r))
        self.assertEqual(1, len(r))

        #get multiple genes
        r = cao.run_filter_query("genes", {"where":{"entrez_id":{"inq":[5720,207]}}})
        self.assertIsNotNone(r)
        logger.debug("len(r): {}".format(len(r)))
        logger.debug("r: {}".format(r))
        self.assertEqual(2, len(r))

        #pert_id that is expected to have no matches
        r = cao.run_filter_query("perts", {"where":{"pert_id":"BRD-K12345678"}})
        self.assertIsNotNone(r)
        logger.debug("len(r): {}".format(len(r)))
        self.assertEqual(0, len(r))

    def test_run_query_handle_fail(self):
        with self.assertRaises(Exception) as context:
            cao.run_filter_query("fakeresource", {})
        #checks on the exception must sit outside the with block, otherwise they would never run
        self.assertIsNotNone(context.exception)
        logger.debug("context.exception: {}".format(context.exception))
        self.assertIn("ClueApiClient request failed", str(context.exception))

    def test_run_where_query(self):
        r = cao.run_count_query("cells", {"cell_id":"A375"})
        self.assertIsNotNone(r)
        logger.debug("r: {}".format(r))
        self.assertIn("count", r)
        self.assertEqual(1, r["count"])

    def test__check_request_response(self):
        FakeResponse = collections.namedtuple("FakeResponse", ["status_code", "reason"])

        #happy path
        fr = FakeResponse(200, None)
        clue_api_client.ClueApiClient._check_request_response(fr)

        #response status code that should cause failure
        fr2 = FakeResponse(404, "I don't need a good reason!")
        with self.assertRaises(Exception) as context:
            clue_api_client.ClueApiClient._check_request_response(fr2)
        logger.debug("context.exception: {}".format(context.exception))
        self.assertIn(str(fr2.status_code), str(context.exception))
        self.assertIn(fr2.reason, str(context.exception))

    def test_run_post(self):
        #check that the entry isn't already there, if it is delete it
        check_result = cao.run_count_query("macchiato", {"brew_prefix":test_brew_prefix})
        if check_result["count"] == 1:
            lookup_result = cao.run_filter_query("macchiato", {"where":{"brew_prefix":test_brew_prefix}})[0]
            cao.run_delete("macchiato", lookup_result["id"])

        #happy path
        data = {"brew_prefix":test_brew_prefix, "status":test_status}
        r = cao.run_post("macchiato", data)
        self.assertIsNotNone(r)
        logger.debug("r: {}".format(r))
        self.assertIn("brew_prefix", r)
        self.assertEqual(data["brew_prefix"], r["brew_prefix"])
        self.assertIn("id", r)
        #check that user key has not been added to entry
        self.assertNotIn("user_key", r)

        #clean up
        r = cao.run_delete("macchiato", r["id"])

    def test_run_delete(self):
        #check that there is an entry to delete, if not create it
        lookup_result = add_entry_if_not_already_present(cao, "macchiato", {"brew_prefix":test_brew_prefix},
                                                         {"brew_prefix":test_brew_prefix, "status": test_status})

        delete_id = lookup_result["id"]

        #happy path
        r = cao.run_delete("macchiato", delete_id)
        self.assertIsNotNone(r)
        logger.debug("r: {}".format(r))
        self.assertTrue(r)

    def test_run_put(self):
        #check that there is an entry to update, if not create it
        lookup_result = add_entry_if_not_already_present(cao, "macchiato", {"brew_prefix":test_brew_prefix},
                                                         {"brew_prefix":test_brew_prefix, "status": test_status})

        put_id = lookup_result["id"]

        expected_status = "test status for test_clue_api_client test_run_put"
        r = cao.run_put("macchiato", put_id, {"status":expected_status})
        self.assertIsNotNone(r)
        logger.debug("r: {}".format(r))
        self.assertIn("status", r)
        self.assertEqual(expected_status, r["status"])
        self.assertNotIn("user_key", r)


def build_clue_api_client_from_default_test_config():
    """build a ClueApiClient from the config_section section of the file at config_filepath

    The section must provide the options "clue_api_url" and "clue_api_user_key".

    Returns: ClueApiClient configured with the URL and user key read from the config file
    """
    #configparser is not imported at module level; import it here, under its python2 or python3 name
    try:
        import configparser
    except ImportError:
        import ConfigParser as configparser

    cfg = configparser.RawConfigParser()
    cfg.read(config_filepath)
    cao = clue_api_client.ClueApiClient(base_url=cfg.get(config_section, "clue_api_url"),
                                        user_key=cfg.get(config_section, "clue_api_user_key"))
    return cao


def add_entry_if_not_already_present(my_clue_api_orm, resource_name, where_query, default_data):
    """return an entry of resource_name matching where_query, creating it from default_data if none exists

    Args:
        my_clue_api_orm: client object exposing run_count_query, run_post and run_filter_query
        resource_name: name of the resource / collection to look in
        where_query: where clause identifying the entry
        default_data: data to post if no entry matches where_query

    Returns: the newly posted entry if the count was 0, otherwise the first entry matching where_query
    """
    check_result = my_clue_api_orm.run_count_query(resource_name, where_query)
    if check_result["count"] == 0:
        lookup_result = my_clue_api_orm.run_post(resource_name, default_data)
    else:
        lookup_result = my_clue_api_orm.run_filter_query(resource_name, {"where":where_query})[0]

    return lookup_result


if __name__ == "__main__":
    setup_logger.setup(verbose=True)

    cao = build_clue_api_client_from_default_test_config()

    unittest.main()
cao = None


class TestGeneQueries(unittest.TestCase):
    """Tests for gene_queries, run against the API through the module-level ``cao`` client."""

    def test_are_genes_in_api(self):
        #happy path mix of valid and invalid genes
        mixed_symbols = ["AKT1", "BRAF", "Dave Lahr's fake cell line that never existed"]
        found = gq.are_genes_in_api(cao, mixed_symbols)
        logger.debug("r: {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(2, len(found))
        for expected_symbol in ("AKT1", "BRAF"):
            self.assertIn(expected_symbol, found)

        #happy path provide genes as set
        found = gq.are_genes_in_api(cao, {"AKT1"})
        logger.debug("r: {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(1, len(found))
        self.assertIn("AKT1", found)

    def test_are_genes_in_api_no_genes_provided(self):
        #an empty collection should produce an empty, non-None result
        found = gq.are_genes_in_api(cao, set())
        logger.debug("r: {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(0, len(found))


if __name__ == "__main__":
    setup_logger.setup(verbose=True)

    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()
class TestMacchiatoQueries(unittest.TestCase):
    """Tests for macchiato_queries, run against the module-level ``cao`` client."""

    def setUp(self):
        # make sure the entry the tests look up exists before each test runs
        where_query = {"brew_prefix":test_brew_prefix}
        default_data = {"brew_prefix":test_brew_prefix, "status": test_status}
        test_clue_api_client.add_entry_if_not_already_present(
            cao, mq.resource_name, where_query, default_data)

    def test_is_brew_prefix_in_api(self):
        is_present = mq.is_brew_prefix_in_api(cao, test_brew_prefix)
        self.assertTrue(is_present)

        is_present = mq.is_brew_prefix_in_api(
            cao, "Dave Lahr's fake brew prefix that hopefully will never exist in the API")
        self.assertFalse(is_present)

    def test_get_api_id(self):
        api_id = mq.get_api_id(cao, test_brew_prefix)
        self.assertIsNotNone(api_id)
        logger.debug("r: {}".format(api_id))

    def test_change_status(self):
        entry_id = mq.get_api_id(cao, test_brew_prefix)

        expected_new_status = "test status for test_macchiato_queries TestMacchiatoQueries.test_change_status"
        response = mq.change_status(cao, entry_id, expected_new_status)
        self.assertIsNotNone(response)
        logger.debug("r: {}".format(response))
        self.assertIn("status", response)
        self.assertEqual(expected_new_status, response["status"])

    def test_create_brew_prefix_in_api(self):
        # happy path
        expected_brew_prefix = "brew_prefix for TestMacchiatoQueries.test_create_brew_prefix_in_api"
        created = mq.create_brew_prefix_in_api(cao, expected_brew_prefix, status=test_status)
        self.assertIsNotNone(created)
        logger.debug("r: {}".format(created))
        self.assertIn("id", created)
        created_id = created["id"]
        self.assertIsNotNone(created_id)

        # cleanup by deleting the entry that was just created
        cao.run_delete(mq.resource_name, created_id)
class TestMockClueApiClient(unittest.TestCase):
    """Checks that every run_* method of MockClueApiClient returns the configured default."""

    def test_run(self):
        mcao = mock_clue_api_client.MockClueApiClient(default_return_values=[{"hello":"world"}])

        methods_under_test = (
            mcao.run_filter_query,
            mcao.run_count_query,
            mcao.run_delete,
            mcao.run_post,
            mcao.run_put,
        )
        for method in methods_under_test:
            # run_put is called with one additional positional argument
            extra_args = (None,) if method == mcao.run_put else ()
            result = method("fake resource name", {"unused":"filter"}, *extra_args)

            self.assertIsNotNone(result)
            logger.debug("r: {}".format(result))
            self.assertEqual(1, len(result))

            only_entry = result[0]
            self.assertEqual(1, len(only_entry))
            self.assertIn("hello", only_entry)
            self.assertEqual("world", only_entry["hello"])
class TestPertQueries(unittest.TestCase):
    """Tests for the pert query helpers reached through the ``pq`` module alias.

    NOTE(review): in this file ``pq`` is imported from
    cmapPy.clue_api_client.cell_queries, not pert_queries - confirm that is
    the intended module.
    """

    def _assert_only_valid_pert_ids_mapped(self, mapping):
        # shared checks: both valid BRD ids are mapped, the invalid one is absent
        self.assertIsNotNone(mapping)
        logger.debug("r: {}".format(mapping))
        self.assertEqual(2, len(mapping))
        for pert_id in ("BRD-K21680192", "BRD-K88378636"):
            self.assertIn(pert_id, mapping)
            self.assertIsNotNone(mapping[pert_id])
        self.assertNotIn("not a valid BRD", mapping)

    def test__build_map_from_clue_api_result(self):
        api_result = [{"a": "b", "c": "d"}]
        mapping = pq._build_map_from_clue_api_result(api_result, "a", "c")
        self.assertIsNotNone(mapping)
        logger.debug("r: {}".format(mapping))
        self.assertEqual(1, len(mapping))
        self.assertIn("b", mapping)
        self.assertEqual("d", mapping["b"])

    def test_retrieve_pert_id_pert_iname_map(self):
        mapping = pq.retrieve_pert_id_pert_iname_map(
            ["BRD-K21680192", "BRD-K88378636", "not a valid BRD"], cao)
        self._assert_only_valid_pert_ids_mapped(mapping)

    def test_retrieve_pert_id_pert_type_map(self):
        mapping = pq.retrieve_pert_id_pert_type_map(
            ["BRD-K21680192", "BRD-K88378636", "not a valid BRD"], cao)
        self._assert_only_valid_pert_ids_mapped(mapping)
'''
agg_wt_avg.py

Aggregate a matrix of replicate profiles into a single signature using
a weighted average based on the correlation between replicates. That is, if
one replicate is less correlated with the other replicates, its values will
not be weighted as highly in the aggregated signature.

Equivalent to the 'modz' method in mortar.
'''

import numpy as np

rounding_precision = 4


def get_upper_triangle(correlation_matrix):
    ''' Extract upper triangle from a square matrix. Negative values are
    set to 0.

    Args:
        correlation_matrix (pandas df): Correlations between all replicates

    Returns:
        upper_tri_df (pandas df): Upper triangle extracted from
            correlation_matrix; rid is the row index, cid is the column index,
            corr is the extracted correlation value
    '''
    # Boolean mask of the entries strictly above the diagonal. The builtin
    # bool is used because the np.bool alias is no longer available in NumPy.
    above_diagonal = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    upper_triangle = correlation_matrix.where(above_diagonal)

    # convert matrix into long form description; dropna() discards the
    # masked-out entries explicitly so the result does not depend on whether
    # stack() itself removes missing values
    upper_tri_df = upper_triangle.stack().dropna().reset_index(level=1)
    upper_tri_df.columns = ['rid', 'corr']

    # Index at this point is cid, it now becomes a column
    upper_tri_df.reset_index(level=0, inplace=True)

    # Get rid of negative values
    upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0)

    return upper_tri_df.round(rounding_precision)


def calculate_weights(correlation_matrix, min_wt):
    ''' Calculate a weight for each profile based on its correlation to other
    replicates. Negative correlations are clipped to 0, and weights are clipped
    to be min_wt at the least.

    The input correlation_matrix is not modified.

    Args:
        correlation_matrix (pandas df): Correlations between all replicates
        min_wt (float): Minimum raw weight when calculating weighted average

    Returns:
        raw weights (pandas series):  Mean correlation to other replicates
        weights (pandas series): raw_weights normalized such that they add to 1
    '''
    # blank out the diagonal (self-correlation) with NaN; where() returns a
    # new frame, so the caller's matrix keeps its diagonal
    on_diagonal = np.eye(*correlation_matrix.shape, dtype=bool)
    off_diagonal = correlation_matrix.where(~on_diagonal)

    # remove negative values
    off_diagonal = off_diagonal.clip(lower=0)

    # get average correlation for each profile (will ignore NaN)
    raw_weights = off_diagonal.mean(axis=1)

    # threshold weights
    raw_weights = raw_weights.clip(lower=min_wt)

    # normalize raw_weights so that they add to 1
    weights = raw_weights / sum(raw_weights)

    return raw_weights.round(rounding_precision), weights.round(rounding_precision)


def agg_wt_avg(mat, min_wt=0.01, corr_metric='spearman'):
    ''' Aggregate a set of replicate profiles into a single signature using
    a weighted average.

    Args:
        mat (pandas df): a matrix of replicate profiles, where the columns are
            samples and the rows are features; columns correspond to the
            replicates of a single perturbagen
        min_wt (float): Minimum raw weight when calculating weighted average
        corr_metric (string): Spearman or Pearson; the correlation method

    Returns:
        out_sig (pandas series): weighted average values; when mat has a
            single column, mat itself is returned unchanged
        upper_tri_df (pandas df): the correlations between each profile that
            went into the signature (None for a single column)
        raw weights (pandas series): weights before normalization (None for a
            single column)
        weights (pandas series): weights after normalization (None for a
            single column)

    Raises:
        AssertionError: if mat has no columns or corr_metric is not
            "spearman" or "pearson"
    '''
    assert mat.shape[1] > 0, "mat is empty! mat: {}".format(mat)

    # a single replicate cannot be correlated with anything; pass it through
    if mat.shape[1] == 1:
        return mat, None, None, None

    assert corr_metric in ["spearman", "pearson"]

    # Make correlation matrix column wise
    corr_mat = mat.corr(method=corr_metric)

    # Save the values in the upper triangle
    upper_tri_df = get_upper_triangle(corr_mat)

    # Calculate weight per replicate
    raw_weights, weights = calculate_weights(corr_mat, min_wt)

    # Apply weights to values (weights align with the columns of mat)
    weighted_values = mat * weights
    out_sig = weighted_values.sum(axis=1)

    return out_sig, upper_tri_df, raw_weights, weights
def calculate_moments_with_additional_mask(x, mask):
    """calculate moments of the columns of x, excluding the entries masked within x, once for
    each of the masking columns in mask.  Number of rows in x and mask must be the same.

    Args:
        x (numpy.ma.array like)
        mask (numpy array-like boolean)

    returns tuple of (expectation of x, expectation of x squared, variance of x)
    """
    overlap_counts = fast_cov.calculate_non_mask_overlaps(x.mask, mask)

    # 1.0 where the additional mask keeps an entry, 0.0 where it hides it
    unmask = 1.0 * ~mask

    sum_x = numpy.ma.dot(x.T, unmask)
    expect_x = (sum_x / overlap_counts).T

    x_squared = numpy.power(x, 2.0)
    sum_x_squared = numpy.ma.dot(x_squared.T, unmask)
    expect_x_squared = (sum_x_squared / overlap_counts).T

    # E[x^2] - E[x]^2, rescaled by n / (n - 1)
    counts_t = overlap_counts.T
    var_x = (expect_x_squared - numpy.power(expect_x, 2.0)) * counts_t / (counts_t - 1)

    return expect_x, expect_x_squared, var_x


def nan_fast_corr(x, y=None, destination=None):
    """calculate the pearson correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN),
    or optionally, the pearson correlation matrix between x and y (with dimensions OxP).  If destination is
    provided, put the results there.
    In the language of statistics the columns are the variables and the rows are the observations.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
        destination (numpy array-like) optional location where to store the results as they are calculated
            (e.g. a numpy memmap of a file)

    returns (numpy array-like) array of the correlation values
        for defaults (y=None), shape is NxN
        if y is provided, shape is NxP
    """
    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
    y_masked = x_masked if y is None else numpy.ma.array(y, mask=numpy.isnan(y))

    r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)

    # standard deviation of the columns of each matrix, given the masking from the other
    var_x = calculate_moments_with_additional_mask(x_masked, y_masked.mask)[-1]
    std_x = numpy.sqrt(var_x)

    var_y = calculate_moments_with_additional_mask(y_masked, x_masked.mask)[-1]
    std_y = numpy.sqrt(var_y)

    # turn the covariances into correlations in place
    numpy.divide(r, std_x.T, out=r)
    numpy.divide(r, std_y, out=r)

    return r


def fast_spearman(x, y=None, destination=None):
    """calculate the spearman correlation matrix for the columns of x (with dimensions MxN), or optionally, the
    spearman correlation matrix between the columns of x and the columns of y (with dimensions OxP).  If
    destination is provided, put the results there.
    In the language of statistics the columns are the variables and the rows are the observations.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
        destination (numpy array-like) optional location where to store the results as they are calculated
            (e.g. a numpy memmap of a file)

    returns:
        (numpy array-like) array of the correlation values
        for defaults (y=None), shape is NxN
        if y is provided, shape is NxP
    """
    return _fast_spearman(fast_corr, x, y, destination)


def _fast_spearman(corr_method, x, y, destination):
    """internal method for calculating spearman correlation: ranks the columns of x (and y when given)
    and hands the ranks to corr_method, so that either a fast method (fast_corr) or one tolerant of
    nan's (nan_fast_corr) can be used for the correlation itself
    """
    def column_ranks(values):
        # average ranks per column; nan entries stay nan
        return pandas.DataFrame(values).rank(method="average", na_option="keep").values

    logger.debug("x.shape: {}".format(x.shape))
    if hasattr(y, "shape"):
        logger.debug("y.shape: {}".format(y.shape))

    x_ranks = column_ranks(x)
    leading_ranks = x_ranks[:10]
    logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(
        numpy.min(leading_ranks, axis=0), numpy.max(leading_ranks, axis=0)))

    y_ranks = None if y is None else column_ranks(y)

    return corr_method(x_ranks, y_ranks, destination=destination)
def _fast_dot_divide(x, y, destination):
    """helper method for use within the _fast_cov method - carry out the dot product and subsequent
    division to generate the covariance values.  For use when there are no missing values.

    The result is written into destination; nothing is returned.
    """
    numpy.dot(x.T, y, out=destination)
    numpy.divide(destination, (x.shape[0] - 1), out=destination)


def calculate_non_mask_overlaps(x_mask, y_mask):
    """for two mask arrays (x_mask, y_mask - boolean arrays) determine the number of entries in common there would be for each
    entry if their dot product were taken
    """
    x_is_not_nan = 1 * ~x_mask
    y_is_not_nan = 1 * ~y_mask

    r = numpy.dot(x_is_not_nan.T, y_is_not_nan)
    return r


def _nan_dot_divide(x, y, destination):
    """helper method for use within the _fast_cov method - carry out the dot product and subsequent
    division to generate the covariance values.  For use when there are missing values.

    The result is written into destination; nothing is returned.
    """
    numpy.ma.dot(x.T, y, out=destination)

    # per-entry (number of jointly unmasked observations - 1)
    divisor = calculate_non_mask_overlaps(x.mask, y.mask) - 1

    numpy.ma.divide(destination, divisor, out=destination)


def fast_cov(x, y=None, destination=None):
    """calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the
    columns of x and and the columns of y (MxP).  (In the language of statistics, the columns are variables, the rows
    are observations).

    Args:
        x (numpy array-like) MxN in shape
        y (numpy array-like) MxP in shape
        destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
            memmap of a file)

    returns (numpy array-like) array of the covariance values
        for defaults (y=None), shape is NxN
        if y is provided, shape is NxP

    raises CmapPyMathFastCovInvalidInputXY if the inputs fail validate_inputs
    """
    r = _fast_cov(numpy.mean, _fast_dot_divide, x, y, destination)

    return r


def _fast_cov(mean_method, dot_divide_method, x, y, destination):
    """shared implementation of fast_cov and nan_fast_cov: validate the inputs, mean-center the
    columns using mean_method, and let dot_divide_method fill destination with the covariances
    """
    validate_inputs(x, y, destination)

    # 1-dimensional input is treated as a single column
    new_x = x if len(x.shape) == 2 else x[:, numpy.newaxis]

    if y is None:
        y = new_x
    new_y = y if len(y.shape) == 2 else y[:, numpy.newaxis]

    if destination is None:
        destination = numpy.zeros((new_x.shape[1], new_y.shape[1]))

    mean_x = mean_method(new_x, axis=0)
    mean_y = mean_method(new_y, axis=0)

    # cast to the destination dtype so the dot product can be written straight into it
    mean_centered_x = (new_x - mean_x).astype(destination.dtype)
    mean_centered_y = (new_y - mean_y).astype(destination.dtype)

    dot_divide_method(mean_centered_x, mean_centered_y, destination)

    return destination


def validate_inputs(x, y, destination):
    """check that x, y and destination are array-like and have compatible shapes

    All problems found are collected into one message.  Shape comparisons are only attempted
    for inputs that actually have a "shape" attribute, so a non array-like input is reported
    through CmapPyMathFastCovInvalidInputXY rather than failing with an AttributeError.

    raises CmapPyMathFastCovInvalidInputXY if any problem was found
    """
    error_msg = ""

    x_has_shape = hasattr(x, "shape")
    if not x_has_shape:
        error_msg += "x needs to be numpy array-like but it does not have \"shape\" attribute - type(x): {}\n".format(type(x))

    destination_has_shape = hasattr(destination, "shape")
    if destination is not None and not destination_has_shape:
        error_msg += "destination needs to be numpy array-like but it does not have \"shape\" attribute - type(destination): {}\n".format(type(destination))

    # shapes of x and destination can only be compared when both are array-like
    can_check_destination = destination is not None and destination_has_shape and x_has_shape

    if y is None:
        if can_check_destination:
            expected_dim = x.shape[1] if len(x.shape) == 2 else 1
            expected_shape = (expected_dim, expected_dim)
            if destination.shape != expected_shape:
                error_msg += "x and destination provided, therefore destination must have shape matching number of columns of x but it does not - x.shape: {}  expected_shape: {}  destination.shape: {}\n".format(
                    x.shape, expected_shape, destination.shape)
    elif not hasattr(y, "shape"):
        error_msg += "y needs to be numpy array-like but it does not have \"shape\" attribute - type(y): {}\n".format(type(y))
    elif x_has_shape:
        if x.shape[0] != y.shape[0]:
            error_msg += "the number of rows in the x and y matrices must be the same - x.shape: {}  y.shape: {}\n".format(x.shape, y.shape)
        elif can_check_destination:
            expected_rows = x.shape[1] if len(x.shape) == 2 else 1
            expected_cols = y.shape[1] if len(y.shape) == 2 else 1
            expected_shape = (expected_rows, expected_cols)
            if destination.shape != expected_shape:
                error_msg += "x, y, and destination provided, therefore destination must have number of rows matching number of columns of x and destination needs to have number of columns matching number of columns of y - x.shape: {}  y.shape: {}  expected_shape: {}  destination.shape: {}\n".format(
                    x.shape, y.shape, expected_shape, destination.shape)

    if error_msg != "":
        raise CmapPyMathFastCovInvalidInputXY(error_msg)


def nan_fast_cov(x, y=None, destination=None):
    """calculate the covariance matrix (ignoring nan values) for the columns of x (MxN), or optionally, the covariance matrix between the
    columns of x and and the columns of y (MxP).  (In the language of statistics, the columns are variables, the rows
    are observations).

    Args:
        x (numpy array-like) MxN in shape
        y (numpy array-like) MxP in shape
        destination (numpy masked array-like) optional location where to store the results as they are calculated (e.g. a numpy
            memmap of a file)

    returns (numpy array-like) array of the covariance values
        for defaults (y=None), shape is NxN
        if y is provided, shape is NxP
    """
    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))

    if y is None:
        y_masked = x_masked
    else:
        y_masked = numpy.ma.array(y, mask=numpy.isnan(y))

    dest_was_None = False
    if destination is None:
        num_rows = x_masked.shape[1] if len(x_masked.shape) == 2 else 1
        num_cols = y_masked.shape[1] if len(y_masked.shape) == 2 else 1
        destination = numpy.ma.zeros((num_rows, num_cols))
        dest_was_None = True

    r = _fast_cov(numpy.nanmean, _nan_dot_divide, x_masked, y_masked, destination)

    # infinite entries are reported as nan
    r[numpy.isinf(r)] = numpy.nan

    # when this function allocated the masked destination itself, hand back a plain
    # array with nan in the masked positions; a caller-supplied destination is returned as-is
    r = numpy.ma.filled(r, fill_value=numpy.nan) if dest_was_None else r

    return r


class CmapPyMathFastCovInvalidInputXY(Exception):
    """raised by validate_inputs when x, y or destination are not usable together"""
    pass
rounding_precision = 4


def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
    ''' Robustly z-score a pandas df along the rows.

    The median replaces the mean and the median absolute deviation (MAD)
    replaces the standard deviation of an ordinary z-score.

    Args:
        mat (pandas df): Matrix of data that z-scoring will be applied to
        ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
            (e.g. vehicle control); when omitted they are computed from mat
        min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause
            z-scores to blow up

    Returns:
        zscore_df (pandas_df): z-scored data, rounded to rounding_precision decimals
    '''
    # The per-row statistics come from the control matrix when one is given
    reference = mat if ctrl_mat is None else ctrl_mat

    row_medians = reference.median(axis=1)
    row_mads = reference.subtract(row_medians, axis=0).abs().median(axis=1)

    # Threshold mads
    row_mads = row_mads.clip(lower=min_mad)

    centered = mat.subtract(row_medians, axis='index')

    # Must multiply values by 1.4826 to make MAD comparable to SD
    # (https://en.wikipedia.org/wiki/Median_absolute_deviation)
    zscore_df = centered.divide(row_mads * 1.4826, axis='index')

    return zscore_df.round(rounding_precision)
class TestAggWtAvg(unittest.TestCase):
    """Unit tests for agg_wt_avg, using the module-level test_mat / test_mat_corr fixtures."""

    def test_calculate_weights(self):
        # happy path
        raw_weights, weights = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.1)
        # compare the length itself; len(weights == 3) only measured a boolean series
        self.assertEqual(len(weights), 3)
        self.assertEqual(raw_weights.tolist(), [0.8183, 0.7202, 0.8838])
        self.assertEqual(weights.tolist(), [0.3378, 0.2973, 0.3649])

        # test that min_wt works
        raw_weights2, weights2 = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.85)
        # positional access is explicit because the series is labelled 'A', 'B', 'C'
        self.assertEqual(raw_weights2.iloc[1], 0.85)

    def test_get_upper_triangle(self):
        # happy path
        upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr)
        self.assertEqual(upper_tri_df['corr'].tolist(), [0.6547, 0.982, 0.7857])
        self.assertEqual(upper_tri_df['rid'].tolist(), ['B', 'C', 'C'])
        self.assertEqual(upper_tri_df['index'].tolist(), ['A', 'A', 'B'])

    def test_agg_wt_avg(self):
        # use spearman
        out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat)
        self.assertEqual(out_sig.tolist(), [3.125, 5.75, 6.0])
        self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5)
        self.assertAlmostEqual(raw_weights.iloc[0], 0.75)
        self.assertAlmostEqual(weights.iloc[0], 0.375)

        # test on a single signature
        out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]])
        # pd.testing is the public home of the assert_* helpers
        pd.testing.assert_frame_equal(out_sig2, test_mat[["C"]])

        # should break if empty input
        with self.assertRaises(AssertionError) as e:
            agg_wt_avg.agg_wt_avg(test_mat[[]])
        self.assertIn("mat is empty!", str(e.exception))
class TestRobustZscore(unittest.TestCase):
    """Unit tests for robust_zscore, using the module-level test_mat / test_ctl_mat fixtures."""

    def test_zscore_pc(self):
        # plate control: medians and MADs come from test_mat itself
        pc_zscores = robust_zscore.robust_zscore(test_mat)
        self.assertEqual(pc_zscores.shape, (3, 4))

        # pd.testing is the public home of the assert_* helpers
        pd.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
            {'A': [-0.3372, -0.6745, -0.4047],
             'B': [-1.6862, 2.0235, 0.4047],
             'C': [1.0117, 0.6745, 1.2141],
             'D': [0.3372, -0.6745, -0.9443]}))

    def test_zscore_vc(self):
        # vehicle control: medians and MADs come from the control matrix
        vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
        self.assertEqual(vc_zscores.shape, (3, 4))
        pd.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
            {'A': [-4.7214, -3.3725, -20.2347],
             'B': [-7.4194, 0.6745, 0.0],
             'C': [-2.0235, -1.349, 20.2347],
             'D': [-3.3725, -3.3725, -33.7245]}))

        # check that min_mad works
        vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
        self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
        self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
-------------------------------------------------------------------------------- 1 | pandasGEXpress library 2 | ====================== 3 | 4 | This is a package of Python scripts that enable reading, writing, and 5 | basic modifications (subsetting, concatenation) of .gct and .gctx files. 6 | 7 | Installation instructions and documentation can be found `on the package's ReadTheDocs page `_. 8 | 9 | Questions/issues 10 | ====================== 11 | 12 | Please add an issue to the cmapPy repository. We would appreciate if your issue included sample code/files (as appropriate) so that we can reproduce your bug/issue. 13 | 14 | Contributing 15 | ====================== 16 | 17 | We welcome contributors! For your pull requests, please include the following: 18 | 19 | * Sample code/file that reproducibly causes the bug/issue 20 | * Documented code providing fix 21 | * Unit tests evaluating added/modified methods. 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/__init__.py -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/concat_gctoo.py: -------------------------------------------------------------------------------- 1 | msg = "concat_gctoo.py is deprecated. Please use concat.py instead." 2 | raise(DeprecationWarning(msg)) -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/diff_gctoo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | diff_gctoo.py 3 | 4 | Converts a matrix of values (e.g. gene expression, viability, etc.) into a 5 | matrix of differential values. 
def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
               diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10):
    ''' Converts a matrix of values (e.g. gene expression, viability, etc.)
    into a matrix of differential values.

    Args:
        gctoo (GCToo object): its data_df is made differential; its
            col_metadata_df is consulted only when plate_control is False
        plate_control (bool): True means calculate differential data using
            plate control (all samples). False means vehicle control
            (only the samples whose group_field equals group_val).
        group_field (string): Metadata field in which to find group_val
        group_val (string): Value in group_field that indicates use in vehicle control
        diff_method (string): Method of computing differential data; currently only
            support either "robust_z" or "median_norm"
        upper_diff_thresh (float): Maximum value for diff data
        lower_diff_thresh (float): Minimum value for diff data

    Returns:
        out_gctoo (GCToo object): GCToo with differential data values and the
            row/column metadata of the input

    Raises:
        AssertionError: if diff_method is not supported, or (vehicle control
            only) if group_field is not a column-metadata field or no sample
            has group_val in that field
    '''
    assert diff_method in possible_diff_methods, (
        "possible_diff_methods: {}, diff_method: {}".format(
            possible_diff_methods, diff_method))

    # Compute median and MAD using all samples in the dataset
    if plate_control:

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df)

        elif diff_method == "median_norm":
            medians = gctoo.data_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Compute median and MAD from negative controls, rather than all samples
    else:

        assert group_field in gctoo.col_metadata_df.columns.values, (
            "group_field {} not present in column metadata. " +
            "gctoo.col_metadata_df.columns.values: {}").format(
                group_field, gctoo.col_metadata_df.columns.values)

        is_neg_ctl = gctoo.col_metadata_df[group_field] == group_val
        assert is_neg_ctl.any(), (
            "group_val {} not present in the {} column.").format(
                group_val, group_field)

        # Find negative control samples
        neg_ctl_samples = gctoo.col_metadata_df.index[is_neg_ctl]
        neg_ctl_df = gctoo.data_df[neg_ctl_samples]

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df)

        elif diff_method == "median_norm":
            # The per-row reference must come from the negative controls only;
            # taking it from all samples would silently turn vehicle control
            # into plate control.
            medians = neg_ctl_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Threshold differential data before returning
    diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh)

    # Construct output GCToo object
    out_gctoo = GCToo.GCToo(data_df=diff_data,
                            row_metadata_df=gctoo.row_metadata_df,
                            col_metadata_df=gctoo.col_metadata_df)

    return out_gctoo
def build_parser():
    """Assemble the command-line argument parser for the gct -> gctx converter."""
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The input file is the only mandatory argument
    cli.add_argument(
        "-filename", "-f",
        required=True,
        help=".gct file that you would like to convert to .gctx")

    # Everything below is optional
    output_help = "out path/name for output gctx file. Default is just to modify the extension"
    cli.add_argument("-output_filepath", "-o", default=None, help=output_help)
    cli.add_argument("-verbose", "-v", action="store_true", default=False,
                     help="Whether to print a bunch of output.")
    cli.add_argument("-row_annot_path", help="Path to annotations file for rows")
    cli.add_argument("-col_annot_path", help="Path to annotations file for columns")

    return cli


def main():
    """Command-line entry point: parse sys.argv, configure logging, convert."""
    parsed = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=parsed.verbose)
    gct2gctx_main(parsed)


def gct2gctx_main(args):
    """Convert the .gct file named by args.filename into a .gctx file.

    Kept separate from main() so the conversion can be driven with an
    already-parsed argument namespace.

    Args:
        args: namespace with the attributes declared in build_parser()
            (filename, output_filepath, row_annot_path, col_annot_path)

    Raises:
        AssertionError: if an annotations file is supplied but lacks an id
            that is present in the data matrix
    """
    gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    # Without an explicit output path, reuse the input's basename (directory
    # component dropped) and swap the extension.
    out_name = args.output_filepath
    if out_name is None:
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gctx"

    # Optional row annotations: tab-separated, ids in the first column
    if args.row_annot_path is not None:
        row_annots = pd.read_csv(args.row_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(gctoo.data_df.index.isin(row_annots.index)), \
            "Row ids in matrix missing from annotations file"
        keep_rows = row_annots.index.isin(gctoo.data_df.index)
        gctoo.row_metadata_df = row_annots.loc[keep_rows]

    # Optional column annotations: same layout as the row annotations
    if args.col_annot_path is not None:
        col_annots = pd.read_csv(args.col_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(gctoo.data_df.columns.isin(col_annots.index)), \
            "Column ids in matrix missing from annotations file"
        keep_cols = col_annots.index.isin(gctoo.data_df.columns)
        gctoo.col_metadata_df = col_annots.loc[keep_cols]

    write_gctx.write(gctoo, out_name)
def build_parser():
    """Assemble the command-line argument parser for the gctx -> gct converter."""
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The input file is the only mandatory argument
    cli.add_argument(
        "-filename", "-f",
        required=True,
        help=".gctx file that you would like to converted to .gct")

    # Everything below is optional
    output_help = "out path/name for output gct file. Default is just to modify the extension"
    cli.add_argument("-output_filepath", "-o", default=None, help=output_help)
    cli.add_argument("-verbose", "-v", action="store_true", default=False,
                     help="Whether to print a bunch of output.")
    cli.add_argument("-row_annot_path", help="Path to annotations file for rows")
    cli.add_argument("-col_annot_path", help="Path to annotations file for columns")

    return cli


def main():
    """Command-line entry point: parse sys.argv, configure logging, convert."""
    parsed = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=parsed.verbose)
    gctx2gct_main(parsed)


def gctx2gct_main(args):
    """Convert the .gctx file named by args.filename into a .gct file.

    Kept separate from main() so the conversion can be driven with an
    already-parsed argument namespace.

    Args:
        args: namespace with the attributes declared in build_parser()
            (filename, output_filepath, row_annot_path, col_annot_path)

    Raises:
        AssertionError: if an annotations file is supplied but lacks an id
            that is present in the data matrix
    """
    gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    # Without an explicit output path, reuse the input's basename (directory
    # component dropped) and swap the extension.
    out_name = args.output_filepath
    if out_name is None:
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gct"

    # Optional row annotations: tab-separated, ids in the first column
    if args.row_annot_path is not None:
        row_annots = pd.read_csv(args.row_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(gctoo.data_df.index.isin(row_annots.index)), \
            "Row ids in matrix missing from annotations file"
        keep_rows = row_annots.index.isin(gctoo.data_df.index)
        gctoo.row_metadata_df = row_annots.loc[keep_rows]

    # Optional column annotations: same layout as the row annotations
    if args.col_annot_path is not None:
        col_annots = pd.read_csv(args.col_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(gctoo.data_df.columns.isin(col_annots.index)), \
            "Column ids in matrix missing from annotations file"
        keep_cols = col_annots.index.isin(gctoo.data_df.columns)
        gctoo.col_metadata_df = col_annots.loc[keep_cols]

    write_gct.write(gctoo, out_name)
3 | 4 | ex: 5 | import mini_gctoo_for testing 6 | my_mini_gctoo = mini_gctoo_for_testing.make() 7 | """ 8 | import logging 9 | import pandas 10 | import numpy 11 | import cmapPy.pandasGEXpress.GCToo as GCToo 12 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 13 | 14 | __author__ = 'Oana Enache' 15 | __email__ = 'oana@broadinstitute.org' 16 | 17 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 18 | 19 | 20 | def make(convert_neg_666=True): 21 | """ 22 | Creates a small GCToo instance (with representative examples of typically found fields); can use for testing. 23 | """ 24 | # metadata examples; should be one of each type reasonable to find 25 | id_vals = ["LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33", "MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33", 26 | "LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10", "LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10", 27 | "LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666", "LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10"] 28 | count_cv = ["14|15|14", "13|14|13", 29 | "13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13", 30 | "13", "13", "14"] 31 | distil_ss = [9.822065353, 6.8915205, 1.35840559, 5.548898697, 3.355231762, 4.837643147] 32 | zmad_ref = ["population", "population", "population", "population", "population", "population"] 33 | distil_nsample = [3, 3, 66, 2, 9, 111111] 34 | mfc_plate_id = ["-666", "-666", "-666", "-666", "-666", "-666"] 35 | 36 | # build metadata dataframe 37 | mini_meta_dict = {} 38 | mini_meta_dict["id"] = id_vals 39 | mini_meta_dict["count_cv"] = count_cv 40 | mini_meta_dict["distil_ss"] = distil_ss 41 | mini_meta_dict["zmad_ref"] = zmad_ref 42 | mini_meta_dict["distil_nsample"] = distil_nsample 43 | mini_meta_dict["mfc_plate_id"] = mfc_plate_id 44 | mini_row_metadata = pandas.DataFrame(mini_meta_dict, 45 | columns=['id', 'count_cv', 'distil_nsample', 
'distil_ss', 'mfc_plate_id', 'zmad_ref']) 46 | 47 | if convert_neg_666: 48 | mini_row_metadata = mini_row_metadata.replace([-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan]) 49 | # if all values in a column are nanpandas.Series(mini_row_metadata.isna().sum() == mini_row_metadata.shape[0]) convert dtype of that column to float 50 | all_nan_columns = (mini_row_metadata.isnull().sum() == numpy.array(mini_row_metadata.shape[0])).to_numpy().nonzero()[0] 51 | mini_row_metadata = mini_row_metadata.astype({d: 'float' for d in mini_row_metadata.columns[all_nan_columns.tolist()]}) 52 | else: 53 | mini_row_metadata = mini_row_metadata.replace([-666, -666.0], ["-666", "-666"]) 54 | 55 | # for now (at least) col and row metadata are the same 56 | mini_col_metadata = mini_row_metadata.copy() 57 | 58 | # data example values 59 | r1 = [1, 2, 3, 4, 5, 6] 60 | r2 = [4.3, 4.5, 4.3, 4.3, 4.3, 4.3] 61 | r3 = [7, 8, 9, 0, 1.23476, 9.758320] 62 | r4 = [0.11, 3.3456356, 2.345667, 9.822065353, 4.78865099, 4.7886] 63 | r5 = [-0.11, -3.3456356, -2.345667, -9.822065353, -4.78865099, -4.7886] 64 | r6 = [1, -2, 3, -4, 5, -6] 65 | 66 | # build data dataframe 67 | mini_data_mat = pandas.DataFrame([r1, r2, r3, r4, r5, r6], dtype=numpy.float32) 68 | mini_data_mat.index = id_vals 69 | mini_data_mat.columns = id_vals 70 | 71 | # instantiate & assign attributes of GCToo instance 72 | mini_version = "GCTX1.0" 73 | mini_src = "mini_gctoo.gctx" 74 | 75 | mini_row_metadata_df = mini_row_metadata 76 | mini_row_metadata_df.set_index("id", inplace=True, drop=True) 77 | mini_row_metadata.index.name = "rid" 78 | mini_row_metadata_df.columns.name = "rhd" 79 | 80 | mini_col_metadata_df = mini_col_metadata 81 | mini_col_metadata_df.set_index("id", inplace=True, drop=True) 82 | mini_col_metadata.index.name = "cid" 83 | mini_col_metadata_df.columns.name = "chd" 84 | 85 | mini_data_df = mini_data_mat 86 | mini_data_df.index.name = "rid" 87 | mini_data_df.columns.name = "cid" 88 | 89 | logger.debug("Making 
def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=None,
          row_meta_only=False, col_meta_only=False, make_multiindex=False,
          gct_data_type=numpy.float32):
    """
    Dispatches to the .gctx or .gct parser based on the extension of file_path.

    A path ending in ".gctx" is handed to parse_gctx.parse; any other path
    (".gct" or an unknown extension) is handed to parse_gct.parse.

    Input:
        Mandatory:
        - file_path (str): full path to gct(x) file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
            (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep. Default=None.
        - cid (list of strings): list of col ids to specifically keep. Default=None.
        - ridx (list of integers): only read the rows corresponding to this
            list of integer ids. Default=None.
        - cidx (list of integers): only read the columns corresponding to this
            list of integer ids. Default=None.
        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
            as pandas DataFrame. Default = False.
        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
            as pandas DataFrame. Default = False.
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs. Default = False.
        - gct_data_type (numpy datatype): forwarded (as data_type) only to the gct
            parser; what data type the matrix should be converted into.
            Default = numpy.float32.

    Output:
        - out (GCToo object or pandas df): if row_meta_only or col_meta_only, then
            out is a metadata df; otherwise, it's a GCToo instance containing
            content of parsed gct(x) file

    Note: why does convert_neg_666 exist?
        - In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
        for metadata. However (so that users can take full advantage of pandas' methods,
        including those for filtering nan's etc) we provide the option of converting these
        into numpy.NaN values, the pandas default.
    """
    # Options understood by both underlying parsers
    shared_options = dict(
        convert_neg_666=convert_neg_666,
        rid=rid, cid=cid, ridx=ridx, cidx=cidx,
        row_meta_only=row_meta_only, col_meta_only=col_meta_only,
        make_multiindex=make_multiindex)

    if file_path.endswith(".gctx"):
        return parse_gctx.parse(file_path, **shared_options)

    # Anything that is not .gctx is attempted as gct
    if file_path.endswith(".gct"):
        logger.info("parsing gct file")
    else:
        logger.info("parsing file of unknown extension, assuming/trying gct format")

    return parse_gct.parse(file_path, data_type=gct_data_type, **shared_options)
# Name of the logger that is configured by setup() when logging to a file
LOGGER_NAME = "cmap_logger"

# Message layout and rotation limits used by setup()
_LOG_FORMAT = "%(levelname)s %(asctime)s %(module)s %(funcName)s %(message)s"
_LOG_FILE_MAX_BYTES = 10000000
_LOG_FILE_BACKUP_COUNT = 5


def setup(verbose=False, log_file=None):
    """Configure logging for command-line tools.

    Args:
        verbose (bool): log at DEBUG level if True, otherwise at INFO level
        log_file (str or None): if None, logging is configured through
            logging.basicConfig; otherwise a rotating file handler writing to
            this path is attached to the LOGGER_NAME logger
    """
    cmap_logger = logging.getLogger(LOGGER_NAME)

    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO

    if log_file is not None:
        # File logging: the level and handler go on the named logger itself
        cmap_logger.setLevel(level)
        file_handler = logging.handlers.RotatingFileHandler(
            log_file,
            maxBytes=_LOG_FILE_MAX_BYTES,
            backupCount=_LOG_FILE_BACKUP_COUNT)
        file_handler.setFormatter(logging.Formatter(fmt=_LOG_FORMAT))
        cmap_logger.addHandler(file_handler)
        return

    # No file requested: fall back to logging's basic configuration
    logging.basicConfig(level=level, format=_LOG_FORMAT)
def build_parser():
    """Build argument parser."""

    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Required args
    parser.add_argument("--in_path", "-i", required=True,
                        help="file path to input GCT(x) file")

    # Optional args
    parser.add_argument("--rid", nargs="+", help="filepath to grp file or string array for including rows")
    parser.add_argument("--cid", nargs="+", help="filepath to grp file or string array for including cols")
    parser.add_argument("--exclude_rid", "-er", nargs="+", help="filepath to grp file or string array for excluding rows")
    parser.add_argument("--exclude_cid", "-ec", nargs="+", help="filepath to grp file or string array for excluding cols")
    parser.add_argument("--out_name", "-o", default="ds_subsetted.gct",
                        help="what to name the output file")
    parser.add_argument("--out_type", default="gct", choices=["gct", "gctx"],
                        help="whether to write output as GCT or GCTx")
    parser.add_argument("--verbose", "-v", action="store_true", default=False,
                        help="whether to increase the # of messages reported")

    return parser


def main():
    """Command-line entry point: parse sys.argv, configure logging, subset."""
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    subset_main(args)


def subset_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access.

    Args:
        args: namespace with the attributes declared in build_parser()

    Raises:
        Exception: if exclude_rid/exclude_cid is given for a non-.gct input
    """

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # If GCT, use subset_gctoo
    if args.in_path.endswith(".gct"):

        in_gct = parse_gct.parse(args.in_path)
        out_gct = sg.subset_gctoo(in_gct, rid=rid, cid=cid,
                                  exclude_rid=exclude_rid,
                                  exclude_cid=exclude_cid)

    # If GCTx, use parse_gctx
    else:

        if (exclude_rid is not None) or (exclude_cid is not None):
            msg = "exclude_{rid,cid} args not currently supported for parse_gctx."
            raise(Exception(msg))

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        out_gct = parse_gctx.parse(args.in_path, rid=rid, cid=cid)

    # Write the output in the requested format
    if args.out_type == "gctx":
        # The module-level alias `wgx` is bound to write_gct, which would
        # emit a text GCT here; import the real GCTx writer locally so this
        # branch produces an actual .gctx file.
        import cmapPy.pandasGEXpress.write_gctx as write_gctx
        write_gctx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")


def _read_arg(arg):
    """
    If arg is a list with 1 element that corresponds to a valid file path, use
    set_io.grp to read the grp file. Otherwise, check that arg is a list of strings.

    Args:
        arg (list or None)

    Returns:
        arg_out (list or None): None if arg is None; otherwise a (possibly
            empty) list of strings

    Raises:
        AssertionError: if the result is not a list, or contains a non-string
    """

    # If arg is None, just return it back
    if arg is None:
        return None

    # If len(arg) == 1 and arg[0] is a valid filepath, read it as a grp file
    if len(arg) == 1 and os.path.exists(arg[0]):
        arg_out = grp.read(arg[0])
    else:
        arg_out = arg

    # Make sure that arg_out is a list of strings. Every element is checked
    # (not just the first), and an empty list passes instead of raising
    # IndexError.
    assert isinstance(arg_out, list), "arg_out must be a list."
    assert all(isinstance(item, str) for item in arg_out), "arg_out must be a list of strings."

    return arg_out
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/metadata_writer_test.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/both_metadata_example_n1476x978.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/both_metadata_example_n1476x978.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/col_meta_only_example_n355x355.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/col_meta_only_example_n355x355.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/row_meta_only_example_n2x1203.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/row_meta_only_example_n2x1203.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/tsne_n2x1203.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/tsne_n2x1203.gctx -------------------------------------------------------------------------------- 
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_data_matrix.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_data_matrix.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 6 6 5 5 3 | id count_cv distil_nsample distil_ss mfc_plate_id zmad_ref LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 4 | count_cv -666 -666 -666 -666 -666 14|15|14 13|14|13 13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13 13 13 14 5 | distil_nsample -666 -666 -666 -666 -666 3 3 66 2 9 111111 6 | distil_ss -666 -666 -666 -666 -666 9.822065353 6.8915205 1.35840559 5.548898697 3.355231762 4.837643147 7 | mfc_plate_id -666 -666 -666 -666 -666 -666 -666 -666 -666 -666 -666 8 | zmad_ref -666 -666 -666 -666 -666 population population population population population population 9 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 14|15|14 3 9.822065353 -666 population 1.0 2.0 3.0 4.0 5.0 6.0 10 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 13|14|13 3 6.8915205 -666 population 4.300000190734863 4.5 4.300000190734863 4.300000190734863 4.300000190734863 4.300000190734863 11 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 
13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13 66 1.35840559 -666 population 7.0 8.0 9.0 0.0 1.234760046005249 9.758319854736328 12 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 13 2 5.548898697 -666 population 0.10999999940395355 3.3456356525421143 2.3456668853759766 9.822065353393555 4.788650989532471 4.788599967956543 13 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 13 9 3.355231762 -666 population -0.10999999940395355 -3.3456356525421143 -2.3456668853759766 -9.822065353393555 -4.788650989532471 -4.788599967956543 14 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 14 111111 4.837643147 -666 population 1.0 -2.0 3.0 -4.0 5.0 -6.0 15 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 6 6 0 0 3 | id LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 4 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 1.0000 2.0000 3.0000 4.0000 5.0000 6.0000 5 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 4.3000 4.5000 4.3000 4.3000 4.3000 4.3000 6 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 7.0000 8.0000 9.0000 0.0000 1.2348 9.7583 7 | 
LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 0.1100 3.3456 2.3457 9.8221 4.7887 4.7886 8 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 -0.1100 -3.3456 -2.3457 -9.8221 -4.7887 -4.7886 9 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 1.0000 -2.0000 3.0000 -4.0000 5.0000 -6.0000 10 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/mini_gctx_with_metadata_n2x3.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctx_with_metadata_n2x3.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/older_version_v1_2.gct: -------------------------------------------------------------------------------- 1 | #1.2 2 | 5 3 3 | NAME Description DLBCL.205 DLBCL.206 DLBCL.232 4 | 1007_s_at U48705 /FEATURE=mRNA 280.53 271.48 113.57 5 | 1053_at M87338 /FEATURE= /DEFINITION=HUMA 32.13 91.6 117.43 6 | 117_at X5175 /FEATURE = cds 41.27 61.12 24.1 7 | 121_at blah blah blah 738.32 330.59 249.37 8 | 1320_at first/ second /ok 88.45 12.94 18.46 -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/row_meta_only_example_n2x1203.gctx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/row_meta_only_example_n2x1203.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt: -------------------------------------------------------------------------------- 1 | cid count_cv distil_nsample distil_ss mfc_plate_id zmad_ref 2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 14|15|14 3 9.822065353 -666 population 3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 13|14|13 3 6.8915205 -666 population 4 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13 66 1.35840559 -666 population 5 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 13 2 5.548898697 -666 population 6 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 13 9 3.355231762 -666 population 7 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 14 111111 4.837643147 -666 population 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_concat/test_main/a.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 2 2 1 0 3 | id rhd1 a b 4 | rid1 c 1.1 2.2 5 | rid2 d 3.3 4.4 6 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_concat/test_main/b.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 2 2 1 0 3 | id rhd1 g f 4 | rid1 e 1.1 2.2 5 | rid2 d 3.3 4.4 6 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_concat_gctoo_test_main_fake_empty_file.gct: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_concat_gctoo_test_main_fake_empty_file.gct -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_l1000_highprecision.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_l1000_highprecision.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merge_bottom.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 2 3 2 2 3 | id rhd1 rhd2 s1 s3 s2 4 | chd1 NA NA s1_1 s3_1 s2_1 5 | chd2 NA NA s1_2 s3_2 s2_2 6 | p5 p5_1 p5_2 0.5 NaN 0.1 7 | p4 p4_1 p4_2 0.3 0.9 0.8 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merge_left.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 4 3 2 3 3 | id rhd1 rhd2 s1 s2 s3 4 | chd1 NA NA s1_1 s2_1 s3_1 5 | chd2 NA NA s1_2 s2_2 s3_2 6 | chd3 NA NA s1_3 s2_3 s3_3 7 | p1 p1_1 p1_2 0.1 0.2 0.3 8 | p2 p2_1 p2_2 0.4 NaN 0.5 9 | p3 p3_1 p3_2 0.6 0.7 0.8 10 | p4 p4_1 p4_2 0.9 1.0 1.1 11 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merge_right.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 3 3 2 4 3 | id rhd1 rhd2 s6 s5 s4 4 | chd1 NA NA s6_1 s5_1 s4_1 5 | chd2 NA NA s6_2 s5_2 s4_2 6 | chd3 NA NA s6_3 s5_3 s4_3 7 | chd4 NA NA s6_4 s5_4 s4_4 8 | p1 p1_1 p1_2 1.1 1.2 1.3 9 | p3 p3_1 p3_2 1.4 
1.5 NaN 10 | p4 p4_1 p4_2 1.6 1.7 1.8 11 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merge_top.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 2 3 2 2 3 | id rhd1 rhd2 s1 s2 s3 4 | chd1 NA NA s1_1 s2_1 s3_1 5 | chd2 NA NA s1_2 s2_2 s3_2 6 | p1 p1_1 p1_2 0.1 0.2 0.3 7 | p2 p2_1 p2_2 0.4 NaN 0.5 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merged_left_right.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 4 6 2 4 3 | id rhd1 rhd2 s1 s2 s3 s4 s5 s6 4 | chd1 NA NA s1_1 s2_1 s3_1 s4_1 s5_1 s6_1 5 | chd2 NA NA s1_2 s2_2 s3_2 s4_2 s5_2 s6_2 6 | chd3 NA NA s1_3 s2_3 s3_3 s4_3 s5_3 s6_3 7 | chd4 NA NA NA NA NA s4_4 s5_4 s6_4 8 | p1 p1_1 p1_2 0.1 0.2 0.3 1.3 1.2 1.1 9 | p2 p2_1 p2_2 0.4 NaN 0.5 NaN NaN NaN 10 | p3 p3_1 p3_2 0.6 0.7 0.8 NaN 1.5 1.4 11 | p4 p4_1 p4_2 0.9 1.0 1.1 1.8 1.7 1.6 12 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_merged_top_bottom.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 4 3 2 2 3 | id rhd1 rhd2 s1 s2 s3 4 | chd1 NA NA s1_1 s2_1 s3_1 5 | chd2 NA NA s1_2 s2_2 s3_2 6 | p1 p1_1 p1_2 0.1 0.2 0.3 7 | p2 p2_1 p2_2 0.4 NaN 0.5 8 | p4 p4_1 p4_2 0.3 0.8 0.9 9 | p5 p5_1 p5_2 0.5 0.1 NaN 10 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_missing_colmeta.txt: -------------------------------------------------------------------------------- 1 | cid count_cv distil_nsample distil_ss mfc_plate_id zmad_ref 2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 14|15|14 3 9.822065353 -666 population 3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 13|14|13 3 6.8915205 -666 
population 4 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 13 2 5.548898697 -666 population 5 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 13 9 3.355231762 -666 population 6 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 14 111111 4.837643147 -666 population 7 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_missing_rowmeta.txt: -------------------------------------------------------------------------------- 1 | rid count_cv distil_nsample distil_ss mfc_plate_id zmad_ref 2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 14|15|14 3 9.822065353 -666 population 3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 13|14|13 3 6.8915205 -666 population 4 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 13 2 5.548898697 -666 population 5 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 13 9 3.355231762 -666 population 6 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 14 111111 4.837643147 -666 population 7 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gct_int_ids.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 3 2 1 1 3 | id rhd1 1 2 4 | chd1 -666 a b 5 | 3 e 5 7 6 | 11 f 13 17 7 | -3 c -7 -11 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gctx_rid_entrez_id.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gctx_rid_entrez_id.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt: -------------------------------------------------------------------------------- 1 | rid count_cv distil_nsample distil_ss mfc_plate_id 
zmad_ref 2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33 14|15|14 3 9.822065353 -666 population 3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33 13|14|13 3 6.8915205 -666 population 4 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10 13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13 66 1.35840559 -666 population 5 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10 13 2 5.548898697 -666 population 6 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666 13 9 3.355231762 -666 population 7 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10 14 111111 4.837643147 -666 population 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_subset_expected.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 2 3 2 2 3 | id rhd1 rhd2 d e g 4 | chd1 NA NA d1 e1 g1 5 | chd2 NA NA d2 e2 g2 6 | a a1 a2 1 2 5 7 | c c1 c2 19 23 31 8 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_subset_in.gct: -------------------------------------------------------------------------------- 1 | #1.3 2 | 3 4 2 2 3 | id rhd1 rhd2 d e f g 4 | chd1 NA NA d1 e1 f1 g1 5 | chd2 NA NA d2 e2 f2 g2 6 | a a1 a2 1 2 3 5 7 | b b1 b2 7 11 13 17 8 | c c1 c2 19 23 29 31 9 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_subset_rid.grp: -------------------------------------------------------------------------------- 1 | # used by test_subset.py 2 | a 3 | Bb 4 | c 5 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/test_v1point2_n5x10.gct: -------------------------------------------------------------------------------- 1 | #1.2 2 | 10 5 3 
| Name Description LJP005_A375_24H_X1_B19:A03 LJP005_A375_24H_X1_B19:A04 LJP005_A375_24H_X1_B19:A05 LJP005_A375_24H_X1_B19:A06 LJP005_A375_24H_X1_B19:A07 4 | 200814_at PSME1 11.3819 11.3336 11.4486 11.3117 11.6321 5 | 218597_s_at CISD1 10.445 10.445 10.3658 10.5809 11.0401 6 | 217140_s_at VDAC1 6.3682 5.9869 6.0089 6.9966 6.7862 7 | 209253_at SORBS3 8.1372 8.2499 8.4592 7.9091 7.5321 8 | 214404_x_at SPDEF 4.9227 5.1192 4.95 4.8193 6.0052 9 | 222103_at ATF1 7.9259 8.1555 8.0674 8.0616 8.7338 10 | 219888_at SPAG4 4.028 4.583 4.6234 4.4257 4.0465 11 | 207042_at E2F2 3.8934 4.1096 3.8643 4.7922 4.2392 12 | 201453_x_at RHEB 11.4787 11.6041 11.7341 11.5345 11.5706 13 | 203627_at IGF1R 7.6509 7.5775 7.4636 7.3899 8.1654 14 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/functional_tests/tsne_n2x1203.gctx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/tsne_n2x1203.gctx -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/python2_tests/__init__.py -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_diff_gctoo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 5 | import cmapPy.pandasGEXpress.GCToo as GCToo 6 | import cmapPy.pandasGEXpress.diff_gctoo as diff_gctoo 7 | 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 9 | 10 | 
test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 11 | 'D': [5,2,1], 'E':[8,8,6], 'F': [7,6,6]}) 12 | test_col_meta = pd.DataFrame( 13 | {'pert_type': ['trt_cp', 'trt_cp', 'trt_cp', 14 | 'trt_cp', 'ctl_vehicle', 'ctl_vehicle'], 15 | 'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']}, 16 | index=['A', 'B', 'C', 'D', 'E', 'F']) 17 | test_gctoo = GCToo.GCToo(data_df=test_mat, 18 | col_metadata_df=test_col_meta) 19 | 20 | 21 | class TestDifferential(unittest.TestCase): 22 | def test_diff_gctoo_pc(self): 23 | pc_zscores = diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, lower_diff_thresh=-2) 24 | self.assertTrue(pc_zscores.data_df.shape == (3, 6)) 25 | 26 | pd.util.testing.assert_frame_equal(pc_zscores.data_df, pd.DataFrame( 27 | {'A': [-0.6745, -0.9443, -1.349], 28 | 'C': [0.2248, -0.1349, 1.349], 29 | 'B': [-1.5738, 0.6745, 0.0], 'E': [1.1242, 0.6745, 0.0], 30 | 'D': [-0.2248, -0.9443, -2], # last val should be -2 bc of thresholding 31 | 'F': [0.6745, 0.1349, 0.0]})) 32 | 33 | # test diff_method assertion 34 | with self.assertRaises(AssertionError) as e: 35 | diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, diff_method="robust_zs") 36 | self.assertIn("diff_method: robust_zs", str(e.exception)) 37 | 38 | # test median norm 39 | pc_median_normed_df = diff_gctoo.diff_gctoo(test_gctoo, diff_method="median_norm") 40 | self.assertEqual(pc_median_normed_df.data_df.iloc[0, 0], -1.5) 41 | self.assertEqual(pc_median_normed_df.data_df.loc[2, "B"], 0) 42 | 43 | def test_diff_gctoo_vc(self): 44 | vc_zscores1 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False) 45 | vc_zscores2 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, 46 | group_field='pert_iname', 47 | group_val='DMSO') 48 | self.assertTrue(vc_zscores1.data_df.shape == (3, 6)) 49 | self.assertTrue(vc_zscores2.data_df.shape == (3, 6)) 50 | 51 | pd.util.testing.assert_frame_equal(vc_zscores1.data_df, pd.DataFrame( 52 | {'A': [-4.7214, -3.3725, -10.0], # check for 
thresholding 53 | 'C': [-2.0235, -1.349, 10.0], 54 | 'B': [-7.4194, 0.6745, 0.0], 55 | 'E': [0.6745, 0.6745, 0.0], 56 | 'D': [-3.3725, -3.3725, -10.0], 57 | 'F': [-0.6745, -0.6745, 0.0]})) 58 | 59 | pd.util.testing.assert_frame_equal(vc_zscores2.data_df, pd.DataFrame( 60 | {'A': [-2.0235, -0.6745, -0.3372], 61 | 'C': [0.6745, 0.6745, 0.6745], 62 | 'B': [-4.7214, 2.0235, 0.1686], 63 | 'E': [3.3725, 2.0235, 0.1686], 64 | 'D': [-0.6745, -0.6745, -0.6745], 65 | 'F': [2.0235, 1.1242, 0.1686]})) 66 | 67 | # test group_val assertion 68 | with self.assertRaises(AssertionError) as e: 69 | diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, group_val="dmso") 70 | self.assertIn("dmso not present", str(e.exception)) 71 | 72 | 73 | if __name__ == "__main__": 74 | setup_logger.setup(verbose=True) 75 | unittest.main() 76 | 77 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_gct2gctx.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import os 5 | import cmapPy.pandasGEXpress.gct2gctx as gct2gctx 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx 9 | 10 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 11 | 12 | 13 | class TestGCT2GCTx(unittest.TestCase): 14 | 15 | def test_gct2gctx_main(self): 16 | 17 | in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct" 18 | out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out.gctx" 19 | args_string = "-f {} -o {}".format(in_name, out_name) 20 | args = gct2gctx.build_parser().parse_args(args_string.split()) 21 | 22 | gct2gctx.gct2gctx_main(args) 23 | 24 | # Make sure the input is identical to output 25 | in_gct = parse_gct.parse(in_name) 26 | out_gctx = 
parse_gctx.parse(out_name) 27 | 28 | pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df) 29 | pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df) 30 | pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df) 31 | 32 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct" 33 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out_annotated.gctx" 34 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt" 35 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt" 36 | args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta) 37 | args = gct2gctx.build_parser().parse_args(args_string.split()) 38 | 39 | gct2gctx.gct2gctx_main(args) 40 | 41 | annotated_gctx = parse_gctx.parse(added_meta) 42 | 43 | # Check added annotations are the same as original input GCTX 44 | pd.util.testing.assert_frame_equal(in_gct.data_df, annotated_gctx.data_df, check_less_precise=3) 45 | pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, annotated_gctx.col_metadata_df) 46 | pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, annotated_gctx.row_metadata_df) 47 | 48 | # Clean up 49 | os.remove(out_name) 50 | os.remove(added_meta) 51 | 52 | def test_missing_annotations(self): 53 | with self.assertRaises(Exception) as context: 54 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct" 55 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx" 56 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt" 57 | args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta) 58 | args = gct2gctx.build_parser().parse_args(args_string.split()) 59 | 60 | gct2gctx.gct2gctx_main(args) 61 | 62 | self.assertTrue('Row ids in matrix 
missing from annotations file' in context.exception) 63 | 64 | with self.assertRaises(Exception) as context: 65 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct" 66 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx" 67 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt" 68 | args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta) 69 | args = gct2gctx.build_parser().parse_args(args_string.split()) 70 | 71 | gct2gctx.gct2gctx_main(args) 72 | 73 | self.assertTrue('Column ids in matrix missing from annotations file' in context.exception) 74 | 75 | 76 | if __name__ == "__main__": 77 | setup_logger.setup(verbose=True) 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_gctx2gct.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import os 5 | import cmapPy.pandasGEXpress.gctx2gct as gctx2gct 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx 9 | 10 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 11 | 12 | 13 | class TestGCTx2GCT(unittest.TestCase): 14 | 15 | def test_gctx2gct_main(self): 16 | 17 | in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx" 18 | out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out.gct" 19 | args_string = "-f {} -o {}".format(in_name, out_name) 20 | args = gctx2gct.build_parser().parse_args(args_string.split()) 21 | 22 | gctx2gct.gctx2gct_main(args) 23 | 24 | # Make sure the input is identical to output 25 | in_gctx = parse_gctx.parse(in_name) 26 | out_gct = parse_gct.parse(out_name) 27 | 28 | 
pd.util.testing.assert_frame_equal(in_gctx.data_df, out_gct.data_df, check_less_precise=3) 29 | pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df) 30 | pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df) 31 | 32 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx" 33 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct" 34 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt" 35 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt" 36 | args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta ) 37 | args = gctx2gct.build_parser().parse_args(args_string.split()) 38 | 39 | gctx2gct.gctx2gct_main(args) 40 | 41 | annotated_gct = parse_gct.parse(added_meta) 42 | 43 | # Check added annotations are the same as original input GCTX 44 | pd.util.testing.assert_frame_equal(in_gctx.data_df, annotated_gct.data_df, check_less_precise=3) 45 | pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, annotated_gct.col_metadata_df) 46 | pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, annotated_gct.row_metadata_df) 47 | 48 | # Clean up 49 | os.remove(out_name) 50 | os.remove(added_meta) 51 | 52 | def test_missing_annotations(self): 53 | with self.assertRaises(Exception) as context: 54 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx" 55 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct" 56 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt" 57 | args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta) 58 | args = gctx2gct.build_parser().parse_args(args_string.split()) 59 | 60 | gctx2gct.gctx2gct_main(args) 61 | 62 | self.assertTrue('Row ids in matrix missing from 
annotations file' in context.exception) 63 | 64 | with self.assertRaises(Exception) as context: 65 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx" 66 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct" 67 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt" 68 | args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta) 69 | args = gctx2gct.build_parser().parse_args(args_string.split()) 70 | 71 | gctx2gct.gctx2gct_main(args) 72 | 73 | self.assertTrue('Column ids in matrix missing from annotations file' in context.exception) 74 | 75 | 76 | if __name__ == "__main__": 77 | setup_logger.setup(verbose=True) 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_parse.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 4 | import unittest 5 | import pandas.util.testing as pandas_testing 6 | import cmapPy.pandasGEXpress.subset_gctoo as subset_gctoo 7 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing 8 | import cmapPy.pandasGEXpress.parse as parse 9 | 10 | __author__ = "Oana Enache" 11 | __email__ = "oana@broadinstitute.org" 12 | 13 | FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/" 14 | 15 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 16 | 17 | class TestParse(unittest.TestCase): 18 | def test_gctx_parsing(self): 19 | # parse in gctx, no other arguments 20 | mg1 = mini_gctoo_for_testing.make() 21 | mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx") 22 | 23 | pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df) 24 | pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df) 25 | 
pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 26 | 27 | # check convert_neg_666 worked correctly 28 | self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all()) 29 | 30 | # parse w/o convert_neg_666 31 | mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", convert_neg_666 = False) 32 | self.assertFalse(mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all()) 33 | 34 | # parsing w/rids & cids specified 35 | test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'] 36 | test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10'] 37 | mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids) 38 | mg4 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", 39 | rid=test_rids, cid=test_cids) 40 | pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df) 41 | pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df) 42 | pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df) 43 | 44 | # parsing w/ridx & cidx specified 45 | mg5 = subset_gctoo.subset_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'], 46 | cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']) 47 | mg6 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4]) 48 | 49 | pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df) 50 | pandas_testing.assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df) 51 | pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df) 52 | 53 | # parsing row metadata only 54 | mg7 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", row_meta_only=True) 55 | pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df) 56 | 57 | # parsing col metadata only 58 | mg8 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", 
col_meta_only=True) 59 | pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df) 60 | 61 | # parsing w/multiindex 62 | mg9 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", make_multiindex=True) 63 | self.assertTrue(mg9.multi_index_df is not None) 64 | 65 | def test_gct_parsing(self): 66 | # parse in gct, no other arguments 67 | mg1 = mini_gctoo_for_testing.make() 68 | mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct") 69 | 70 | pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df) 71 | pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df) 72 | pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 73 | 74 | # check convert_neg_666 worked correctly 75 | self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all()) 76 | 77 | # parse w/o convert_neg_666 78 | mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct", convert_neg_666 = False) 79 | self.assertItemsEqual(mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(), 80 | [-666] * 6) 81 | 82 | # parse in gct with subsetting 83 | my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33" 84 | mg3 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct", 85 | cidx=[0, 2], rid=[my_rid]) 86 | 87 | self.assertEqual(mg3.data_df.shape, (1, 2)) 88 | self.assertItemsEqual(mg3.data_df.values.flatten().tolist(), [1., 3.]) 89 | self.assertEqual(mg3.row_metadata_df.index[0], my_rid) 90 | 91 | if __name__ == "__main__": 92 | setup_logger.setup(verbose=True) 93 | unittest.main() 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_random_slice.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as 
setup_logger 4 | import cmapPy.pandasGEXpress.random_slice as random_slice 5 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing 6 | 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 8 | 9 | 10 | class TestRandomSlice(unittest.TestCase): 11 | def test_make_specified_size_gctoo(self): 12 | mini_gctoo = mini_gctoo_for_testing.make() 13 | logger.debug("mini gctoo data_df shape: {}".format(mini_gctoo.data_df.shape)) 14 | logger.debug("mini gctoo row_meta shape: {}".format(mini_gctoo.row_metadata_df.shape)) 15 | logger.debug("mini gctoo col_meta shape: {}".format(mini_gctoo.col_metadata_df.shape)) 16 | 17 | # case 1: dim isn't 'row' or 'col' 18 | with self.assertRaises(AssertionError) as context: 19 | random_slice.make_specified_size_gctoo(mini_gctoo, 3, "aaaalll") 20 | self.assertEqual(str(context.exception), "dim specified must be either 'row' or 'col'") 21 | 22 | # case 2: row subsetting - happy 23 | row_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "row") 24 | self.assertEqual(row_subset.data_df.shape, (3, 6), 25 | "data_df after row slice is incorrect shape: {} vs (3,6)".format(row_subset.data_df.shape)) 26 | self.assertEqual(row_subset.row_metadata_df.shape, (3, 5), 27 | "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format( 28 | row_subset.row_metadata_df.shape)) 29 | self.assertEqual(row_subset.col_metadata_df.shape, (6, 5), 30 | "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format( 31 | row_subset.col_metadata_df.shape)) 32 | 33 | # case 3: row subsetting - sample subset > og # of samples 34 | with self.assertRaises(AssertionError) as context: 35 | random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row") 36 | self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception)) 37 | 38 | # case 4: col subsetting - happy 39 | col_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "col") 40 | 
self.assertEqual(col_subset.data_df.shape, (6, 3), 41 | "data_df after col slice is incorrect shape: {} vs (6,3)".format(col_subset.data_df.shape)) 42 | self.assertEqual(col_subset.row_metadata_df.shape, (6, 5), 43 | "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format( 44 | col_subset.row_metadata_df.shape)) 45 | self.assertEqual(col_subset.col_metadata_df.shape, (3, 5), 46 | "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format( 47 | col_subset.col_metadata_df.shape)) 48 | 49 | # case 5: col subsetting - sample subset > og # of samples 50 | with self.assertRaises(AssertionError) as context: 51 | random_slice.make_specified_size_gctoo(mini_gctoo, 7, "col") 52 | self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception)) 53 | 54 | 55 | if __name__ == "__main__": 56 | setup_logger.setup(verbose=True) 57 | 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_subset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import os 4 | import pandas as pd 5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 6 | import cmapPy.pandasGEXpress.parse as parse 7 | import cmapPy.pandasGEXpress.subset as sg 8 | 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 10 | 11 | 12 | class TestSubset(unittest.TestCase): 13 | 14 | def test_read_arg(self): 15 | arg_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp") 16 | rids = sg._read_arg([arg_path]) 17 | self.assertItemsEqual(rids, ["a", "Bb", "c"]) 18 | 19 | def test_read_arg_bad(self): 20 | with self.assertRaises(AssertionError) as e: 21 | sg._read_arg("a b c") 22 | self.assertIn("arg_out must be a list", str(e.exception)) 23 | 24 | with self.assertRaises(AssertionError) as e: 25 | sg._read_arg([1, 2, 3]) 26 | 
self.assertIn("arg_out must be a list of strings", str(e.exception)) 27 | 28 | def test_subset_main(self): 29 | 30 | in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct") 31 | rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp") 32 | out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct") 33 | expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct") 34 | 35 | args_string = "-i {} --rid {} -ec {} -o {}".format( 36 | in_gct_path, rid_grp_path, "f", out_name) 37 | args = sg.build_parser().parse_args(args_string.split()) 38 | 39 | # Run main method 40 | sg.subset_main(args) 41 | 42 | # Compare output to expected 43 | out_gct = parse.parse(out_name) 44 | expected_gct = parse.parse(expected_out_path) 45 | 46 | pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df) 47 | pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df) 48 | pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df) 49 | 50 | # Clean up 51 | os.remove(out_name) 52 | 53 | # gctx with exclude_rid should fail 54 | args_string2 = "-i {} --rid {} -ec {} -o {}".format( 55 | "FAKE.gctx", rid_grp_path, "f", out_name) 56 | args2 = sg.build_parser().parse_args(args_string2.split()) 57 | 58 | with self.assertRaises(Exception) as e: 59 | sg.subset_main(args2) 60 | self.assertIn("exclude_{rid,cid} args not currently supported", 61 | str(e.exception)) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python2_tests/test_subset_gctoo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as 
setup_logger 5 | import cmapPy.pandasGEXpress.GCToo as GCToo 6 | import cmapPy.pandasGEXpress.subset_gctoo as sg 7 | 8 | 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 10 | 11 | 12 | class TestSubsetGCToo(unittest.TestCase): 13 | 14 | @classmethod 15 | def setUpClass(cls): 16 | data_df = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]], 17 | index=["a", "b", "c", "d"], columns=["e", "f", "g"]) 18 | row_metadata_df = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]], 19 | index=["a", "b", "c", "d"], columns=["rhd1", "rh2"]) 20 | col_metadata_df = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]], 21 | index=["e", "f", "g"], columns=["chd1", "chd2"]) 22 | cls.in_gct = GCToo.GCToo(data_df, row_metadata_df, col_metadata_df) 23 | 24 | def test_subset_gctoo(self): 25 | 26 | # Error if resulting GCT is empty 27 | with self.assertRaises(AssertionError) as e: 28 | sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"]) 29 | self.assertIn("Subsetting yielded an", str(e.exception)) 30 | 31 | # cid and col_bool should not both be provided 32 | with self.assertRaises(AssertionError) as e: 33 | sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"], col_bool=[True, True, False]) 34 | self.assertIn("Only one of cid,", str(e.exception)) 35 | 36 | # Providing all 3 row inputs is also bad! 
37 | with self.assertRaises(AssertionError) as e: 38 | sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!") 39 | self.assertIn("Only one of rid,", str(e.exception)) 40 | 41 | # happy path 42 | out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0], 43 | exclude_rid=["a"]) 44 | pd.util.testing.assert_frame_equal(out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]]) 45 | 46 | def test_get_rows_to_keep(self): 47 | 48 | # rid must be a list 49 | with self.assertRaises(AssertionError) as e: 50 | sg.get_rows_to_keep(self.in_gct, rid="bad") 51 | self.assertIn("rid must be a list", str(e.exception)) 52 | 53 | # bools 54 | out_rows = sg.get_rows_to_keep(self.in_gct, row_bool=[True, True, True, False]) 55 | self.assertItemsEqual(out_rows, ["a", "b", "c"]) 56 | 57 | # rid and exclude_rid 58 | out_rows2 = sg.get_rows_to_keep(self.in_gct, rid=["a", "c", "d"], exclude_rid=["d"]) 59 | self.assertItemsEqual(out_rows2, ["a", "c"]) 60 | 61 | # keep all rows 62 | out_rows3 = sg.get_rows_to_keep(self.in_gct) 63 | self.assertItemsEqual(out_rows3, ["a", "b", "c", "d"]) 64 | 65 | with self.assertRaises(AssertionError) as e: 66 | sg.get_rows_to_keep(self.in_gct, row_bool=[True, False, True]) 67 | self.assertIn("row_bool must have length", str(e.exception)) 68 | 69 | with self.assertRaises(AssertionError) as e: 70 | sg.get_rows_to_keep(self.in_gct, ridx=[True, False, True]) 71 | self.assertIn("ridx must be a list of integers", str(e.exception)) 72 | 73 | with self.assertRaises(AssertionError) as e: 74 | sg.get_rows_to_keep(self.in_gct, ridx=[0, 2, 5]) 75 | self.assertIn("ridx contains an integer", str(e.exception)) 76 | 77 | def test_get_cols_to_keep(self): 78 | # N.B. 
annoying that we have two extremely similar but separate methods 79 | # for rows and columns, but I think it's worth it to have clear error 80 | # messages 81 | 82 | # cid must be a list 83 | with self.assertRaises(AssertionError) as e: 84 | sg.get_cols_to_keep(self.in_gct, cid="real_bad") 85 | self.assertIn("cid must be a list", str(e.exception)) 86 | 87 | # bools 88 | out_cols = sg.get_cols_to_keep(self.in_gct, col_bool=[False, True, True]) 89 | self.assertItemsEqual(out_cols, ["f", "g"]) 90 | 91 | # cid and exclude_cid 92 | out_cols2 = sg.get_cols_to_keep(self.in_gct, cid=["g", "e", "f"], exclude_cid=["f"], cidx=None) 93 | self.assertItemsEqual(out_cols2, ["g", "e"]) 94 | 95 | # keep all cols 96 | out_cols3 = sg.get_cols_to_keep(self.in_gct) 97 | self.assertItemsEqual(out_cols3, ["e", "f", "g"]) 98 | 99 | with self.assertRaises(AssertionError) as e: 100 | sg.get_cols_to_keep(self.in_gct, col_bool=[True, False, True, True]) 101 | self.assertIn("col_bool must have length", str(e.exception)) 102 | 103 | with self.assertRaises(AssertionError) as e: 104 | sg.get_cols_to_keep(self.in_gct, cidx=[True, False, True]) 105 | self.assertIn("cidx must be a list of integers", str(e.exception)) 106 | 107 | with self.assertRaises(AssertionError) as e: 108 | sg.get_cols_to_keep(self.in_gct, cidx=[10]) 109 | self.assertIn("cidx contains an integer", str(e.exception)) 110 | 111 | if __name__ == '__main__': 112 | setup_logger.setup(verbose=True) 113 | unittest.main() 114 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/python3_tests/__init__.py -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_diff_gctoo.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 5 | import cmapPy.pandasGEXpress.GCToo as GCToo 6 | import cmapPy.pandasGEXpress.diff_gctoo as diff_gctoo 7 | 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 9 | 10 | test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 11 | 'D': [5,2,1], 'E':[8,8,6], 'F': [7,6,6]}, 12 | columns=['A','C','B','E','D','F']) 13 | test_col_meta = pd.DataFrame( 14 | {'pert_type': ['trt_cp', 'trt_cp', 'trt_cp', 15 | 'trt_cp', 'ctl_vehicle', 'ctl_vehicle'], 16 | 'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']}, 17 | index=['A', 'B', 'C', 'D', 'E', 'F']) 18 | test_gctoo = GCToo.GCToo(data_df=test_mat, 19 | col_metadata_df=test_col_meta) 20 | 21 | 22 | class TestDifferential(unittest.TestCase): 23 | def test_diff_gctoo_pc(self): 24 | pc_zscores = diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, lower_diff_thresh=-2) 25 | self.assertTrue(pc_zscores.data_df.shape == (3, 6)) 26 | 27 | pd.testing.assert_frame_equal(pc_zscores.data_df, pd.DataFrame( 28 | {'A': [-0.6745, -0.9443, -1.349], 29 | 'C': [0.2248, -0.1349, 1.349], 30 | 'B': [-1.5738, 0.6745, 0.0], 'E': [1.1242, 0.6745, 0.0], 31 | 'D': [-0.2248, -0.9443, -2], # last val should be -2 bc of thresholding 32 | 'F': [0.6745, 0.1349, 0.0]}, 33 | columns=['A', 'C', 'B', 'E', 'D', 'F'])) 34 | 35 | # test diff_method assertion 36 | with self.assertRaises(AssertionError) as e: 37 | diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, diff_method="robust_zs") 38 | self.assertIn("diff_method: robust_zs", str(e.exception)) 39 | 40 | # test median norm 41 | pc_median_normed_df = diff_gctoo.diff_gctoo(test_gctoo, diff_method="median_norm") 42 | self.assertEqual(pc_median_normed_df.data_df.iloc[0, 0], -1.5) 43 | self.assertEqual(pc_median_normed_df.data_df.loc[2, "B"], 0) 44 | 45 | def 
test_diff_gctoo_vc(self): 46 | vc_zscores1 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False) 47 | vc_zscores2 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, 48 | group_field='pert_iname', 49 | group_val='DMSO') 50 | self.assertTrue(vc_zscores1.data_df.shape == (3, 6)) 51 | self.assertTrue(vc_zscores2.data_df.shape == (3, 6)) 52 | 53 | pd.testing.assert_frame_equal(vc_zscores1.data_df, pd.DataFrame( 54 | {'A': [-4.7214, -3.3725, -10.0], # check for thresholding 55 | 'C': [-2.0235, -1.349, 10.0], 56 | 'B': [-7.4194, 0.6745, 0.0], 57 | 'E': [0.6745, 0.6745, 0.0], 58 | 'D': [-3.3725, -3.3725, -10.0], 59 | 'F': [-0.6745, -0.6745, 0.0]}, 60 | columns=['A', 'C', 'B', 'E', 'D', 'F'])) 61 | 62 | pd.testing.assert_frame_equal(vc_zscores2.data_df, pd.DataFrame( 63 | {'A': [-2.0235, -0.6745, -0.3372], 64 | 'C': [0.6745, 0.6745, 0.6745], 65 | 'B': [-4.7214, 2.0235, 0.1686], 66 | 'E': [3.3725, 2.0235, 0.1686], 67 | 'D': [-0.6745, -0.6745, -0.6745], 68 | 'F': [2.0235, 1.1242, 0.1686]}, 69 | columns=['A', 'C', 'B', 'E', 'D', 'F'])) 70 | 71 | # test group_val assertion 72 | with self.assertRaises(AssertionError) as e: 73 | diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, group_val="dmso") 74 | self.assertIn("dmso not present", str(e.exception)) 75 | 76 | 77 | if __name__ == "__main__": 78 | setup_logger.setup(verbose=True) 79 | unittest.main() 80 | 81 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_gct2gctx.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import os 5 | import cmapPy.pandasGEXpress.gct2gctx as gct2gctx 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx 9 | 10 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 11 | 12 | 13 | 
class TestGCT2GCTx(unittest.TestCase): 14 | 15 | def test_gct2gctx_main(self): 16 | 17 | in_name = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct" 18 | out_name = "cmapPy/pandasGEXpress/tests/functional_tests/test_gct2gctx_out.gctx" 19 | args_string = "-f {} -o {}".format(in_name, out_name) 20 | args = gct2gctx.build_parser().parse_args(args_string.split()) 21 | 22 | gct2gctx.gct2gctx_main(args) 23 | 24 | # Make sure the input is identical to output 25 | in_gct = parse_gct.parse(in_name) 26 | out_gctx = parse_gctx.parse(out_name) 27 | 28 | pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df) 29 | pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df) 30 | pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df) 31 | 32 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gct" 33 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gct2gctx_out_annotated.gctx" 34 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt" 35 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt" 36 | args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta) 37 | args = gct2gctx.build_parser().parse_args(args_string.split()) 38 | 39 | gct2gctx.gct2gctx_main(args) 40 | 41 | annotated_gctx = parse_gctx.parse(added_meta) 42 | 43 | # Check added annotations are the same as original input GCTX 44 | pd.util.testing.assert_frame_equal(in_gct.data_df, annotated_gctx.data_df, check_less_precise=3) 45 | pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, annotated_gctx.col_metadata_df) 46 | pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, annotated_gctx.row_metadata_df) 47 | 48 | # Clean up 49 | os.remove(out_name) 50 | os.remove(added_meta) 51 | 52 | def test_missing_annotations(self): 53 | with self.assertRaises(Exception) as 
context: 54 | no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct" 55 | added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx" 56 | row_meta = "../functional_tests/test_missing_rowmeta.txt" 57 | args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta) 58 | args = gct2gctx.build_parser().parse_args(args_string.split()) 59 | 60 | gct2gctx.gct2gctx_main(args) 61 | 62 | self.assertTrue('Row ids in matrix missing from annotations file', context.exception) 63 | 64 | with self.assertRaises(Exception) as context: 65 | no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct" 66 | added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx" 67 | col_meta = "../functional_tests/test_missing_colmeta.txt" 68 | args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta) 69 | args = gct2gctx.build_parser().parse_args(args_string.split()) 70 | 71 | gct2gctx.gct2gctx_main(args) 72 | 73 | self.assertTrue('Column ids in matrix missing from annotations file', context.exception) 74 | 75 | 76 | if __name__ == "__main__": 77 | setup_logger.setup(verbose=True) 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_gctx2gct.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import pandas as pd 4 | import os 5 | import cmapPy.pandasGEXpress.gctx2gct as gctx2gct 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx 9 | 10 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 11 | 12 | 13 | class TestGCTx2GCT(unittest.TestCase): 14 | 15 | def test_gctx2gct_main(self): 16 | 17 | in_name = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx" 18 | out_name = 
"cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out.gct" 19 | args_string = "-f {} -o {}".format(in_name, out_name) 20 | args = gctx2gct.build_parser().parse_args(args_string.split()) 21 | 22 | gctx2gct.gctx2gct_main(args) 23 | 24 | # Make sure the input is identical to output 25 | in_gctx = parse_gctx.parse(in_name) 26 | out_gct = parse_gct.parse(out_name) 27 | 28 | pd.util.testing.assert_frame_equal(in_gctx.data_df, out_gct.data_df, check_less_precise=3) 29 | pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df) 30 | pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df) 31 | 32 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx" 33 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct" 34 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt" 35 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt" 36 | args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta ) 37 | args = gctx2gct.build_parser().parse_args(args_string.split()) 38 | 39 | gctx2gct.gctx2gct_main(args) 40 | 41 | annotated_gct = parse_gct.parse(added_meta) 42 | 43 | # Check added annotations are the same as original input GCTX 44 | pd.util.testing.assert_frame_equal(in_gctx.data_df, annotated_gct.data_df, check_less_precise=3) 45 | pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, annotated_gct.col_metadata_df) 46 | pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, annotated_gct.row_metadata_df) 47 | 48 | # Clean up 49 | os.remove(out_name) 50 | os.remove(added_meta) 51 | 52 | def test_missing_annotations(self): 53 | with self.assertRaises(Exception) as context: 54 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx" 55 | added_meta = 
"cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct" 56 | row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_missing_rowmeta.txt" 57 | args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta) 58 | args = gctx2gct.build_parser().parse_args(args_string.split()) 59 | 60 | gctx2gct.gctx2gct_main(args) 61 | 62 | print(context.exception) 63 | self.assertTrue('Row ids in matrix missing from annotations file', context.exception) 64 | 65 | with self.assertRaises(Exception) as context: 66 | no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx" 67 | added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct" 68 | col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_missing_colmeta.txt" 69 | args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta) 70 | args = gctx2gct.build_parser().parse_args(args_string.split()) 71 | 72 | gctx2gct.gctx2gct_main(args) 73 | 74 | self.assertTrue('Column ids in matrix missing from annotations file', context.exception) 75 | 76 | 77 | if __name__ == "__main__": 78 | setup_logger.setup(verbose=True) 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_parse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 3 | import unittest 4 | import pandas.util.testing as pandas_testing 5 | import cmapPy.pandasGEXpress.subset_gctoo as subset_gctoo 6 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing 7 | import cmapPy.pandasGEXpress.parse as parse 8 | 9 | __author__ = "Oana Enache" 10 | __email__ = "oana@broadinstitute.org" 11 | 12 | FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/" 13 | 14 | logger = 
logging.getLogger(setup_logger.LOGGER_NAME) 15 | 16 | class TestParse(unittest.TestCase): 17 | def test_gctx_parsing(self): 18 | # parse in gctx, no other arguments 19 | mg1 = mini_gctoo_for_testing.make() 20 | mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx") 21 | 22 | pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df) 23 | pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df) 24 | pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 25 | 26 | # check convert_neg_666 worked correctly 27 | self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all()) 28 | 29 | # parse w/o convert_neg_666 30 | mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", convert_neg_666 = False) 31 | self.assertFalse(mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all()) 32 | 33 | # parsing w/rids & cids specified 34 | test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'] 35 | test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10'] 36 | mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids) 37 | mg4 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", 38 | rid=test_rids, cid=test_cids) 39 | pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df) 40 | pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df) 41 | pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df) 42 | 43 | # parsing w/ridx & cidx specified 44 | mg5 = subset_gctoo.subset_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'], 45 | cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']) 46 | mg6 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4]) 47 | 48 | pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df) 49 | pandas_testing.assert_frame_equal(mg5.row_metadata_df, 
mg6.row_metadata_df) 50 | pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df) 51 | 52 | # parsing row metadata only 53 | mg7 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", row_meta_only=True) 54 | pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df) 55 | 56 | # parsing col metadata only 57 | mg8 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", col_meta_only=True) 58 | pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df) 59 | 60 | # parsing w/multiindex 61 | mg9 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", make_multiindex=True) 62 | self.assertTrue(mg9.multi_index_df is not None) 63 | 64 | def test_gct_parsing(self): 65 | # parse in gct, no other arguments 66 | mg1 = mini_gctoo_for_testing.make() 67 | mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct") 68 | 69 | pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df) 70 | pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df) 71 | pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 72 | 73 | # check convert_neg_666 worked correctly 74 | self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all()) 75 | 76 | # parse w/o convert_neg_666 77 | mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct", convert_neg_666 = False) 78 | self.assertCountEqual(mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(), 79 | [-666] * 6) 80 | 81 | # parse in gct with subsetting 82 | my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33" 83 | mg3 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct", 84 | cidx=[0, 2], rid=[my_rid]) 85 | 86 | self.assertEqual(mg3.data_df.shape, (1, 2)) 87 | self.assertCountEqual(mg3.data_df.values.flatten().tolist(), [1., 3.]) 88 | 
self.assertEqual(mg3.row_metadata_df.index[0], my_rid) 89 | 90 | if __name__ == "__main__": 91 | setup_logger.setup(verbose=True) 92 | unittest.main() 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_random_slice.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 4 | import cmapPy.pandasGEXpress.random_slice as random_slice 5 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing 6 | 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 8 | 9 | 10 | class TestRandomSlice(unittest.TestCase): 11 | def test_make_specified_size_gctoo(self): 12 | mini_gctoo = mini_gctoo_for_testing.make() 13 | logger.debug("mini gctoo data_df shape: {}".format(mini_gctoo.data_df.shape)) 14 | logger.debug("mini gctoo row_meta shape: {}".format(mini_gctoo.row_metadata_df.shape)) 15 | logger.debug("mini gctoo col_meta shape: {}".format(mini_gctoo.col_metadata_df.shape)) 16 | 17 | # case 1: dim isn't 'row' or 'col' 18 | with self.assertRaises(AssertionError) as context: 19 | random_slice.make_specified_size_gctoo(mini_gctoo, 3, "aaaalll") 20 | self.assertEqual(str(context.exception), "dim specified must be either 'row' or 'col'") 21 | 22 | # case 2: row subsetting - happy 23 | row_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "row") 24 | self.assertEqual(row_subset.data_df.shape, (3, 6), 25 | "data_df after row slice is incorrect shape: {} vs (3,6)".format(row_subset.data_df.shape)) 26 | self.assertEqual(row_subset.row_metadata_df.shape, (3, 5), 27 | "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format( 28 | row_subset.row_metadata_df.shape)) 29 | self.assertEqual(row_subset.col_metadata_df.shape, (6, 5), 30 | "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format( 31 | 
row_subset.col_metadata_df.shape)) 32 | 33 | # case 3: row subsetting - sample subset > og # of samples 34 | with self.assertRaises(AssertionError) as context: 35 | random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row") 36 | self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception)) 37 | 38 | # case 4: col subsetting - happy 39 | col_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "col") 40 | self.assertEqual(col_subset.data_df.shape, (6, 3), 41 | "data_df after col slice is incorrect shape: {} vs (6,3)".format(col_subset.data_df.shape)) 42 | self.assertEqual(col_subset.row_metadata_df.shape, (6, 5), 43 | "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format( 44 | col_subset.row_metadata_df.shape)) 45 | self.assertEqual(col_subset.col_metadata_df.shape, (3, 5), 46 | "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format( 47 | col_subset.col_metadata_df.shape)) 48 | 49 | # case 5: col subsetting - sample subset > og # of samples 50 | with self.assertRaises(AssertionError) as context: 51 | random_slice.make_specified_size_gctoo(mini_gctoo, 7, "col") 52 | self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception)) 53 | 54 | 55 | if __name__ == "__main__": 56 | setup_logger.setup(verbose=True) 57 | 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /cmapPy/pandasGEXpress/tests/python3_tests/test_subset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import os 4 | import pandas as pd 5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 6 | import cmapPy.pandasGEXpress.parse as parse 7 | import cmapPy.pandasGEXpress.subset as sg 8 | 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME) 10 | 11 | 12 | class TestSubset(unittest.TestCase): 13 | 14 | def 
test_read_arg(self): 15 | arg_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp") 16 | rids = sg._read_arg([arg_path]) 17 | self.assertCountEqual(rids, ["a", "Bb", "c"]) 18 | 19 | def test_read_arg_bad(self): 20 | with self.assertRaises(AssertionError) as e: 21 | sg._read_arg("a b c") 22 | self.assertIn("arg_out must be a list", str(e.exception)) 23 | 24 | with self.assertRaises(AssertionError) as e: 25 | sg._read_arg([1, 2, 3]) 26 | self.assertIn("arg_out must be a list of strings", str(e.exception)) 27 | 28 | def test_subset_main(self): 29 | 30 | in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct") 31 | rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp") 32 | out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct") 33 | expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct") 34 | 35 | args_string = "-i {} --rid {} -ec {} -o {}".format( 36 | in_gct_path, rid_grp_path, "f", out_name) 37 | args = sg.build_parser().parse_args(args_string.split()) 38 | 39 | # Run main method 40 | sg.subset_main(args) 41 | 42 | # Compare output to expected 43 | out_gct = parse.parse(out_name) 44 | expected_gct = parse.parse(expected_out_path) 45 | 46 | pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df) 47 | pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df) 48 | pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df) 49 | 50 | # Clean up 51 | os.remove(out_name) 52 | 53 | # gctx with exclude_rid should fail 54 | args_string2 = "-i {} --rid {} -ec {} -o {}".format( 55 | "FAKE.gctx", rid_grp_path, "f", out_name) 56 | args2 = sg.build_parser().parse_args(args_string2.split()) 57 | 58 | with self.assertRaises(Exception) as e: 59 | sg.subset_main(args2) 60 | 
class TestSubsetGCToo(unittest.TestCase):
    """Unit tests for subset_gctoo and its row/column selection helpers."""

    @classmethod
    def setUpClass(cls):
        # One small 4x3 GCToo (rows a-d, columns e-g) shared by every test
        data_df = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
                               index=["a", "b", "c", "d"], columns=["e", "f", "g"])
        row_metadata_df = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
                                       index=["a", "b", "c", "d"], columns=["rhd1", "rh2"])
        col_metadata_df = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
                                       index=["e", "f", "g"], columns=["chd1", "chd2"])
        cls.in_gct = GCToo.GCToo(data_df, row_metadata_df, col_metadata_df)

    def test_subset_gctoo(self):
        """Check the argument validation and one happy path of subset_gctoo."""

        # Error if resulting GCT is empty
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"])
        self.assertIn("Subsetting yielded an", str(e.exception))

        # cid and col_bool should not both be provided
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"], col_bool=[True, True, False])
        self.assertIn("Only one of cid,", str(e.exception))

        # Providing all 3 row inputs is also bad!
        with self.assertRaises(AssertionError) as e:
            sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!")
        self.assertIn("Only one of rid,", str(e.exception))

        # happy path
        out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0],
                                exclude_rid=["a"])
        # pd.testing is the public home of assert_frame_equal; the older
        # pd.util.testing namespace is not available in current pandas
        pd.testing.assert_frame_equal(out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]])

    def test_get_rows_to_keep(self):
        """Check every supported way of selecting rows, plus the error cases."""

        # rid must be a list
        with self.assertRaises(AssertionError) as e:
            sg.get_rows_to_keep(self.in_gct, rid="bad")
        self.assertIn("rid must be a list", str(e.exception))

        # bools
        out_rows = sg.get_rows_to_keep(self.in_gct, row_bool=[True, True, True, False])
        self.assertCountEqual(out_rows, ["a", "b", "c"])

        # rid and exclude_rid
        out_rows2 = sg.get_rows_to_keep(self.in_gct, rid=["a", "c", "d"], exclude_rid=["d"])
        self.assertCountEqual(out_rows2, ["a", "c"])

        # keep all rows
        out_rows3 = sg.get_rows_to_keep(self.in_gct)
        self.assertCountEqual(out_rows3, ["a", "b", "c", "d"])

        with self.assertRaises(AssertionError) as e:
            sg.get_rows_to_keep(self.in_gct, row_bool=[True, False, True])
        self.assertIn("row_bool must have length", str(e.exception))

        with self.assertRaises(AssertionError) as e:
            sg.get_rows_to_keep(self.in_gct, ridx=[True, False, True])
        self.assertIn("ridx must be a list of integers", str(e.exception))

        with self.assertRaises(AssertionError) as e:
            sg.get_rows_to_keep(self.in_gct, ridx=[0, 2, 5])
        self.assertIn("ridx contains an integer", str(e.exception))

    def test_get_cols_to_keep(self):
        """Check every supported way of selecting columns, plus the error cases."""
        # N.B. annoying that we have two extremely similar but separate methods
        # for rows and columns, but I think it's worth it to have clear error
        # messages

        # cid must be a list
        with self.assertRaises(AssertionError) as e:
            sg.get_cols_to_keep(self.in_gct, cid="real_bad")
        self.assertIn("cid must be a list", str(e.exception))

        # bools
        out_cols = sg.get_cols_to_keep(self.in_gct, col_bool=[False, True, True])
        self.assertCountEqual(out_cols, ["f", "g"])

        # cid and exclude_cid
        out_cols2 = sg.get_cols_to_keep(self.in_gct, cid=["g", "e", "f"], exclude_cid=["f"], cidx=None)
        self.assertCountEqual(out_cols2, ["g", "e"])

        # keep all cols
        out_cols3 = sg.get_cols_to_keep(self.in_gct)
        self.assertCountEqual(out_cols3, ["e", "f", "g"])

        with self.assertRaises(AssertionError) as e:
            sg.get_cols_to_keep(self.in_gct, col_bool=[True, False, True, True])
        self.assertIn("col_bool must have length", str(e.exception))

        with self.assertRaises(AssertionError) as e:
            sg.get_cols_to_keep(self.in_gct, cidx=[True, False, True])
        self.assertIn("cidx must be a list of integers", str(e.exception))

        with self.assertRaises(AssertionError) as e:
            sg.get_cols_to_keep(self.in_gct, cidx=[10])
        self.assertIn("cidx contains an integer", str(e.exception))
class TestSubset(unittest.TestCase):
    """Tests for the transformations in transform_gctoo."""

    def test_transpose(self):
        """Transposing swaps the data axes and exchanges row and column metadata."""
        data_df = pandas.DataFrame({"a":range(2,5), "b":range(7,10)})
        logger.debug("happy path - data_df:\n{}".format(data_df))

        row_metadata_df = pandas.DataFrame({"rm1":range(3)}, index=data_df.index)
        logger.debug("row_metadata_df:\n{}".format(row_metadata_df))

        col_metadata_df = pandas.DataFrame({"cm1":range(2), "cm2":range(3,5)}, index=data_df.columns)
        logger.debug("col_metadata_df:\n{}".format(col_metadata_df))

        my_gctoo = GCToo.GCToo(data_df, row_metadata_df=row_metadata_df, col_metadata_df=col_metadata_df)
        logger.debug("my_gctoo:\n{}".format(my_gctoo))

        r = tg.transpose(my_gctoo)
        logger.debug("result r:\n{}".format(r))

        logger.debug("r.data_df:\n{}".format(r.data_df))
        self.assertTrue(data_df.equals(r.data_df.T))

        # Log the result's row metadata (the value under test), not the input frame
        logger.debug("r.row_metadata_df:\n{}".format(r.row_metadata_df))
        self.assertTrue(col_metadata_df.equals(r.row_metadata_df))

        logger.debug("r.col_metadata_df:\n{}".format(r.col_metadata_df))
        self.assertTrue(row_metadata_df.equals(r.col_metadata_df))
def write(gctoo, out_fname, data_null="NaN", metadata_null="-666", filler_null="-666", data_float_format="%.4f"):
    """Write a gctoo object to a gct file.

    The ".gct" extension is appended to out_fname when it is missing.

    Args:
        gctoo (gctoo object)
        out_fname (string): filename for output gct file
        data_null (string): how to represent missing values in the data (default = "NaN")
        metadata_null (string): how to represent missing values in the metadata (default = "-666")
        filler_null (string): what value to fill the top-left filler block with (default = "-666")
        data_float_format (string): how many decimal points to keep in representing data
            (default = 4 digits; None will keep all digits)

    Returns:
        None

    """
    if not out_fname.endswith(".gct"):
        out_fname += ".gct"

    # Second line of the file: number of data rows, data columns,
    # row-metadata fields and column-metadata fields
    dims = [str(gctoo.data_df.shape[0]), str(gctoo.data_df.shape[1]),
            str(gctoo.row_metadata_df.shape[1]), str(gctoo.col_metadata_df.shape[1])]

    # The context manager closes the handle even if one of the writers raises
    with open(out_fname, "w") as f:
        # Write first two lines
        write_version_and_dims(VERSION, dims, f)

        # Write top half of the gct
        write_top_half(f, gctoo.row_metadata_df, gctoo.col_metadata_df,
                       metadata_null, filler_null)

        # Write bottom half of the gct
        write_bottom_half(f, gctoo.row_metadata_df, gctoo.data_df,
                          data_null, data_float_format, metadata_null)

    logger.info("GCT has been written to {}".format(out_fname))
def write_bottom_half(f, row_metadata_df, data_df, data_null, data_float_format, metadata_null):
    """ Write the bottom half of the gct file: row metadata and data.

    Each output line holds the row id, then that row's metadata values,
    then that row's data values, separated by tabs.

    Args:
        f (file handle): handle for output file
        row_metadata_df (pandas df)
        data_df (pandas df)
        data_null (string): how to represent missing values in the data
        data_float_format (string): how many decimal points to keep in representing data
        metadata_null (string): how to represent missing values in the metadata

    Returns:
        None
    """
    num_rows, num_rhds = row_metadata_df.shape

    # Placeholder block: one column for the rids plus one per row metadata header
    placeholder = np.full((num_rows, 1 + num_rhds), metadata_null, dtype=object)
    left_side = pd.DataFrame(placeholder)

    # Put the data matrix to the right of the placeholder and relabel the
    # columns positionally so the blocks can be addressed by integer position
    right_side = data_df.reset_index(drop=True)
    combined = pd.concat([left_side, right_side], axis=1)
    combined.columns = range(combined.shape[1])

    # First column holds the rids
    combined.iloc[:, 0] = row_metadata_df.index.values

    # Row metadata goes in as strings, with "nan" swapped for the null marker
    stringified = row_metadata_df.astype(str).replace("nan", value=metadata_null)
    combined.iloc[:, range(1, 1 + num_rhds)] = stringified.values

    combined.to_csv(f, header=False, index=False, sep="\t",
                    na_rep=data_null,
                    float_format=data_float_format)
def read(file_path):
    """ Read a gmt file at the path specified by file_path.

    Every line must hold a set identifier, a description and then the set
    members, all tab-delimited. Empty members are dropped.

    Args:
        file_path (string): path to gmt file ("~" is expanded)

    Returns:
        gmt (GMT object): list of dicts, where each dict corresponds to one
            line of the GMT file

    """
    expanded_path = os.path.expanduser(file_path)
    with open(expanded_path, 'r') as handle:
        raw_lines = handle.readlines()

    gmt = []

    for line_num, raw_line in enumerate(raw_lines):
        fields = raw_line.split('\t')

        assert len(fields) > 2, (
            "Each line must have at least 3 tab-delimited items. " +
            "line_num: {}, fields: {}").format(line_num, fields)

        # Only the last field can carry the trailing newline/whitespace
        fields[-1] = fields[-1].rstrip()

        # Everything after identifier and description is a member; skip blanks
        entries = list(filter(None, fields[2:]))

        assert len(set(entries)) == len(entries), (
            "There should not be duplicate entries for the same set. " +
            "line_num: {}, entries: {}").format(line_num, entries)

        gmt.append({SET_IDENTIFIER_FIELD: fields[0],
                    SET_DESC_FIELD: fields[1],
                    SET_MEMBERS_FIELD: entries})

    verify_gmt_integrity(gmt)

    return gmt
def read(in_path):
    """ Read a grp file at the path specified by in_path.

    Lines beginning with "#" are treated as comments and skipped. Blank
    lines are skipped too, so the result never contains empty entries.
    Surrounding whitespace is stripped from every entry.

    Args:
        in_path (string): path to GRP file

    Returns:
        grp (list)

    Raises:
        AssertionError: if in_path does not exist

    """
    assert os.path.exists(in_path), "The following GRP file can't be found. in_path: {}".format(in_path)

    with open(in_path, "r") as f:
        # Drop comment lines first, then strip; a line from the file always
        # contains at least "\n", so emptiness has to be checked after stripping
        stripped = (line.strip() for line in f if not line.startswith("#"))
        grp = [entry for entry in stripped if entry]

    return grp
38 | 39 | Args: 40 | grp (list): GRP object to write to new-line delimited text file 41 | out_path (string): output path 42 | 43 | Returns: 44 | None 45 | 46 | """ 47 | with open(out_path, "w") as f: 48 | for x in grp: 49 | f.write(str(x) + "\n") -------------------------------------------------------------------------------- /cmapPy/set_io/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/set_io/tests/__init__.py -------------------------------------------------------------------------------- /cmapPy/set_io/tests/functional_tests/test.gmt: -------------------------------------------------------------------------------- 1 | A this one is A a1 a3 a2 2 | B this one is B b4 b2 b3 3 | -------------------------------------------------------------------------------- /cmapPy/set_io/tests/functional_tests/test.grp: -------------------------------------------------------------------------------- 1 | #a 2 | r 3 | d 4 | e 5 | #f 6 | -------------------------------------------------------------------------------- /cmapPy/set_io/tests/functional_tests/test_bad.gmt: -------------------------------------------------------------------------------- 1 | A this one is A a1 a3 a2 2 | B this one is B b4 b2 b3 3 | -------------------------------------------------------------------------------- /cmapPy/set_io/tests/functional_tests/test_bad2.gmt: -------------------------------------------------------------------------------- 1 | A this one is A a1 a3 a2 2 | B this one is B b4 b2 b2 3 | -------------------------------------------------------------------------------- /cmapPy/set_io/tests/test_gmt.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger 5 | import cmapPy.set_io.gmt as gmt 6 | 7 | 
class TestGMT(unittest.TestCase):
    """Tests for reading, validating and writing GMT files."""

    @classmethod
    def setUpClass(cls):
        # In-memory equivalent of the test.gmt fixture
        set_a = {"head": "A", "desc": "this one is A", "entry": ["a1", "a3", "a2"]}
        set_b = {"head": "B", "desc": "this one is B", "entry": ["b4", "b2", "b3"]}
        cls.example_gmt = [set_a, set_b]

    def test_read(self):
        parsed = gmt.read(os.path.join(FUNCTIONAL_TESTS_DIR, "test.gmt"))

        self.assertEqual(len(self.example_gmt), len(parsed))
        self.assertEqual(self.example_gmt[0], parsed[0])
        self.assertEqual(self.example_gmt[1], parsed[1])

        # Malformed fixtures, each paired with the message fragment it must trigger
        failure_cases = (
            ("test_bad.gmt", "3 tab-delimited items. line_num: 0"),
            ("test_bad2.gmt", "same set. line_num: 1"),
        )
        for file_name, fragment in failure_cases:
            with self.assertRaises(AssertionError) as ctx:
                gmt.read(os.path.join(FUNCTIONAL_TESTS_DIR, file_name))
            self.assertIn(fragment, str(ctx.exception))

    def test_verify_gmt_integrity(self):
        # Two sets sharing the identifier "A" must be rejected
        duplicated_ids = [
            {"head": "A", "desc": "blah", "entry": ["a1", "a3", "a2"]},
            {"head": "A", "desc": "blah", "entry": ["b4", "b2", "b3"]},
        ]

        with self.assertRaises(AssertionError) as ctx:
            gmt.verify_gmt_integrity(duplicated_ids)
        self.assertIn("Set identifiers should be unique", str(ctx.exception))

    def test_write(self):
        target = os.path.join(FUNCTIONAL_TESTS_DIR, "test_write.gmt")
        gmt.write(self.example_gmt, target)
        self.assertTrue(os.path.exists(target))

        # Round trip: what was written must read back unchanged
        round_tripped = gmt.read(target)
        self.assertEqual(len(self.example_gmt), len(round_tripped))
        self.assertEqual(self.example_gmt[0], round_tripped[0])
        self.assertEqual(self.example_gmt[1], round_tripped[1])

        # Cleanup
        os.remove(target)
def cohort_view_table(df,
                      category_label="category_label",
                      category_order="category_order",
                      flags=None,
                      flag_display_labels=None,
                      add_percentages=True):

    ''' Generate a DataFrame showing counts and percentages
    of subsets (defined by flags), stratified by categories.
    For instance, each row (category) may be a selectivity
    bucket, and each column can be the number of compounds in
    that bucket that passed a given threshold. A "Total"
    column shows the total number of compounds in each
    bucket and a grand total sums them all up.
    The input DataFrame is not modified.
    @param df: DataFrame where each row is a compound and
    columns are various metrics and flags
    @kwarg category_label: name of the column that defines
    a category. The data is stratified based on this field.
    @kwarg category_order: order in which the categories should
    be displayed as rows of the table. There should be a one
    to one correspondence between category_label and category_order.
    @kwarg flags: list of column names defining binary flags.
    These flags define subsets that will be counted and displayed
    as columns of the output table. Defaults to no flags.
    @kwarg flag_display_labels: string labels for output columns
    corresponding to flags. Defaults to no labels.
    @kwarg add_percentages: whether to display percentages
    alongside the counts.
    @return: DataFrame with one row per category plus a
    "Grand Total" row, and a "Total" column followed by one
    column per flag.
    '''
    # None stands in for "no flags" so that no list object is shared between calls
    flags = [] if flags is None else list(flags)
    flag_display_labels = [] if flag_display_labels is None else list(flag_display_labels)
    assert len(flags) == len(flag_display_labels), '"flags" and "flag_display_labels" should have the same length'

    # assign() returns a new frame, so the helper column never leaks
    # into the caller's DataFrame
    df = df.assign(Total=1)
    columns = ['Total'] + flags
    df = (
        df
        .groupby([category_order, category_label])[columns]
        .sum()
        .sort_index(axis=0, level=category_order)
        .reset_index(level=[category_order])
        .drop(columns=category_order)
    )

    column_names = ["Total"] + flag_display_labels
    df.columns = column_names
    df.index.names = ['Category']

    # Transpose so that categories are columns and can be summed into new columns
    df = df.T
    num_categories = len(df.columns)
    logger.info("num_categories: {}".format(num_categories))

    # Test compound fields
    cpd_fields = [c for c in df.columns if 'Test subset' in c]
    if len(cpd_fields) != 0:
        df['Test Compounds Total'] = df[cpd_fields].sum(1)
    # Only the original category columns contribute to the grand total
    df['Grand Total'] = df.iloc[:, :num_categories].sum(1)
    df = df.T
    df.index.name = None

    if add_percentages:
        df = df.transform(_add_row_percentages, axis=1)
    return df
def _add_row_percentages(s):
    '''Convert all columns except for "Total" to a string
    that shows the integer count as well as the percentage
    of Total within the row.
    @param s: Series of counts for one row; must contain a
    "Total" entry. It is not modified.
    @return: new Series in which every entry other than
    "Total" is a formatted string and "Total" keeps its value.
    '''
    # Adding 0 yields a numeric copy, leaving the caller's Series untouched
    counts = s + 0
    assert "Total" in counts.index
    total = counts['Total']

    # Object dtype lets the formatted strings sit next to the numeric Total
    # without assigning strings into a numeric Series
    out = counts.astype(object)
    for label, x in counts.items():
        if label == "Total":
            continue
        # NOTE(review): count and percentage are separated by a blank line,
        # as in the original literal; confirm this is the intended cell layout
        out[label] = "{:,d}\n\n({:.0%})".format(int(x), float(x) / total)
    return out
def scattergram(
        df, columns, column_names, title="",
        outfile='',
        fig_dpi=150,
        fontfamily="Roboto"
    ):
    '''
    Make a grid of scatterplots of a set of columns against each other.
    The values should all be "normalized", i.e., between 0 and 1.
    Rows with a missing value in any of the selected columns are dropped.
    @param df: Pandas DataFrame containing the variables to be scattered.
    @param columns: list of column names to plot.
    @param column_names: list of display names corresponding to the
    variable columns.
    @kwarg title: optional title; when given it is drawn together with
    the number of plotted rows.
    @kwarg outfile: optional path; when given the figure is saved there.
    @kwarg fig_dpi: resolution used when saving to outfile.
    @kwarg fontfamily: font used for the title and the tick labels.
    @return: g: Seaborn PairGrid object
    '''

    # Select first, then copy, so the caller's frame is never relabelled
    df = df[columns].copy()

    # rename the columns
    df.columns = column_names

    df = df.dropna()
    with sns.axes_style('ticks'):

        g = sns.PairGrid(
            data=df, vars=column_names,
            palette="Greys", despine=False,
            height=2
        )
        g.map_lower(
            plt.scatter,
            s=10,
            lw=0,
            alpha=0.5,
            color="#555555"
        )
        # "density" is the supported spelling of the normalisation flag;
        # current matplotlib no longer accepts "normed"
        g.map_diag(
            _plot_hist,
            **dict(
                density=True,
                alpha=0.5,
                bins=np.linspace(-0.00001, 1.00001, 21),
                histtype="bar",
                edgecolor="#ffffff"
            )
        )

    if title:
        g.fig.text(1, 1,
                   "{} (N = {:,})".format(title, len(df)),
                   fontsize=30,
                   fontname=fontfamily,
                   fontweight="bold",
                   horizontalalignment="right",
                   verticalalignment="top"
                   )

    plt.subplots_adjust(wspace=0, hspace=0.0)
    font_properties = dict(family=fontfamily, weight="bold")
    _adjust_axes(g, font_properties)
    _draw_row_labels(g, column_names)

    if outfile:
        plt.savefig(outfile, dpi=fig_dpi)
    return g
def _draw_row_labels(g, column_names):
    '''
    Write each variable's display name at the centre of its
    diagonal panel.
    @param g: grid object whose 2D "axes" array holds the panels.
    @param column_names: display names, one per diagonal panel.
    '''
    text_style = dict(
        horizontalalignment="center",
        verticalalignment="center",
        fontweight="bold",
        fontname="Roboto",
        fontsize=18,
        zorder=100,
    )
    panel_count = g.axes.shape[0]
    for k in range(panel_count):
        # Panel (k, k) is the diagonal cell belonging to variable k
        g.axes[k, k].annotate(column_names[k], (0.5, .5), **text_style)
155 | ''' 156 | return scattergram(*args, **kwargs) 157 | 158 | -------------------------------------------------------------------------------- /cmapPy/visualization/test_cohort_view.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 30, 2019 3 | 4 | @author: Navid Dianati 5 | @contact: navid@broadinstitute.org 6 | ''' 7 | import unittest 8 | 9 | import cohort_view 10 | import matplotlib.pyplot as plt 11 | import pandas as pd 12 | 13 | 14 | class Test(unittest.TestCase): 15 | 16 | def testCohortView(self): 17 | filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt" 18 | df = pd.read_csv(filename, sep="\t") 19 | 20 | df['is_reproducible'] = (df['cc_q75'] > 0.2) + 0 21 | df['is_high_mag'] = (df['mag_vi'] > 0.2) + 0 22 | flags = ['is_reproducible', 'is_high_mag'] 23 | column_names = ['Reproducible', 'magnitude'] 24 | table = cohort_view.cohort_view_table( 25 | df, 26 | category_label="category_label", 27 | category_order="category_order", 28 | flags=flags, 29 | flag_display_labels=column_names 30 | 31 | ) 32 | print(table) 33 | # plt.savefig("./test_files/cohort_view_test.html", dpi=150) 34 | 35 | 36 | if __name__ == "__main__": 37 | # import sys;sys.argv = ['', 'Test.testStratogram'] 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /cmapPy/visualization/test_scattergram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 30, 2019 3 | 4 | @author: Navid Dianati 5 | @contact: navid@broadinstitute.org 6 | ''' 7 | import unittest 8 | 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | import scattergram 12 | 13 | 14 | class Test(unittest.TestCase): 15 | 16 | def testScattergram1(self): 17 | filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt" 18 | df = pd.read_csv(filename, sep="\t") 19 | plot_columns = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi'] 
20 | column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude'] 21 | scattergram.scattergram( 22 | df, 23 | columns=plot_columns, 24 | column_names=column_names, 25 | title="This is a test" 26 | ) 27 | plt.show() 28 | 29 | def testScattergram2(self): 30 | filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt" 31 | df = pd.read_csv(filename, sep="\t") 32 | plot_columns = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi'] 33 | column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude'] 34 | scattergram.scattergram( 35 | df, 36 | columns=plot_columns, 37 | column_names=column_names, 38 | title="This is a test", 39 | outfile="./test_files/deleteme.png", 40 | fig_dpi=50 41 | ) 42 | 43 | 44 | if __name__ == "__main__": 45 | # import sys;sys.argv = ['', 'Test.testScattergram'] 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /cmapPy/visualization/test_stratogram.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Sep 30, 2019 3 | 4 | @author: Navid Dianati 5 | @contact: navid@broadinstitute.org 6 | ''' 7 | import unittest 8 | 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | import stratogram 12 | 13 | 14 | class Test(unittest.TestCase): 15 | 16 | def testStratogram(self): 17 | filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt" 18 | df = pd.read_csv(filename, sep="\t") 19 | metrics = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi'] 20 | column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude'] 21 | stratogram.stratogram( 22 | df, 23 | category_definition="category_label", 24 | category_label="category_label_abridged", 25 | category_order="category_order", 26 | metrics=metrics, 27 | figsize=(20, 15), 28 | column_display_names=column_names, 29 | xtick_orientation="horizontal", 30 | ylabel_fontsize=15, 31 | xlabel_fontsize=15, 32 | xlabel_fontcolor="#555555", 33 | 
ylabel_fontcolor="#555555", 34 | fontfamily="Roboto" 35 | ) 36 | plt.savefig("./test_files/stratogram_test.png", dpi=150) 37 | 38 | 39 | if __name__ == "__main__": 40 | # import sys;sys.argv = ['', 'Test.testStratogram'] 41 | unittest.main() 42 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | cmapPy==2.2.0 2 | -------------------------------------------------------------------------------- /docs/source/available_modules.rst: -------------------------------------------------------------------------------- 1 | .. _available_modules: 2 | 3 | Available Modules 4 | ================= 5 | 6 | clue_api_client 7 | --------------- 8 | 9 | A Python client for easy interaction with the Connectivity Map (CLUE) API. 10 | 11 | Maintainer: David Lahr, dlahr@broadinstitute.org 12 | 13 | pandasGEXpress 14 | -------------- 15 | 16 | A package (integrated with Python's pandas package) allowing users to easily read, modify, and write .gct and .gctx files. 17 | 18 | Also features the following command-line tools: 19 | 20 | ``gct2gctx``: converts .gct to .gctx file. Type ``gct2gctx -h`` for help. 21 | 22 | ``gctx2gct``: converts .gctx to .gct file. Type ``gctx2gct -h`` for help. 23 | 24 | ``concat``: Concats two or more .gct/x files as specified by user. Type ``concat -h`` for help. 25 | 26 | Maintainer: Oana Enache, oana@broadinstitute.org 27 | 28 | set_io 29 | ------ 30 | 31 | set_io contains simple scripts for parsing two other common file types used by the Connectivity Map: GRP and GMT files. The GRP file is used for storing a single set of things (e.g. a single gene set), while the GMT file is used for storing multiple sets of things (e.g. several gene sets). 
32 | 33 | Maintainer: Lev Litichevskiy, lev@broadinstitute.org 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /docs/source/build.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | Installation 4 | ============ 5 | 6 | We highly recommend using a prebuilt distribution of cmapPy along with a virtual environment (here we demonstrate how to use it with conda). 7 | 8 | **Option 1 (recommended): Set up pandasGEXpress in a new conda environment** 9 | 10 | * (All operating systems; if you haven't already) install ``miniconda`` 11 | * Download/follow instructions provided `here `_. Unless you have personal preferences/reasons to do otherwise, we recommend installing Miniconda over Anaconda because it's more lightweight. 12 | * On the command line, type ``conda info`` to verify that conda has been properly installed on your system. You should see some information about the "current conda install"; if not, your installation didn't work. 13 | 14 | * (Linux and Mac) Create & activate your cmapPy environment: 15 | 16 | Note. We currently use Python 2.7.11 for our production code (hence its specification); however, other versions of Python 2 should be stable as well. We do not currently support Python 3. 
17 | 18 | **Step 1** 19 | 20 | Python 2: ``conda create --name my_cmapPy_env -c bioconda python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0 cmappy`` 21 | 22 | * ``-c bioconda`` tells conda that it should look for packages in the bioconda channel (that's where cmapPy lives) 23 | 24 | 25 | **Step 2** 26 | 27 | ``source activate my_cmapPy_env`` 28 | 29 | * (Windows) Create & activate your cmapPy environment: 30 | 31 | **Step 1** 32 | 33 | Python 2: ``conda create --name my_cmapPy_env python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0`` 34 | 35 | **Step 2** 36 | 37 | ``pip install cmapPy`` 38 | 39 | ``source activate my_cmapPy_env`` 40 | 41 | ...and then cmapPy (including command line tools) should be available for use. 42 | 43 | To update cmapPy in your conda environment (from within the activated environment): ``conda update cmappy`` 44 | 45 | **Option 2: Install cmapPy from PyPI** 46 | 47 | * ``pip install cmapPy`` 48 | * Note: For use of other virtualenvs, we include a requirements.txt file in the cmapPy package that you can use to install the proper versions of dependencies. 49 | 50 | **Option 3: Install as a development environment** 51 | 52 | A development environment will allow you to use the cmapPy code as it is in a clone of the repository, allowing you to try out changes and modifications you may wish to make. 53 | 54 | Follow the instructions for Option 1 or Option 2 above but change the name of the environment to e.g. 
``my_cmapPy_dev_env`` and do not include ``cmappy`` in the list of packages to install (or do not install it with pip), then activate this environment, i.e.: 55 | 56 | Python 2.7: ``conda create --name my_cmapPy_dev_env python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0`` 57 | 58 | ``source activate my_cmapPy_dev_env`` 59 | 60 | Clone the cmapPy github repository, cd into the repo's top-level directory, and run: 61 | 62 | ``$ python setup.py develop`` 63 | 64 | To test your setup, change into a directory outside the repo, run the python interpreter and try: 65 | ``cd `` 66 | 67 | ``$ python`` 68 | 69 | ``>> import cmapPy.pandasGEXpress.parse_gct as pg`` 70 | -------------------------------------------------------------------------------- /docs/source/citing.rst: -------------------------------------------------------------------------------- 1 | .. _citing: 2 | 3 | Citation Information 4 | ==================== 5 | 6 | If you use GCTx and/or cmapPy, please cite `Enache et al.`_ 7 | 8 | .. _Enache et al.: https://www.biorxiv.org/content/early/2017/11/30/227041 9 | -------------------------------------------------------------------------------- /docs/source/clue_api_client.rst: -------------------------------------------------------------------------------- 1 | .. _clueclient: 2 | 3 | API (clue_api_client) 4 | ===================== 5 | 6 | To use the CLUE API client, put a copy of the file ``example_cmapPy_config_file.cfg`` in your home directory and name the copy ``.cmapPy.cfg``. Replace the clue_api_user_key entries in that file with your CLUE API user key that you obtained from the CLUE website. 7 | 8 | .. automodule:: cmapPy.clue_api_client.clue_api_client 9 | :members: 10 | 11 | .. automodule:: cmapPy.clue_api_client.cell_queries 12 | :members: 13 | 14 | .. automodule:: cmapPy.clue_api_client.gene_queries 15 | :members: 16 | 17 | .. automodule:: cmapPy.clue_api_client.macchiato_queries 18 | :members: 19 | 20 | .. 
automodule:: cmapPy.clue_api_client.pert_queries 21 | :members: 22 | -------------------------------------------------------------------------------- /docs/source/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | Contribution guidelines 4 | ======================= 5 | 6 | We welcome contributors! For your pull requests, please include the following: 7 | 8 | * Sample code/file that reproducibly causes the bug/issue 9 | * Documented code (include a docstring for new functions!) providing fix 10 | * Unit tests evaluating added/modified methods. -------------------------------------------------------------------------------- /docs/source/faq.rst: -------------------------------------------------------------------------------- 1 | .. _faq: 2 | 3 | FAQ 4 | === 5 | 6 | We will be adding FAQs as they come up. 7 | 8 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | cmapPy: Python utilities for Connectivity Map Resources 2 | ******************************************************* 3 | 4 | Provided by the Connectivity Map, Broad Institute of MIT and Harvard. More information 5 | `on our website `_ 6 | 7 | Where to Start 8 | -------------- 9 | 10 | * :ref:`Installation ` 11 | * :ref:`Summary of Available Modules ` 12 | 13 | 14 | High-level API reference 15 | ------------------------ 16 | 17 | .. toctree:: 18 | :maxdepth: 1 19 | 20 | clue_api_client 21 | pandasGEXpress 22 | set_io 23 | 24 | 25 | Other resources 26 | --------------- 27 | 28 | * `GitHub project `_ 29 | * `Tutorials and additional reference `_ 30 | 31 | 32 | Meta-info about cmapPy 33 | ---------------------- 34 | 35 | .. 
toctree:: 36 | :maxdepth: 1 37 | 38 | contributing 39 | faq 40 | licenses 41 | citing 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /docs/source/licenses.rst: -------------------------------------------------------------------------------- 1 | .. _licenses: 2 | 3 | BSD 3-Clause License 4 | ==================== 5 | 6 | Copyright (c) 2017, Connectivity Map (CMap) at the Broad Institute, Inc. 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of the copyright holder nor the names of its 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /docs/source/pandasGEXpress.rst: -------------------------------------------------------------------------------- 1 | .. _pandasGEXpress: 2 | 3 | GCT, GCTx (pandasGEXpress) 4 | ========================== 5 | 6 | pandasGEXpress is a package (integrated with Python's `pandas `_ package) that allows users to easily read, modify, and write .gct and .gctx files. Note that .gctx files are more performant than .gct, and we recommend their use. 7 | 8 | 9 | GCToo Class 10 | ----------- 11 | .. autoclass:: cmapPy.pandasGEXpress.GCToo.GCToo 12 | 13 | Parsing 14 | ------- 15 | 16 | .. autofunction:: cmapPy.pandasGEXpress.parse.parse 17 | 18 | Writing 19 | ------- 20 | 21 | .. autofunction:: cmapPy.pandasGEXpress.write_gctx.write 22 | 23 | .. autofunction:: cmapPy.pandasGEXpress.write_gct.write 24 | 25 | Concatenating 26 | ------------- 27 | 28 | .. automodule:: cmapPy.pandasGEXpress.concat 29 | :members: 30 | 31 | Converting .gct <-> .gctx 32 | ------------------------- 33 | 34 | .. automodule:: cmapPy.pandasGEXpress.gct2gctx 35 | :members: 36 | 37 | .. automodule:: cmapPy.pandasGEXpress.gctx2gct 38 | :members: 39 | 40 | Extracting from .grp files 41 | -------------------------- 42 | 43 | .. automodule:: cmapPy.pandasGEXpress.plategrp 44 | :members: 45 | 46 | Subsetting 47 | ---------- 48 | 49 | .. automodule:: cmapPy.pandasGEXpress.random_slice 50 | :members: 51 | 52 | .. 
automodule:: cmapPy.pandasGEXpress.subset 53 | :members: 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /docs/source/pandasgexpress_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/docs/source/pandasgexpress_fig.png -------------------------------------------------------------------------------- /docs/source/set_io.rst: -------------------------------------------------------------------------------- 1 | .. _set_io: 2 | 3 | GRP, GMT (set_io) 4 | ================= 5 | 6 | set_io contains simple scripts for parsing two other common file types used by the Connectivity Map: GRP and GMT files. 7 | The GRP file is used for storing a single set of things (e.g. a single gene set), while the GMT file is used for storing multiple sets of things (e.g. several gene sets). 8 | 9 | Further details on GRP and GMT files can be found `here 10 | `_. 11 | 12 | Reading GRP files 13 | ----------------- 14 | 15 | .. autofunction:: cmapPy.set_io.grp.read 16 | 17 | Writing GRP files 18 | ----------------- 19 | 20 | .. autofunction:: cmapPy.set_io.grp.write 21 | 22 | Reading GMT files 23 | ----------------- 24 | 25 | .. autofunction:: cmapPy.set_io.gmt.read 26 | 27 | Verifying GMT integrity 28 | ----------------------- 29 | 30 | .. autofunction:: cmapPy.set_io.gmt.verify_gmt_integrity 31 | 32 | Writing GMT files 33 | ----------------- 34 | 35 | .. 
autofunction:: cmapPy.set_io.gmt.write 36 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | # on alpine, copy to /etc/nginx/nginx.conf 2 | user root; 3 | worker_processes auto; 4 | 5 | error_log /var/log/nginx/error.log warn; 6 | 7 | events { 8 | worker_connections 1024; 9 | } 10 | 11 | http { 12 | include /etc/nginx/mime.types; 13 | default_type application/octet-stream; 14 | sendfile off; 15 | access_log off; 16 | keepalive_timeout 3000; 17 | server { 18 | listen 9081; 19 | index index.html; 20 | server_name cmapPy; 21 | client_max_body_size 16m; 22 | port_in_redirect off; 23 | location ~ ^/cmapPy$ { 24 | try_files $uri @rewrite; 25 | } 26 | location @rewrite { 27 | return 302 $scheme://$http_host$uri/; 28 | } 29 | location ~ ^/cmapPy(?:/(.*))?$ { 30 | root /usr/share/nginx/html; 31 | access_log off; 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /performance_testing/python_parse_timing.py: -------------------------------------------------------------------------------- 1 | # '/path/with/gctx/files/to/test/*gct*' refers to a directory of GCT and/or GCTX files to time parsing operations on. 2 | # Cache was cleared in between consecutive operations. 
3 | 4 | import time 5 | import pandas as pd 6 | import glob 7 | import cmapPy.pandasGEXpress.parse as parse 8 | 9 | # for storing timing results 10 | parse_times = {} 11 | 12 | # input directory of files (gct or gctx) to test 13 | input_files = glob.glob("/path/with/gctx/files/to/test/*gct*") 14 | 15 | for f in input_files: 16 | start = time.clock() 17 | in_gctoo = parse.parse(f) 18 | end = time.clock() 19 | elapsed_time = end - start 20 | parse_times[f] = elapsed_time 21 | 22 | # write results to file 23 | parse_time_series = pd.Series(parse_times) 24 | parse_time_series.to_csv("python_parsing_results.txt", sep="\t") 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /performance_testing/python_write_timing.py: -------------------------------------------------------------------------------- 1 | # '/path/to/large/gctx/file' refers to a large GCTX file (any size above 10174x100000 should work) from which file subsets are made. 2 | # In testing, the large GCTX file used lacked metadata; including metadata would cause slight variation in results. 3 | # Cache was cleared in between consecutive operations. 
4 | 5 | import os 6 | import time 7 | import pandas as pd 8 | import cmapPy.pandasGEXpress.write_gctx as write_gctx 9 | import cmapPy.pandasGEXpress.write_gct as write_gct 10 | import cmapPy.pandasGEXpress.parse as parse 11 | import cmapPy.pandasGEXpress.subset_gctoo as sg 12 | 13 | # for storing timing results 14 | gct_times = {} 15 | gctx_times = {} 16 | 17 | # large input gctx; see notes above for more info about this 18 | big_gctoo = parse.parse("/path/to/large/gctx/file") 19 | 20 | # column and row spaces to test writing on 21 | col_spaces = [96, 384, 1536, 3000, 6000, 12000, 24000, 48000, 100000] 22 | row_spaces = [978, 10174] 23 | 24 | for c in col_spaces: 25 | for r in row_spaces: 26 | curr_gctoo = sg.subset_gctoo(big_gctoo, ridx = range(0, r), cidx=range(0,c)) 27 | # gct writing 28 | out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gct" 29 | start = time.clock() 30 | write_gct.write(curr_gctoo, out_fname) 31 | end = time.clock() 32 | elapsed_time = end - start 33 | gct_times[out_fname] = elapsed_time 34 | os.remove(out_fname) 35 | # gctx writing 36 | out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gctx" 37 | start = time.clock() 38 | write_gctx.write(curr_gctoo, out_fname) 39 | end = time.clock() 40 | elapsed_time = end - start 41 | gctx_times[out_fname] = elapsed_time 42 | os.remove(out_fname) 43 | 44 | # write results to file 45 | gct_df = pd.DataFrame(pd.Series(gct_times)) 46 | gctx_df = pd.DataFrame(pd.Series(gctx_times)) 47 | write_times_df = pd.concat([gct_df, gctx_df]) 48 | write_times_df.columns = ["write_time"] 49 | write_times_df.to_csv("python_writing_results.txt", sep="\t") 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.11.2 2 | pandas==0.20.3 3 | h5py==2.7.0 4 | requests==2.20.0 5 | 6 | -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # Only Python 2.7 supported 3 | universal=0 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Always prefer setuptools over distutils 2 | from setuptools import setup, find_packages 3 | # To use a consistent encoding 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | setup( 10 | name='cmapPy', 11 | 12 | # Versions should comply with PEP440. For a discussion on single-sourcing 13 | # the version across setup.py and the project code, see 14 | # https://packaging.python.org/en/latest/single_source_version.html 15 | version='3.3.3', 16 | 17 | description='Assorted tools for interacting with .gct, .gctx files and other Connectivity Map (Broad Institute) data/tools', 18 | long_description="cmapPy: Tools for interacting with .gctx and .gct files, and other Connectivity Map resources. See our documentation at http://cmappy.readthedocs.io/en/latest/, and for more information on the file formats and available resources, please see clue.io/gctx.", 19 | 20 | # The project's main homepage. 21 | url='https://github.com/cmap/cmapPy', 22 | 23 | # Author details 24 | maintainer='Oana Enache', 25 | maintainer_email='oana@broadinstitute.org', 26 | 27 | # Choose your license 28 | license='BSD 3-clause', 29 | 30 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 31 | classifiers=[ 32 | # How mature is this project? 
Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 5 - Production/Stable', 37 | 38 | # Indicate who your project is intended for 39 | 'Intended Audience :: Science/Research', 40 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 41 | 42 | # Pick your license as you wish (should match "license" above) 43 | 'License :: OSI Approved :: BSD License', 44 | 45 | # Specify the Python versions you support here. In particular, ensure 46 | # that you indicate whether you support Python 2, Python 3 or both. 47 | 'Programming Language :: Python :: 2', 48 | 'Programming Language :: Python :: 2.7' 49 | ], 50 | 51 | # What does your project relate to? 52 | keywords='gct gctx file-manipulation Connectivity Map CMap Broad Institute', 53 | 54 | # You can just specify the packages manually here if your project is 55 | # simple. Or you can use find_packages(). 56 | packages=find_packages(exclude=['contrib','docs','tutorials', 'tests', 'performance_testing']), 57 | 58 | # List run-time dependencies here. These will be installed by pip when 59 | # your project is installed. For an analysis of "install_requires" vs pip's 60 | # requirements files see: 61 | # https://packaging.python.org/en/latest/requirements.html 62 | install_requires=['numpy>=1.11.2', 'pandas>=0.18', 'h5py>=2.6.0', 'requests>=2.13.0', 'six'], 63 | 64 | # List additional groups of dependencies here (e.g. development 65 | # dependencies). You can install these using the following syntax, 66 | # for example: 67 | # $ pip install -e .[dev,test] 68 | extras_require={}, 69 | 70 | # If there are data files included in your packages that need to be 71 | # installed, specify them here. If using Python 2.6 or less, then these 72 | # have to be included in MANIFEST.in as well. 
73 | #package_data={}, 74 | include_package_data=True, # reads these from MANIFEST.in 75 | 76 | # Although 'package_data' is the preferred approach, in some case you may 77 | # need to place data files outside of your packages. See: 78 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 79 | # In this case, 'data_file' will be installed into '/my_data' 80 | data_files=[], 81 | 82 | # To provide executable scripts, use entry points in preference to the 83 | # "scripts" keyword. Entry points provide cross-platform support and allow 84 | # pip to create the appropriate form of executable for the target platform. 85 | entry_points={'console_scripts': ['gctx2gct=cmapPy.pandasGEXpress.gctx2gct:main', 'gct2gctx=cmapPy.pandasGEXpress.gct2gctx:main', 86 | 'concat=cmapPy.pandasGEXpress.concat:main', 'subset=cmapPy.pandasGEXpress.subset:main']}, 87 | 88 | tests_require=['unittest'] 89 | ) 90 | -------------------------------------------------------------------------------- /tutorials/GCTX_mockup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/tutorials/GCTX_mockup.png -------------------------------------------------------------------------------- /tutorials/GCT_mockup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/tutorials/GCT_mockup.png --------------------------------------------------------------------------------