├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── cmapPy
    ├── __init__.py
    ├── clue_api_client
    │   ├── __init__.py
    │   ├── cell_queries.py
    │   ├── clue_api_client.py
    │   ├── gene_queries.py
    │   ├── macchiato_queries.py
    │   ├── mock_clue_api_client.py
    │   ├── pert_queries.py
    │   ├── setup_logger.py
    │   └── tests
    │   │   ├── test_cell_queries.py
    │   │   ├── test_clue_api_client.py
    │   │   ├── test_gene_queries.py
    │   │   ├── test_macchiato_queries.py
    │   │   ├── test_mock_clue_api_client.py
    │   │   └── test_pert_queries.py
    ├── example_cmapPy_config_file.cfg
    ├── math
    │   ├── __init__.py
    │   ├── agg_wt_avg.py
    │   ├── fast_corr.py
    │   ├── fast_cov.py
    │   ├── robust_zscore.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── test_agg_wt_avg.py
    │   │   ├── test_fast_corr.py
    │   │   ├── test_fast_cov.py
    │   │   └── test_robust_zscore.py
    ├── pandasGEXpress
    │   ├── GCToo.py
    │   ├── README.rst
    │   ├── __init__.py
    │   ├── concat.py
    │   ├── concat_gctoo.py
    │   ├── diff_gctoo.py
    │   ├── gct2gctx.py
    │   ├── gctx2gct.py
    │   ├── mini_gctoo_for_testing.py
    │   ├── parse.py
    │   ├── parse_gct.py
    │   ├── parse_gctx.py
    │   ├── random_slice.py
    │   ├── setup_GCToo_logger.py
    │   ├── simple_GCT_to_GCToo_figure.png
    │   ├── slice_gct.py
    │   ├── slice_gctoo.py
    │   ├── subset.py
    │   ├── subset_gctoo.py
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── functional_tests
    │   │   │   ├── LJP_row_metadata.txt
    │   │   │   ├── both_metadata_example_n1476x978.gct
    │   │   │   ├── both_metadata_example_n1476x978.gctx
    │   │   │   ├── col_meta_only_example_n355x355.gct
    │   │   │   ├── col_meta_only_example_n355x355.gctx
    │   │   │   ├── concated.gctx
    │   │   │   ├── metadata_writer_test.gctx
    │   │   │   ├── mini_folder
    │   │   │   │   ├── both_metadata_example_n1476x978.gctx
    │   │   │   │   ├── col_meta_only_example_n355x355.gctx
    │   │   │   │   ├── row_meta_only_example_n2x1203.gctx
    │   │   │   │   └── tsne_n2x1203.gctx
    │   │   │   ├── mini_gctoo_data_matrix.gctx
    │   │   │   ├── mini_gctoo_for_testing.gct
    │   │   │   ├── mini_gctoo_for_testing.gctx
    │   │   │   ├── mini_gctoo_for_testing_nometa.gct
    │   │   │   ├── mini_gctoo_for_testing_nometa.gctx
    │   │   │   ├── mini_gctx_with_metadata_n2x3.gctx
    │   │   │   ├── older_version_v1_2.gct
    │   │   │   ├── row_meta_only_example_n2x1203.gct
    │   │   │   ├── row_meta_only_example_n2x1203.gctx
    │   │   │   ├── test_colmeta_n6.txt
    │   │   │   ├── test_concat
    │   │   │   │   └── test_main
    │   │   │   │   │   ├── a.gct
    │   │   │   │   │   └── b.gct
    │   │   │   ├── test_concat_gctoo_test_main_fake_empty_file.gct
    │   │   │   ├── test_l1000.gct
    │   │   │   ├── test_l1000.gctx
    │   │   │   ├── test_l1000_highprecision.gct
    │   │   │   ├── test_l1000_highprecision.gctx
    │   │   │   ├── test_merge_bottom.gct
    │   │   │   ├── test_merge_left.gct
    │   │   │   ├── test_merge_right.gct
    │   │   │   ├── test_merge_top.gct
    │   │   │   ├── test_merged_left_right.gct
    │   │   │   ├── test_merged_top_bottom.gct
    │   │   │   ├── test_missing_colmeta.txt
    │   │   │   ├── test_missing_rowmeta.txt
    │   │   │   ├── test_p100.gct
    │   │   │   ├── test_parse_gct_int_ids.gct
    │   │   │   ├── test_parse_gctx_rid_entrez_id.gctx
    │   │   │   ├── test_rowmeta_n6.txt
    │   │   │   ├── test_subset_expected.gct
    │   │   │   ├── test_subset_in.gct
    │   │   │   ├── test_subset_rid.grp
    │   │   │   ├── test_v1point2_n5x10.gct
    │   │   │   └── tsne_n2x1203.gctx
    │   │   ├── python2_tests
    │   │   │   ├── __init__.py
    │   │   │   ├── test_GCToo.py
    │   │   │   ├── test_concat.py
    │   │   │   ├── test_diff_gctoo.py
    │   │   │   ├── test_edge_cases.py
    │   │   │   ├── test_gct2gctx.py
    │   │   │   ├── test_gctx2gct.py
    │   │   │   ├── test_parse.py
    │   │   │   ├── test_parse_gct.py
    │   │   │   ├── test_parse_gctx.py
    │   │   │   ├── test_random_slice.py
    │   │   │   ├── test_subset.py
    │   │   │   ├── test_subset_gctoo.py
    │   │   │   ├── test_write_gct.py
    │   │   │   └── test_write_gctx.py
    │   │   ├── python3_tests
    │   │   │   ├── __init__.py
    │   │   │   ├── test_GCToo.py
    │   │   │   ├── test_concat.py
    │   │   │   ├── test_diff_gctoo.py
    │   │   │   ├── test_edge_cases.py
    │   │   │   ├── test_gct2gctx.py
    │   │   │   ├── test_gctx2gct.py
    │   │   │   ├── test_parse.py
    │   │   │   ├── test_parse_gct.py
    │   │   │   ├── test_parse_gctx.py
    │   │   │   ├── test_random_slice.py
    │   │   │   ├── test_subset.py
    │   │   │   ├── test_subset_gctoo.py
    │   │   │   ├── test_transform_gctoo.py
    │   │   │   ├── test_write_gct.py
    │   │   │   └── test_write_gctx.py
    │   │   └── test_python2_python3_compatibility.py
    │   ├── transform_gctoo.py
    │   ├── write_gct.py
    │   └── write_gctx.py
    ├── set_io
    │   ├── __init__.py
    │   ├── gmt.py
    │   ├── grp.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── functional_tests
    │   │       ├── test.gmt
    │   │       ├── test.grp
    │   │       ├── test_bad.gmt
    │   │       └── test_bad2.gmt
    │   │   ├── test_gmt.py
    │   │   └── test_grp.py
    └── visualization
    │   ├── .gitignore
    │   ├── __init__.py
    │   ├── cohort_view.py
    │   ├── scattergram.py
    │   ├── stratogram.py
    │   ├── test_cohort_view.py
    │   ├── test_files
    │       └── PBRANT_CYCLE1_key_metrics_expanded_sample.txt
    │   ├── test_scattergram.py
    │   └── test_stratogram.py
├── docs
    ├── Makefile
    ├── make.bat
    ├── requirements.txt
    └── source
    │   ├── available_modules.rst
    │   ├── build.rst
    │   ├── citing.rst
    │   ├── clue_api_client.rst
    │   ├── conf.py
    │   ├── contributing.rst
    │   ├── faq.rst
    │   ├── index.rst
    │   ├── licenses.rst
    │   ├── pandasGEXpress.rst
    │   ├── pandasgexpress_fig.png
    │   └── set_io.rst
├── nginx.conf
├── performance_testing
    ├── python_parse_timing.py
    └── python_write_timing.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tutorials
    ├── GCTX_mockup.png
    ├── GCT_mockup.png
    └── cmapPy_pandasGEXpress_tutorial.ipynb


/.dockerignore:
--------------------------------------------------------------------------------
 1 | cmapPy
 2 | performance_testing
 3 | tutorials
 4 | .travis.yml
 5 | LICENSE.txt
 6 | MANIFEST.in
 7 | README.rst
 8 | requirements.txt
 9 | setup.cfg
10 | setup.py


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | *.pyc
 3 | cmapPy.egg-info/
 4 | .vscode
 5 | .gitignore
 6 | .idea
 7 | docs/build
 8 | .DS_Store
 9 | *-checkpoint.ipynb
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # set language
 2 | language: python
 3 | 
 4 | # requirements
 5 | install: 
 6 |   - pip install -r requirements.txt
 7 |   - python setup.py develop
 8 | 
 9 | matrix:
10 |   include:
11 |     # run pandasGEXpress python2_tests      
12 |     - python: "2.7"
13 |       script:
14 |         - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python2_tests/
15 | 
16 |     # run pandasGEXpress python3_tests         
17 |     - python: "3.6"
18 |       script:
19 |         - python -m unittest discover -p "test_*.py" -s cmapPy/pandasGEXpress/tests/python3_tests/
20 |     
21 |     # run set_io tests for python2    
22 |     - python: "2.7"
23 |       script:
24 |         - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/
25 | 
26 |     # run set_io tests for python3    
27 |     - python: "3.6"
28 |       script:
29 |         - python -m unittest discover -p "test_*.py" -s cmapPy/set_io/tests/
30 |       
31 |     # run math tests for python2
32 |     - python: "2.7"
33 |       script:
34 |         - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/
35 |     
36 |      # run math tests for python3
37 |     - python: "3.6"
38 |       script:
39 |         - python -m unittest discover -p "test_*.py" -s cmapPy/math/tests/
40 | 
41 |     # run python2_python3_compatibility tests for python2
42 |     - python: "2.7"
43 |       script:
44 |         - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/
45 |    
46 |     # run python2_python3_compatibility tests for python3
47 |     - python: "3.6"
48 |       script:
49 |         - python -m unittest discover -p "test_python2_python3_*.py" -s cmapPy/pandasGEXpress/tests/
50 | 
51 | # what branches of github to use
52 | branches:
53 |   only:
54 |     - master


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nginx
2 | RUN mkdir -p /usr/share/nginx/html/cmapPy
3 | COPY docs/build/html /usr/share/nginx/html/cmapPy/
4 | COPY nginx.conf /etc/nginx/
5 | EXPOSE 9081
6 | CMD ["nginx", "-g", "daemon off;"]


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, Connectivity Map (CMap) at the Broad Institute, Inc. 
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include cmapPy *.py
2 | recursive-include cmapPy *.gct
3 | recursive-include cmapPy *.gctx
4 | recursive-include cmapPy *.cfg
5 | include LICENSE.txt
6 | include requirements.txt


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | |install with bioconda|
 2 | 
 3 | .. |install with bioconda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square
 4 |    :target: http://bioconda.github.io/recipes/cmappy/README.html
 5 |    
 6 | .. image:: https://badge.fury.io/py/cmapPy.svg
 7 |     :target: https://badge.fury.io/py/cmapPy
 8 | 
 9 | .. image:: https://travis-ci.org/cmap/cmapPy.svg?branch=master
10 |     :target: https://travis-ci.org/cmap/cmapPy
11 | 
12 | .. image:: https://readthedocs.org/projects/cmappy/badge/?version=latest
13 |     :target: http://cmappy.readthedocs.io/en/latest/?badge=latest
14 |     :alt: Documentation Status
15 | 
16 | **cmapPy:** Tools for interacting with .gctx and .gct files, and other Connectivity Map resources
17 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18 | **Connectivity Map, Broad Institute of MIT and Harvard**
19 | 
20 | Documentation: `<https://clue.io/cmapPy/index.html>`_
21 | 
22 | For questions/problems, please add an issue (that includes code/files that reproduce your problem) to the repository. 
23 | 
24 | Contributing
25 | ====================
26 | 
27 | We welcome contributors! For your pull requests, please include the following:
28 | 
29 | * Sample code/file that reproducibly causes the bug/issue
30 | * Documented code providing fix
31 | * Unit tests evaluating added/modified methods. 
32 |  
33 | 
34 | Citation
35 | ====================
36 | 
37 | If you use cmapPy and/or GCTx for your research, please cite `Enache et al.`_
38 | 
39 | .. _Enache et al.: https://academic.oup.com/bioinformatics/article/35/8/1427/5094509
40 | 


--------------------------------------------------------------------------------
/cmapPy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/__init__.py


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/clue_api_client/__init__.py


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/cell_queries.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | 
 4 | __authors__ = "David L. Lahr"
 5 | __email__ = "dlahr@broadinstitute.org"
 6 | 
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | resource_name = "cells"
11 | 
12 | 
def is_cell_line_in_api(my_clue_api_client, cell_id):
    """Report whether a count query on the "cells" resource finds exactly one entry.

    Args:
        my_clue_api_client: client object providing run_count_query(resource_name, where_clause)
        cell_id: value matched against the "cell_id" field

    Returns: True when the "count" entry of the query result equals 1, otherwise False
    """
    where_clause = {"cell_id": cell_id}
    count_response = my_clue_api_client.run_count_query(resource_name, where_clause)
    logger.debug("query_result:  {}".format(count_response))

    matched_exactly_once = count_response["count"] == 1
    return matched_exactly_once


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/clue_api_client.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import logging
 3 | import cmapPy.clue_api_client.setup_logger as setup_logger
 4 | import json
 5 | 
 6 | __authors__ = "David L. Lahr"
 7 | __email__ = "dlahr@broadinstitute.org"
 8 | 
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
class ClueApiClient(object):
    """Basic class for running queries against CLUE api

    Every request sends the user key in a "user_key" header.  Any response whose
    status code is not 200 raises an AssertionError (see _check_request_response).
    """

    def __init__(self, base_url=None, user_key=None):
        """
        Args:
            base_url: specific URL to use for the CLUE api, e.g. https://dev-api.clue.io/api/
                NOTE: resource names are appended after a "/" separator, so a trailing
                slash on base_url produces a double slash in the request URL
            user_key: user key to use for authentication, available from CLUE account

        Returns:
        """
        self.base_url = base_url
        self.headers = {"user_key":user_key}

    def _build_url(self, *path_parts):
        """join self.base_url and the provided path parts with "/"

        Args:
            path_parts: resource name, optionally followed by an id or sub-path such as "count"

        Returns: str - the URL to send the request to
        """
        # "%s" % (part,) converts non-string parts (e.g. an integer id) to text, so that
        # building the URL does not fail with a TypeError the way plain "+" concatenation does
        return "/".join([self.base_url] + ["%s" % (part,) for part in path_parts])

    def run_filter_query(self, resource_name, filter_clause):
        """run a query (get) against the CLUE api, using the API and user key fields of self and the filter_clause provided

        Args:
            resource_name: str - name of the resource / collection to query - e.g. genes, perts, cells etc.
            filter_clause: dictionary - contains filter to pass to the API; uses loopback specification

        Returns: list of dictionaries containing the results of the query

        Raises: AssertionError if the response status code is not 200
        """
        url = self._build_url(resource_name)
        params = {"filter":json.dumps(filter_clause)}

        r = requests.get(url, headers=self.headers, params=params)
        logger.debug("requests.get result r.status_code:  {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_count_query(self, resource_name, where_clause):
        """run a count query (get) against CLUE api

        Args:
            resource_name: str - name of the resource / collection to query - e.g. genes, perts, cells etc.
            where_clause: dictionary - contains where clause to pass to the API; uses loopback specification

        Returns: dictionary containing the results of the query

        Raises: AssertionError if the response status code is not 200
        """
        url = self._build_url(resource_name, "count")
        params = {"where":json.dumps(where_clause)}

        r = requests.get(url, headers=self.headers, params=params)
        logger.debug("requests.get result r.status_code:  {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_post(self, resource_name, data):
        """post data to the named resource of the CLUE api

        Args:
            resource_name: str - name of the resource / collection to post to
            data: passed unchanged as the data argument of requests.post

        Returns: the decoded JSON body of the response

        Raises: AssertionError if the response status code is not 200
        """
        url = self._build_url(resource_name)

        r = requests.post(url, data=data, headers=self.headers)
        logger.debug("requests.post result r.status_code:  {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    def run_delete(self, resource_name, id):
        """delete the entry with the given id from the named resource

        Args:
            resource_name: str - name of the resource / collection containing the entry
            id: identifier of the entry; appended to the URL after the resource name

        Returns: bool - True if the "count" field of the JSON response equals 1

        Raises: AssertionError if the response status code is not 200
        """
        url = self._build_url(resource_name, id)
        r = requests.delete(url, headers=self.headers)
        logger.debug("requests.delete result r.status_code:  {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        did_delete = r.json()["count"] == 1
        return did_delete

    def run_put(self, resource_name, id, data):
        """put data to the entry with the given id in the named resource

        Args:
            resource_name: str - name of the resource / collection containing the entry
            id: identifier of the entry; appended to the URL after the resource name
            data: passed unchanged as the data argument of requests.put

        Returns: the decoded JSON body of the response

        Raises: AssertionError if the response status code is not 200
        """
        url = self._build_url(resource_name, id)

        r = requests.put(url, data=data, headers=self.headers)
        logger.debug("requests.put result r.status_code:  {}".format(r.status_code))

        ClueApiClient._check_request_response(r)

        return r.json()

    @staticmethod
    def _check_request_response(response):
        """raise if the response does not have status code 200

        Args:
            response: object with status_code and reason attributes

        Raises: AssertionError whose message contains the status code and the reason
        """
        # raised explicitly instead of using an assert statement so that the check is
        # still performed when Python runs with -O (which removes assert statements);
        # the exception type is kept as AssertionError for existing callers
        if response.status_code != 200:
            raise AssertionError("ClueApiClient request failed response.status_code:  {}  response.reason:  {}".format(
                response.status_code, response.reason))


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/gene_queries.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 3 | 
 4 | __authors__ = "David L. Lahr"
 5 | __email__ = "dlahr@broadinstitute.org"
 6 | 
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | resource_name = "genes"
11 | 
12 | 
def are_genes_in_api(my_clue_api_client, gene_symbols):
    """Find which of the provided gene symbols the API returns entries for.

    Args:
        my_clue_api_client: client object providing run_filter_query(resource_name, filter_clause)
        gene_symbols: sized collection of gene symbols to look up; anything that is
            not already a list is copied into one before being placed in the filter

    Returns: set of the "gene_symbol" values present in the query result; an empty
        set (after logging a warning, without querying) when gene_symbols is empty

    """
    # guard clause: nothing to ask the API about
    if len(gene_symbols) == 0:
        logger.warning("provided gene_symbols was empty, cannot run query")
        return set()

    symbols_for_query = list(gene_symbols) if type(gene_symbols) is not list else gene_symbols

    api_filter = {
        "where": {"gene_symbol": {"inq": symbols_for_query}},
        "fields": {"gene_symbol": True},
    }
    query_result = my_clue_api_client.run_filter_query(resource_name, api_filter)
    logger.debug("query_result:  {}".format(query_result))

    return {entry["gene_symbol"] for entry in query_result}
35 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/macchiato_queries.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | 
 4 | __authors__ = "David L. Lahr"
 5 | __email__ = "dlahr@broadinstitute.org"
 6 | 
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | resource_name = "macchiato"
11 | 
12 | uploading_status = "UPLOADING"
13 | uploaded_status = "UPLOADED"
14 | 
15 | 
def is_brew_prefix_in_api(my_clue_api_client, brew_prefix):
    """Report whether a count query on the "macchiato" resource finds exactly one entry.

    Args:
        my_clue_api_client: client object providing run_count_query(resource_name, where_clause)
        brew_prefix: value matched against the "brew_prefix" field

    Returns: True when the "count" entry of the query result equals 1, otherwise False
    """
    count_response = my_clue_api_client.run_count_query(resource_name, {"brew_prefix": brew_prefix})
    logger.debug("query_result:  {}".format(count_response))

    found_exactly_once = count_response["count"] == 1
    return found_exactly_once
21 | 
22 | 
def get_api_id(my_clue_api_client, brew_prefix):
    """Look up the API id of the entry with the given brew_prefix.

    Args:
        my_clue_api_client: client object providing run_filter_query(resource_name, filter_clause)
        brew_prefix: value matched against the "brew_prefix" field

    Returns: the "id" field of the first entry in the query result (indexing the
        result fails if the query returns no entries)
    """
    id_only_filter = {
        "where": {"brew_prefix": brew_prefix},
        "fields": {"id": True},
    }
    matches = my_clue_api_client.run_filter_query(resource_name, id_only_filter)
    logger.debug("id_result:  {}".format(matches))

    first_match = matches[0]
    return first_match["id"]
28 | 
29 | 
def change_status(my_clue_api_client, api_id, new_status):
    """Put a new "status" value onto the macchiato entry identified by api_id.

    Args:
        my_clue_api_client: client object providing run_put(resource_name, id, data)
        api_id: id of the entry to update
        new_status: value to send as the "status" field

    Returns: whatever the client's run_put call returns
    """
    status_update = {"status": new_status}
    return my_clue_api_client.run_put(resource_name, api_id, status_update)
33 | 
34 | 
def create_brew_prefix_in_api(my_clue_api_client, brew_prefix, status=uploading_status):
    """Post a new macchiato entry for brew_prefix.

    Args:
        my_clue_api_client: client object providing run_post(resource_name, data)
        brew_prefix: value to send as the "brew_prefix" field
        status: value to send as the "status" field; defaults to uploading_status

    Returns: whatever the client's run_post call returns
    """
    # use the caller-supplied status; previously the module-level uploading_status
    # was always sent, so the status argument had no effect
    data = {"brew_prefix":brew_prefix, "status":status}
    r = my_clue_api_client.run_post(resource_name, data)
    return r
39 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/mock_clue_api_client.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import cmapPy.clue_api_client.clue_api_client as clue_api_client
 4 | 
 5 | __authors__ = "David L. Lahr"
 6 | __email__ = "dlahr@broadinstitute.org"
 7 | 
 8 | 
 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
10 | 
11 | 
class MockClueApiClient(clue_api_client.ClueApiClient):
    """Stand-in for ClueApiClient that returns preconfigured values instead of
    sending requests.

    Each run_* method ignores its arguments and returns the value configured for
    it in the constructor.
    """

    def __init__(self, base_url=None, user_key=None, default_return_values=None, filter_query_result=None,
        count_query_result=None, post_result=None, delete_result=None, put_result=None):
        """
        Args:
            base_url: passed through to ClueApiClient.__init__
            user_key: passed through to ClueApiClient.__init__
            default_return_values: value returned by any run_* method whose own
                result argument is None; an empty list when this is None
            filter_query_result: value returned by run_filter_query
            count_query_result: value returned by run_count_query
            post_result: value returned by run_post
            delete_result: value returned by run_delete
            put_result: value returned by run_put
        """

        super(MockClueApiClient, self).__init__(base_url=base_url, user_key=user_key)

        # compare against None rather than testing truthiness, so that explicitly
        # supplied falsy values (False, 0, [], {}) are returned as configured
        # instead of being silently replaced by the default
        self.default_return_values = [] if default_return_values is None else default_return_values

        self.filter_query_result = self._value_or_default(filter_query_result)

        self.count_query_result = self._value_or_default(count_query_result)

        self.post_result = self._value_or_default(post_result)

        self.delete_result = self._value_or_default(delete_result)

        self.put_result = self._value_or_default(put_result)

    def _value_or_default(self, value):
        """return value unless it is None, in which case return self.default_return_values"""
        return self.default_return_values if value is None else value

    def run_filter_query(self, resource_name, filter_clause):
        """return the configured filter query result; arguments are ignored"""
        return self.filter_query_result

    def run_count_query(self, resource_name, where_clause):
        """return the configured count query result; arguments are ignored"""
        return self.count_query_result

    def run_post(self, resource_name, data):
        """return the configured post result; arguments are ignored"""
        return self.post_result

    def run_delete(self, resource_name, id):
        """return the configured delete result; arguments are ignored"""
        return self.delete_result

    def run_put(self, resource_name, id, data):
        """return the configured put result; arguments are ignored"""
        return self.put_result
44 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/pert_queries.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | 
 4 | __authors__ = "David L. Lahr"
 5 | __email__ = "dlahr@broadinstitute.org"
 6 | 
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | resource_name = "perts"
11 | 
12 | 
def retrieve_pert_id_pert_iname_map(pert_ids, my_clue_api_client):
    """Query the "perts" resource and map each returned pert_id to its pert_iname.

    Args:
        pert_ids: pert_id values placed in the "inq" clause of the filter
        my_clue_api_client: client object providing run_filter_query(resource_name, filter_clause)

    Returns: dictionary of pert_id -> pert_iname built from the query result
    """
    requested_fields = {"pert_id": True, "pert_iname": True}
    api_filter = {"where": {"pert_id": {"inq": pert_ids}}, "fields": requested_fields}

    perts = my_clue_api_client.run_filter_query(resource_name, api_filter)
    logger.debug("query_result:  {}".format(perts))

    return _build_map_from_clue_api_result(perts, "pert_id", "pert_iname")
20 | 
21 | 
def retrieve_pert_id_pert_type_map(pert_ids, my_clue_api_client):
    """Query the "perts" resource and map each returned pert_id to its pert_type.

    Args:
        pert_ids: pert_id values placed in the "inq" clause of the filter
        my_clue_api_client: client object providing run_filter_query(resource_name, filter_clause)

    Returns: dictionary of pert_id -> pert_type built from the query result
    """
    requested_fields = {"pert_id": True, "pert_type": True}
    api_filter = {"where": {"pert_id": {"inq": pert_ids}}, "fields": requested_fields}

    perts = my_clue_api_client.run_filter_query(resource_name, api_filter)
    logger.debug("query_result:  {}".format(perts))

    return _build_map_from_clue_api_result(perts, "pert_id", "pert_type")
29 | 
30 | 
31 | def _build_map_from_clue_api_result(clue_api_result, key_field, value_field):
32 |     r = {}
33 |     for car in clue_api_result:
34 |         key = car[key_field]
35 |         value = car[value_field]
36 |         r[key] = value
37 | 
38 |     return r
39 | 
40 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/setup_logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import logging.handlers
 3 | 
 4 | __author__ = "David Lahr"
 5 | __email__ = "dlahr@broadinstitute.org"
 6 | 
 7 | LOGGER_NAME = "cmap_logger"
 8 | 
 9 | _LOG_FORMAT = "%(levelname)s %(asctime)s %(module)s %(funcName)s %(message)s"
10 | _LOG_FILE_MAX_BYTES = 10000000
11 | _LOG_FILE_BACKUP_COUNT = 5
12 | 
13 | 
def setup(verbose=False, log_file=None):
    """Configure logging for the logger named LOGGER_NAME.

    Args:
        verbose: when true the level is logging.DEBUG, otherwise logging.INFO
        log_file: when None, logging.basicConfig is called with the level and
            _LOG_FORMAT; otherwise the named logger's level is set and a
            RotatingFileHandler writing to log_file (using _LOG_FILE_MAX_BYTES and
            _LOG_FILE_BACKUP_COUNT) is added to it
    """
    cmap_logger = logging.getLogger(LOGGER_NAME)

    if verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO

    if log_file is None:
        logging.basicConfig(level=level, format=_LOG_FORMAT)
        return

    cmap_logger.setLevel(level)

    file_handler = logging.handlers.RotatingFileHandler(
        log_file,
        maxBytes=_LOG_FILE_MAX_BYTES,
        backupCount=_LOG_FILE_BACKUP_COUNT,
    )
    file_handler.setFormatter(logging.Formatter(fmt=_LOG_FORMAT))
    cmap_logger.addHandler(file_handler)
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_cell_queries.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import logging
 4 | import test_clue_api_client
 5 | import cmapPy.clue_api_client.cell_queries as cq
 6 | 
 7 | __authors__ = "David L. Lahr"
 8 | __email__ = "dlahr@broadinstitute.org"
 9 | 
10 | 
11 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
12 | 
13 | cao = None
14 | 
15 | 
class TestCellQueries(unittest.TestCase):
    def test_is_cell_line_in_api(self):
        """A375 is expected to be reported as present; a made-up name is not."""
        known_cell_line = "A375"
        self.assertTrue(cq.is_cell_line_in_api(cao, known_cell_line))

        made_up_cell_line = "Dave Lahr's fake cell line that never existed"
        self.assertFalse(cq.is_cell_line_in_api(cao, made_up_cell_line))
22 | 
23 | 
# Only when this file is run as a script: set up verbose logging and build the
# module-level client (cao) before handing control to unittest.  When the module
# is merely imported, cao keeps the None value assigned near the top of the file.
if __name__ == "__main__":
    setup_logger.setup(verbose=True)

    # build the client from the default test configuration
    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_clue_api_client.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import cmapPy.clue_api_client.setup_logger as setup_logger
  3 | import logging
  4 | import cmapPy.clue_api_client.clue_api_client as clue_api_client
  5 | import os.path
  6 | import collections
  7 | 
  8 | __authors__ = "David L. Lahr"
  9 | __email__ = "dlahr@broadinstitute.org"
 10 | 
 11 | 
 12 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 13 | 
 14 | config_filepath = os.path.expanduser("~/.cmapPy.cfg")
 15 | config_section = "test"
 16 | cao = None
 17 | 
 18 | test_brew_prefix = "dlahr brew prefix 001"
 19 | test_status = "my fake status"
 20 | 
class TestClueApiClient(unittest.TestCase):
    """Tests that exercise ClueApiClient through the module-level client cao.

    NOTE(review): cao is None at import time; presumably it is assigned before
    the tests run (e.g. in this file's __main__ section) - confirm.
    """

    def test_run_query(self):
        """run_filter_query returns the expected number of entries for three filters"""
        #get one gene
        r = cao.run_filter_query("genes", {"where":{"entrez_id":5720}})
        self.assertIsNotNone(r)
        logger.debug("len(r):  {}".format(len(r)))
        logger.debug("r:  {}".format(r))
        self.assertEqual(1, len(r))

        #get multiple genes
        r = cao.run_filter_query("genes", {"where":{"entrez_id":{"inq":[5720,207]}}})
        self.assertIsNotNone(r)
        logger.debug("len(r):  {}".format(len(r)))
        logger.debug("r:  {}".format(r))
        self.assertEqual(2, len(r))

        #query for a pert_id that is expected to return no entries
        r = cao.run_filter_query("perts", {"where":{"pert_id":"BRD-K12345678"}})
        self.assertIsNotNone(r)
        logger.debug("len(r):  {}".format(len(r)))
        self.assertEqual(0, len(r))

    def test_run_query_handle_fail(self):
        """querying a resource name expected to fail raises with the client's failure message"""
        with self.assertRaises(Exception) as context:
            cao.run_filter_query("fakeresource", {})
        self.assertIsNotNone(context.exception)
        logger.debug("context.exception:  {}".format(context.exception))
        self.assertIn("ClueApiClient request failed", str(context.exception))

    def test_run_where_query(self):
        """run_count_query returns a dictionary with a "count" of 1 for cell_id A375"""
        r = cao.run_count_query("cells", {"cell_id":"A375"})
        self.assertIsNotNone(r)
        logger.debug("r:  {}".format(r))
        self.assertIn("count", r)
        self.assertEqual(1, r["count"])

    def test__check_request_response(self):
        """_check_request_response accepts status 200 and raises for 404

        Uses a namedtuple in place of a real response, so no request is sent.
        """
        FakeResponse = collections.namedtuple("FakeResponse", ["status_code", "reason"])

        #happy path
        fr = FakeResponse(200, None)
        clue_api_client.ClueApiClient._check_request_response(fr)

        #response status code that should cause failure
        fr2 = FakeResponse(404, "I don't need a good reason!")
        with self.assertRaises(Exception) as context:
            clue_api_client.ClueApiClient._check_request_response(fr2)
        logger.debug("context.exception:  {}".format(context.exception))
        #the exception message must carry both the status code and the reason
        self.assertIn(str(fr2.status_code), str(context.exception))
        self.assertIn(fr2.reason, str(context.exception))

    def test_run_post(self):
        """run_post creates a macchiato entry and returns it with an id"""
        #check that the entry isn't already there, if it is delete it
        check_result = cao.run_count_query("macchiato", {"brew_prefix":test_brew_prefix})
        if check_result["count"] == 1:
            lookup_result = cao.run_filter_query("macchiato", {"where":{"brew_prefix":test_brew_prefix}})[0]
            cao.run_delete("macchiato", lookup_result["id"])

        #happy path
        data = {"brew_prefix":test_brew_prefix, "status":test_status}
        r = cao.run_post("macchiato", data)
        self.assertIsNotNone(r)
        logger.debug("r:  {}".format(r))
        self.assertIn("brew_prefix", r)
        self.assertEqual(data["brew_prefix"], r["brew_prefix"])
        self.assertIn("id", r)
        #check that user key has not been added to entry
        self.assertNotIn("user_key", r)

        #clean up
        #NOTE: not reached if one of the assertions above fails, leaving the entry in place
        r = cao.run_delete("macchiato", r["id"])

    def test_run_delete(self):
        """run_delete returns a true value when deleting an existing macchiato entry"""
        #check that there is an entry to delete, if not create it
        lookup_result = add_entry_if_not_already_present(cao, "macchiato", {"brew_prefix":test_brew_prefix},
                                         {"brew_prefix":test_brew_prefix, "status": test_status})

        delete_id = lookup_result["id"]

        #happy path
        r = cao.run_delete("macchiato", delete_id)
        self.assertIsNotNone(r)
        logger.debug("r:  {}".format(r))
        self.assertTrue(r)

    def test_run_put(self):
        """run_put updates the status of an existing macchiato entry and returns the entry"""
        #check that there is an entry to update, if not create it
        lookup_result = add_entry_if_not_already_present(cao, "macchiato", {"brew_prefix":test_brew_prefix},
                                         {"brew_prefix":test_brew_prefix, "status": test_status})

        put_id = lookup_result["id"]

        expected_status = "test status for test_clue_api_client test_run_put"
        r = cao.run_put("macchiato", put_id, {"status":expected_status})
        self.assertIsNotNone(r)
        logger.debug("r:  {}".format(r))
        self.assertIn("status", r)
        self.assertEqual(expected_status, r["status"])
        #check that user key has not been added to entry
        self.assertNotIn("user_key", r)
        #NOTE: this test does not delete the entry it created or updated
119 | 
120 | 
def build_clue_api_client_from_default_test_config():
    """Create a ClueApiClient from the test configuration file.

    Reads config_filepath with a RawConfigParser and takes "clue_api_url" and
    "clue_api_user_key" from the config_section section.

    Returns:
        clue_api_client.ClueApiClient built with that base url and user key
    """
    parser = configparser.RawConfigParser()
    parser.read(config_filepath)

    api_url = parser.get(config_section, "clue_api_url")
    api_user_key = parser.get(config_section, "clue_api_user_key")

    return clue_api_client.ClueApiClient(base_url=api_url, user_key=api_user_key)
127 | 
128 | 
def add_entry_if_not_already_present(my_clue_api_orm, resource_name, where_query, default_data):
    """Return an entry of resource_name matching where_query, creating it if none exists.

    Args:
        my_clue_api_orm: client object providing run_count_query, run_post and run_filter_query
        resource_name: name of the API resource to look in
        where_query: where-clause used both for counting and for looking up the entry
        default_data: data posted to create the entry when the count is 0

    Returns:
        the result of run_post when nothing matched, otherwise the first element of the
        run_filter_query result
    """
    matching = my_clue_api_orm.run_count_query(resource_name, where_query)["count"]

    if matching == 0:
        return my_clue_api_orm.run_post(resource_name, default_data)

    return my_clue_api_orm.run_filter_query(resource_name, {"where":where_query})[0]
137 | 
138 | 
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # configure logging with verbose=True before any test emits debug output
    setup_logger.setup(verbose=True)

    # the test methods use the module-level name cao, so it has to be bound
    # before unittest.main() starts running them
    cao = build_clue_api_client_from_default_test_config()

    unittest.main()
145 | 


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_gene_queries.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import logging
 4 | import test_clue_api_client
 5 | import cmapPy.clue_api_client.gene_queries as gq
 6 | 
 7 | __authors__ = "David L. Lahr"
 8 | __email__ = "dlahr@broadinstitute.org"
 9 | 
10 | 
11 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
12 | 
13 | cao = None
14 | 
15 | 
class TestGeneQueries(unittest.TestCase):
    """Tests of gene_queries.are_genes_in_api, run against the client held in the module-level cao."""

    def test_are_genes_in_api(self):
        # happy path: two gene symbols expected to be found plus one entry that should not match
        candidates = ["AKT1", "BRAF", "Dave Lahr's fake cell line that never existed"]
        found = gq.are_genes_in_api(cao, candidates)
        logger.debug("r:  {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(2, len(found))
        for expected_gene in ("AKT1", "BRAF"):
            self.assertIn(expected_gene, found)

        # happy path: the genes may also be supplied as a set
        found = gq.are_genes_in_api(cao, {"AKT1"})
        logger.debug("r:  {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(1, len(found))
        self.assertIn("AKT1", found)

    def test_are_genes_in_api_no_genes_provided(self):
        # an empty set of genes must give an empty (but not None) result
        found = gq.are_genes_in_api(cao, set())
        logger.debug("r:  {}".format(found))
        self.assertIsNotNone(found)
        self.assertEqual(0, len(found))
38 | 
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # configure logging with verbose=True before any test emits debug output
    setup_logger.setup(verbose=True)

    # replaces the module-level cao = None placeholder; the tests pass cao to the
    # query functions, so this must happen before unittest.main()
    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_macchiato_queries.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import logging
 4 | import test_clue_api_client
 5 | import cmapPy.clue_api_client.macchiato_queries as mq
 6 | 
 7 | __authors__ = "David L. Lahr"
 8 | __email__ = "dlahr@broadinstitute.org"
 9 | 
10 | 
11 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
12 | 
13 | cao = None
14 | 
15 | test_brew_prefix = "test_brew_prefix_for_test_macchiato_queries"
16 | test_status = "test macchiato status for test_macchiato_queries"
17 | 
18 | 
class TestMacchiatoQueries(unittest.TestCase):
    """Tests of macchiato_queries, run against the client held in the module-level cao."""

    def setUp(self):
        # every test relies on an entry with test_brew_prefix being present
        lookup = {"brew_prefix":test_brew_prefix}
        defaults = {"brew_prefix":test_brew_prefix, "status": test_status}
        test_clue_api_client.add_entry_if_not_already_present(cao, mq.resource_name, lookup, defaults)

    def test_is_brew_prefix_in_api(self):
        # the prefix ensured by setUp must be reported as present
        self.assertTrue(mq.is_brew_prefix_in_api(cao, test_brew_prefix))

        # a prefix that should not exist must be reported as absent
        absent_prefix = "Dave Lahr's fake brew prefix that hopefully will never exist in the API"
        self.assertFalse(mq.is_brew_prefix_in_api(cao, absent_prefix))

    def test_get_api_id(self):
        api_id = mq.get_api_id(cao, test_brew_prefix)
        self.assertIsNotNone(api_id)
        logger.debug("r:  {}".format(api_id))

    def test_change_status(self):
        entry_id = mq.get_api_id(cao, test_brew_prefix)

        expected_new_status = "test status for test_macchiato_queries TestMacchiatoQueries.test_change_status"
        updated = mq.change_status(cao, entry_id, expected_new_status)

        self.assertIsNotNone(updated)
        logger.debug("r:  {}".format(updated))
        self.assertIn("status", updated)
        self.assertEqual(expected_new_status, updated["status"])

    def test_create_brew_prefix_in_api(self):
        # happy path
        expected_brew_prefix = "brew_prefix for TestMacchiatoQueries.test_create_brew_prefix_in_api"
        created = mq.create_brew_prefix_in_api(cao, expected_brew_prefix, status=test_status)

        self.assertIsNotNone(created)
        logger.debug("r:  {}".format(created))
        self.assertIn("id", created)
        self.assertIsNotNone(created["id"])

        # cleanup by deleting created entry
        cao.run_delete(mq.resource_name, created["id"])
57 | 
58 | 
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # configure logging with verbose=True before any test emits debug output
    setup_logger.setup(verbose=True)

    # replaces the module-level cao = None placeholder; setUp and the tests use cao,
    # so this must happen before unittest.main()
    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_mock_clue_api_client.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import logging
 4 | import cmapPy.clue_api_client.mock_clue_api_client as mock_clue_api_client
 5 | 
 6 | __authors__ = "David L. Lahr"
 7 | __email__ = "dlahr@broadinstitute.org"
 8 | 
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
class TestMockClueApiClient(unittest.TestCase):
    """Checks that every run_* method of MockClueApiClient hands back the configured default values."""

    def test_run(self):
        mcao = mock_clue_api_client.MockClueApiClient(default_return_values=[{"hello":"world"}])

        common_args = ("fake resource name", {"unused":"filter"})
        # run_put is the only method called with a third positional argument
        invocations = [
            (mcao.run_filter_query, common_args),
            (mcao.run_count_query, common_args),
            (mcao.run_delete, common_args),
            (mcao.run_post, common_args),
            (mcao.run_put, common_args + (None,)),
        ]

        for method, args in invocations:
            returned = method(*args)
            self.assertIsNotNone(returned)
            logger.debug("r:  {}".format(returned))
            self.assertEqual(1, len(returned))

            only_entry = returned[0]
            self.assertEqual(1, len(only_entry))
            self.assertIn("hello", only_entry)
            self.assertEqual("world", only_entry["hello"])
29 | 
30 | 
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # configure logging with verbose=True before any test emits debug output
    setup_logger.setup(verbose=True)

    unittest.main()


--------------------------------------------------------------------------------
/cmapPy/clue_api_client/tests/test_pert_queries.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import cmapPy.clue_api_client.setup_logger as setup_logger
 3 | import logging
 4 | import test_clue_api_client
 5 | import cmapPy.clue_api_client.cell_queries as pq
 6 | 
 7 | __authors__ = "David L. Lahr"
 8 | __email__ = "dlahr@broadinstitute.org"
 9 | 
10 | 
11 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
12 | 
13 | cao = None
14 | 
15 | 
class TestPertQueries(unittest.TestCase):
    """Tests of the pert-id map helpers reachable through pq, using the module-level cao client."""

    def _check_two_valid_brds(self, result):
        # shared expectations: both real BRD ids are mapped to something, the bogus id is left out
        self.assertIsNotNone(result)
        logger.debug("r:  {}".format(result))
        self.assertEqual(2, len(result))
        for brd in ("BRD-K21680192", "BRD-K88378636"):
            self.assertIn(brd, result)
            self.assertIsNotNone(result[brd])
        self.assertNotIn("not a valid BRD", result)

    def test__build_map_from_clue_api_result(self):
        api_result = [{"a": "b", "c": "d"}]
        built = pq._build_map_from_clue_api_result(api_result, "a", "c")
        self.assertIsNotNone(built)
        logger.debug("r:  {}".format(built))
        self.assertEqual(1, len(built))
        self.assertIn("b", built)
        self.assertEqual("d", built["b"])

    def test_retrieve_pert_id_pert_iname_map(self):
        pert_ids = ["BRD-K21680192", "BRD-K88378636", "not a valid BRD"]
        self._check_two_valid_brds(pq.retrieve_pert_id_pert_iname_map(pert_ids, cao))

    def test_retrieve_pert_id_pert_type_map(self):
        pert_ids = ["BRD-K21680192", "BRD-K88378636", "not a valid BRD"]
        self._check_two_valid_brds(pq.retrieve_pert_id_pert_type_map(pert_ids, cao))
46 | 
47 | 
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # configure logging with verbose=True before any test emits debug output
    setup_logger.setup(verbose=True)

    # replaces the module-level cao = None placeholder; the tests pass cao to the
    # query functions, so this must happen before unittest.main()
    cao = test_clue_api_client.build_clue_api_client_from_default_test_config()

    unittest.main()


--------------------------------------------------------------------------------
/cmapPy/example_cmapPy_config_file.cfg:
--------------------------------------------------------------------------------
1 | [prod]
2 | clue_api_url = https://api.clue.io/api
3 | clue_api_user_key = CHANGE_ME
4 | 
5 | [test]
6 | clue_api_url = https://dev-api.clue.io/api
7 | clue_api_user_key = CHANGE_ME


--------------------------------------------------------------------------------
/cmapPy/math/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/math/__init__.py


--------------------------------------------------------------------------------
/cmapPy/math/agg_wt_avg.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | agg_wt_avg.py
  3 | 
  4 | Aggregate a matrix of replicate profiles into a single signature using
  5 | a weighted average based on the correlation between replicates. That is, if
  6 | one replicate is less correlated with the other replicates, its values will
  7 | not be weighted as highly in the aggregated signature.
  8 | 
  9 | Equivalent to the 'modz' method in mortar.
 10 | '''
 11 | 
 12 | import numpy as np
 13 | 
 14 | rounding_precision = 4
 15 | 
 16 | 
 17 | def get_upper_triangle(correlation_matrix):
 18 |     ''' Extract upper triangle from a square matrix. Negative values are
 19 |     set to 0.
 20 | 
 21 |     Args:
 22 |     correlation_matrix (pandas df): Correlations between all replicates
 23 | 
 24 |     Returns:
 25 |     upper_tri_df (pandas df): Upper triangle extracted from
 26 |         correlation_matrix; rid is the row index, cid is the column index,
 27 |         corr is the extracted correlation value
 28 |     '''
 29 |     upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(np.bool))
 30 | 
 31 |     # convert matrix into long form description
 32 |     upper_tri_df = upper_triangle.stack().reset_index(level=1)
 33 |     upper_tri_df.columns = ['rid', 'corr']
 34 | 
 35 |     # Index at this point is cid, it now becomes a column
 36 |     upper_tri_df.reset_index(level=0, inplace=True)
 37 | 
 38 |     # Get rid of negative values
 39 |     upper_tri_df['corr'] = upper_tri_df['corr'].clip(lower=0)
 40 | 
 41 |     return upper_tri_df.round(rounding_precision)
 42 | 
 43 | 
 44 | def calculate_weights(correlation_matrix, min_wt):
 45 |     ''' Calculate a weight for each profile based on its correlation to other
 46 |     replicates. Negative correlations are clipped to 0, and weights are clipped
 47 |     to be min_wt at the least.
 48 | 
 49 |     Args:
 50 |     correlation_matrix (pandas df): Correlations between all replicates
 51 |     min_wt (float): Minimum raw weight when calculating weighted average
 52 | 
 53 |     Returns:
 54 |     raw weights (pandas series):  Mean correlation to other replicates
 55 |     weights (pandas series): raw_weights normalized such that they add to 1
 56 |     '''
 57 |     # fill diagonal of correlation_matrix with np.nan
 58 |     np.fill_diagonal(correlation_matrix.values, np.nan)
 59 | 
 60 |     # remove negative values
 61 |     correlation_matrix = correlation_matrix.clip(lower=0)
 62 | 
 63 |     # get average correlation for each profile (will ignore NaN)
 64 |     raw_weights = correlation_matrix.mean(axis=1)
 65 | 
 66 |     # threshold weights
 67 |     raw_weights = raw_weights.clip(lower=min_wt)
 68 | 
 69 |     # normalize raw_weights so that they add to 1
 70 |     weights = raw_weights / sum(raw_weights)
 71 | 
 72 |     return raw_weights.round(rounding_precision), weights.round(rounding_precision)
 73 | 
 74 | 
 75 | def agg_wt_avg(mat, min_wt = 0.01, corr_metric='spearman'):
 76 |     ''' Aggregate a set of replicate profiles into a single signature using
 77 |     a weighted average.
 78 | 
 79 |     Args:
 80 |     mat (pandas df): a matrix of replicate profiles, where the columns are
 81 |         samples and the rows are features; columns correspond to the
 82 |         replicates of a single perturbagen
 83 |     min_wt (float): Minimum raw weight when calculating weighted average
 84 |     corr_metric (string): Spearman or Pearson; the correlation method
 85 | 
 86 |     Returns:
 87 |     out_sig (pandas series): weighted average values
 88 |     upper_tri_df (pandas df): the correlations between each profile that went into the signature
 89 |     raw weights (pandas series): weights before normalization
 90 |     weights (pandas series): weights after normalization
 91 |     '''
 92 |     assert mat.shape[1] > 0, "mat is empty! mat: {}".format(mat)
 93 | 
 94 |     if mat.shape[1] == 1:
 95 | 
 96 |         out_sig = mat
 97 |         upper_tri_df = None
 98 |         raw_weights = None
 99 |         weights = None
100 | 
101 |     else:
102 | 
103 |         assert corr_metric in ["spearman", "pearson"]
104 | 
105 |         # Make correlation matrix column wise
106 |         corr_mat = mat.corr(method=corr_metric)
107 | 
108 |         # Save the values in the upper triangle
109 |         upper_tri_df = get_upper_triangle(corr_mat)
110 | 
111 |         # Calculate weight per replicate
112 |         raw_weights, weights = calculate_weights(corr_mat, min_wt)
113 | 
114 |         # Apply weights to values
115 |         weighted_values = mat * weights
116 |         out_sig = weighted_values.sum(axis=1)
117 | 
118 |     return out_sig, upper_tri_df, raw_weights, weights


--------------------------------------------------------------------------------
/cmapPy/math/fast_corr.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
  3 | import numpy
  4 | import cmapPy.math.fast_cov as fast_cov
  5 | import pandas
  6 | 
  7 | 
  8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
  9 | 
 10 | 
 11 | def fast_corr(x, y=None, destination=None):
 12 |     """calculate the pearson correlation matrix for the columns of x (with dimensions MxN), or optionally, the pearson correlaton matrix
 13 |     between x and y (with dimensions OxP).  If destination is provided, put the results there.  
 14 |     In the language of statistics the columns are the variables and the rows are the observations.
 15 | 
 16 |     Args:
 17 |         x (numpy array-like) MxN in shape
 18 |         y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
 19 |         destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
 20 |             memmap of a file)
 21 | 
 22 |         returns (numpy array-like) array of the covariance values
 23 |             for defaults (y=None), shape is NxN
 24 |             if y is provied, shape is NxP
 25 |     """
 26 |     if y is None:
 27 |         y = x
 28 |     
 29 |     r = fast_cov.fast_cov(x, y, destination=destination)
 30 | 
 31 |     std_x = numpy.std(x, axis=0, ddof=1)
 32 |     if numpy.isscalar(std_x):
 33 |         std_x = numpy.array((std_x,))
 34 | 
 35 |     std_y = numpy.std(y, axis=0, ddof=1)
 36 |     if numpy.isscalar(std_y):
 37 |         std_y = numpy.array((std_y,))
 38 | 
 39 |     numpy.divide(r, std_x[:, numpy.newaxis], out=r)
 40 |     numpy.divide(r, std_y[numpy.newaxis, :], out=r)
 41 | 
 42 |     return r
 43 | 
 44 | 
 45 | def calculate_moments_with_additional_mask(x, mask):
 46 |     """calculate the moments (y, y^2, and variance) of the columns of x, excluding masked within x, for each of the masking columns in mask
 47 |     Number of rows in x and mask must be the same.
 48 | 
 49 |     Args:
 50 |         x (numpy.ma.array like)
 51 |         mask (numpy array-like boolean) 
 52 |     """
 53 |     non_mask_overlaps = fast_cov.calculate_non_mask_overlaps(x.mask, mask)
 54 | 
 55 |     unmask = 1.0 * ~mask
 56 |     
 57 |     expect_x = numpy.ma.dot(x.T, unmask) / non_mask_overlaps
 58 |     expect_x = expect_x.T
 59 | 
 60 |     expect_x_squared = numpy.ma.dot(
 61 |         numpy.power(x, 2.0).T, unmask
 62 |     ) / non_mask_overlaps
 63 |     expect_x_squared = expect_x_squared.T
 64 | 
 65 |     var_x = (expect_x_squared - numpy.power(expect_x, 2.0)) * non_mask_overlaps.T / (non_mask_overlaps.T - 1)
 66 | 
 67 |     return expect_x, expect_x_squared, var_x
 68 | 
 69 | 
 70 | def nan_fast_corr(x, y=None, destination=None):
 71 |     """calculate the pearson correlation matrix (ignoring nan values) for the columns of x (with dimensions MxN), or optionally, the pearson correlaton matrix
 72 |     between x and y (with dimensions OxP).  If destination is provided, put the results there.  
 73 |     In the language of statistics the columns are the variables and the rows are the observations.
 74 | 
 75 |     Args:
 76 |         x (numpy array-like) MxN in shape
 77 |         y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
 78 |         destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
 79 |             memmap of a file)
 80 | 
 81 |         returns (numpy array-like) array of the covariance values
 82 |             for defaults (y=None), shape is NxN
 83 |             if y is provied, shape is NxP
 84 |     """
 85 |     x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
 86 | 
 87 |     if y is None:
 88 |         y_masked = x_masked
 89 |     else:
 90 |         y_masked = numpy.ma.array(y, mask=numpy.isnan(y))
 91 | 
 92 |     r = fast_cov.nan_fast_cov(x_masked, y_masked, destination=destination)
 93 | 
 94 |     # calculate the standard deviation of the columns of each matrix, given the masking from the other
 95 |     _, _, var_x = calculate_moments_with_additional_mask(x_masked, y_masked.mask)
 96 |     std_x = numpy.sqrt(var_x)
 97 | 
 98 |     _, _, var_y = calculate_moments_with_additional_mask(y_masked, x_masked.mask)
 99 |     std_y = numpy.sqrt(var_y)
100 | 
101 |     numpy.divide(r, std_x.T, out=r)
102 |     numpy.divide(r, std_y, out=r)
103 | 
104 |     return r
105 | 
106 | 
def fast_spearman(x, y=None, destination=None):
    """calculate the spearman correlation matrix for the columns of x (MxN), or optionally the spearman
    correlation matrix between the columns of x and the columns of y (OxP).  If destination is provided,
    the results are stored there.  In the language of statistics the columns are the variables and the
    rows are the observations.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
        destination (numpy array-like) optional location where to store the results as they are calculated
            (e.g. a numpy memmap of a file)

        returns:
            (numpy array-like) array of the correlation values
                for defaults (y=None), shape is NxN
                if y is provided, shape is NxP
    """
    # rank the data, then correlate the ranks with the nan-intolerant fast_corr
    return _fast_spearman(fast_corr, x, y, destination)
125 | 
126 | 
def _fast_spearman(corr_method, x, y, destination):
    """internal method for calculating spearman correlation: ranks the columns of x (and of y when
    given) and hands the ranks to corr_method, so that callers can pick a correlation that is fast
    (fast_corr) or tolerant of nan's (nan_fast_corr)
    """
    logger.debug("x.shape:  {}".format(x.shape))
    if hasattr(y, "shape"):
        logger.debug("y.shape:  {}".format(y.shape))

    def _rank_columns(values):
        # average ranks per column; nan entries stay nan
        return pandas.DataFrame(values).rank(method="average", na_option="keep").values

    x_ranks = _rank_columns(x)
    logger.debug("some min and max ranks of x_ranks:\n{}\n{}".format(numpy.min(x_ranks[:10], axis=0), numpy.max(x_ranks[:10], axis=0)))

    y_ranks = None if y is None else _rank_columns(y)

    return corr_method(x_ranks, y_ranks, destination=destination)
141 | 
142 | 
def nan_fast_spearman(x, y=None, destination=None):
    """calculate the spearman correlation matrix (ignoring nan values) for the columns of x (MxN), or
    optionally the spearman correlation matrix between the columns of x and the columns of y (OxP).
    If destination is provided, the results are stored there.  In the language of statistics the
    columns are the variables and the rows are the observations.
    Note that the ranks will be slightly miscalculated in the masked situations leading to slight
    errors in the spearman rho value.

    Args:
        x (numpy array-like) MxN in shape
        y (optional, numpy array-like) OxP in shape.  M (# rows in x) must equal O (# rows in y)
        destination (numpy array-like) optional location where to store the results as they are calculated
            (e.g. a numpy memmap of a file)

        returns:
            (numpy array-like) array of the correlation values
                for defaults (y=None), shape is NxN
                if y is provided, shape is NxP
    """
    # rank the data, then correlate the ranks with the nan-tolerant nan_fast_corr
    return _fast_spearman(nan_fast_corr, x, y, destination)
162 | 


--------------------------------------------------------------------------------
/cmapPy/math/fast_cov.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
  3 | import numpy
  4 | 
  5 | 
  6 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
  7 | 
  8 | 
  9 | def _fast_dot_divide(x, y, destination):
 10 |     """helper method for use within the _fast_cov method - carry out the dot product and subsequent 
 11 |     division to generate the covariance values.  For use when there are no missing values.
 12 |     """
 13 |     numpy.dot(x.T, y, out=destination)
 14 |     numpy.divide(destination, (x.shape[0] - 1), out=destination)
 15 | 
 16 | 
 17 | def calculate_non_mask_overlaps(x_mask, y_mask):
 18 |     """for two mask arrays (x_mask, y_mask - boolean arrays) determine the number of entries in common there would be for each 
 19 |     entry if their dot product were taken
 20 |     """
 21 |     x_is_not_nan = 1 * ~x_mask
 22 |     y_is_not_nan = 1 * ~y_mask
 23 | 
 24 |     r = numpy.dot(x_is_not_nan.T, y_is_not_nan)
 25 |     return r
 26 | 
 27 | 
 28 | def _nan_dot_divide(x, y, destination):
 29 |     """helper method for use within the _fast_cov method - carry out the dot product and subsequent
 30 |     division to generate the covariance values.  For use when there are missing values.
 31 |     """
 32 |     numpy.ma.dot(x.T, y, out=destination)
 33 | 
 34 |     divisor = calculate_non_mask_overlaps(x.mask, y.mask) - 1
 35 | 
 36 |     numpy.ma.divide(destination, divisor, out=destination)
 37 | 
 38 | 
 39 | def fast_cov(x, y=None, destination=None):
 40 |     """calculate the covariance matrix for the columns of x (MxN), or optionally, the covariance matrix between the
 41 |     columns of x and and the columns of y (MxP).  (In the language of statistics, the columns are variables, the rows
 42 |     are observations).
 43 | 
 44 |     Args:
 45 |         x (numpy array-like) MxN in shape
 46 |         y (numpy array-like) MxP in shape
 47 |         destination (numpy array-like) optional location where to store the results as they are calculated (e.g. a numpy
 48 |             memmap of a file)
 49 | 
 50 |         returns (numpy array-like) array of the covariance values
 51 |             for defaults (y=None), shape is NxN
 52 |             if y is provided, shape is NxP
 53 |     """
 54 |     r = _fast_cov(numpy.mean, _fast_dot_divide, x, y, destination)
 55 | 
 56 |     return r
 57 | 
 58 | 
 59 | def _fast_cov(mean_method, dot_divide_method, x, y, destination):
 60 |     validate_inputs(x, y, destination)
 61 | 
 62 |     new_x = x if len(x.shape) == 2 else x[:, numpy.newaxis]
 63 | 
 64 |     if y is None:
 65 |         y = new_x
 66 |     new_y = y if len(y.shape) == 2 else y[:, numpy.newaxis]
 67 | 
 68 |     if destination is None:
 69 |         destination = numpy.zeros((new_x.shape[1], new_y.shape[1]))
 70 | 
 71 |     mean_x = mean_method(new_x, axis=0)
 72 |     mean_y = mean_method(new_y, axis=0)
 73 | 
 74 |     mean_centered_x = (new_x - mean_x).astype(destination.dtype)
 75 |     mean_centered_y = (new_y - mean_y).astype(destination.dtype)
 76 |     
 77 |     dot_divide_method(mean_centered_x, mean_centered_y, destination)
 78 | 
 79 |     return destination
 80 | 
 81 | 
 82 | def validate_inputs(x, y, destination):
 83 |     error_msg = ""
 84 | 
 85 |     if not hasattr(x, "shape"):
 86 |         error_msg += "x needs to be numpy array-like but it does not have \"shape\" attribute - type(x):  {}\n".format(type(x))
 87 |     
 88 |     if destination is not None and not hasattr(destination, "shape"):
 89 |         error_msg += "destination needs to be numpy array-like but it does not have \"shape\" attribute - type(destination):  {}\n".format(type(destination))
 90 | 
 91 |     if y is None:
 92 |         if destination is not None:
 93 |             expected_dim = x.shape[1] if len(x.shape) == 2 else 1
 94 |             expected_shape = (expected_dim, expected_dim)
 95 |             if destination.shape != expected_shape:
 96 |                 error_msg += "x and destination provided, therefore destination must have shape matching number of columns of x but it does not - x.shape:  {}  expected_shape:  {}  destination.shape:  {}\n".format(
 97 |                     x.shape, expected_shape, destination.shape)
 98 |     else:
 99 |         if not hasattr(y, "shape"):
100 |             error_msg += "y needs to be numpy array-like but it does not have \"shape\" attribute - type(y):  {}\n".format(type(y))
101 |         elif x.shape[0] != y.shape[0]:
102 |             error_msg += "the number of rows in the x and y matrices must be the same - x.shape:  {}  y.shape:  {}\n".format(x.shape, y.shape)
103 |         elif destination is not None:
104 |             expected_rows = x.shape[1] if len(x.shape) == 2 else 1
105 |             expected_cols = y.shape[1] if len(y.shape) == 2 else 1
106 |             expected_shape = (expected_rows, expected_cols)
107 |             if destination.shape != expected_shape:
108 |                 error_msg += "x, y, and destination provided, therefore destination must have number of rows matching number of columns of x and destination needs to have number of columns matching number of columns of y - x.shape:  {}  y.shape:  {}  expected_shape:  {}  destination.shape:  {}\n".format(
109 |                     x.shape, y.shape, expected_shape, destination.shape)
110 | 
111 |     if error_msg != "":
112 |         raise CmapPyMathFastCovInvalidInputXY(error_msg)
113 | 
114 | 
def nan_fast_cov(x, y=None, destination=None):
    """calculate the covariance matrix (ignoring nan values) for the columns of x (MxN), or optionally
    the covariance matrix between the columns of x and the columns of y (MxP).  (In the language of
    statistics, the columns are variables, the rows are observations).

    Args:
        x (numpy array-like) MxN in shape
        y (numpy array-like) MxP in shape
        destination (numpy masked array-like) optional location where to store the results as they are
            calculated (e.g. a numpy memmap of a file)

        returns (numpy array-like) array of the covariance values
            for defaults (y=None), shape is NxN
            if y is provided, shape is NxP
            infinite values are replaced by nan; when no destination was given, masked entries are
            filled with nan as well
    """
    x_masked = numpy.ma.array(x, mask=numpy.isnan(x))
    y_masked = x_masked if y is None else numpy.ma.array(y, mask=numpy.isnan(y))

    dest_was_None = destination is None
    if dest_was_None:
        def _column_count(arr):
            # an input that is not 2-dimensional counts as a single column
            return arr.shape[1] if len(arr.shape) == 2 else 1

        destination = numpy.ma.zeros((_column_count(x_masked), _column_count(y_masked)))

    cov = _fast_cov(numpy.nanmean, _nan_dot_divide, x_masked, y_masked, destination)

    cov[numpy.isinf(cov)] = numpy.nan

    # only an internally allocated result is converted to a plain filled array;
    # a caller-supplied destination is returned as it is
    if dest_was_None:
        cov = numpy.ma.filled(cov, fill_value=numpy.nan)

    return cov
151 | 
152 | 
class CmapPyMathFastCovInvalidInputXY(Exception):
    """Raised when the x / y / destination inputs to the covariance functions fail validation."""
155 | 


--------------------------------------------------------------------------------
/cmapPy/math/robust_zscore.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | robust_zscore.py
 3 | 
 4 | Robustly z-scores a pandas df along the rows (i.e. the z-score is made relative
 5 | to a row). A robust z-score means that median is used instead of mean and
 6 | median absolute deviation (MAD) instead of standard deviation in the
 7 | standard z-score calculation:
 8 | 
 9 | z = (x - u) / s
10 | 
11 | x: input value
12 | u: median
13 | s: MAD
14 | 
15 | Optionally, the median and MAD can be computed from a control df, instead of the
16 | input df. This functionality is useful for "vehicle-control"; that is, if
17 | the control df consists only of negative control samples, the median and MAD
18 | can be computed using just those samples but applied to the input df.
19 | '''
20 | 
rounding_precision = 4


def robust_zscore(mat, ctrl_mat=None, min_mad=0.1):
    ''' Robustly z-score a pandas df along the rows.

    Each row of mat is centered on a row median and scaled by a row MAD
    (median absolute deviation); the result is rounded to
    rounding_precision decimal places.

    Args:
    mat (pandas df): Matrix of data that z-scoring will be applied to
    ctrl_mat (pandas df): Optional matrix from which to compute medians and MADs
        (e.g. vehicle control); when None, mat itself is used
    min_mad (float): Minimum MAD to threshold to; tiny MAD values will cause
        z-scores to blow up

    Returns:
    zscore_df (pandas_df): z-scored data
    '''
    # Medians and MADs come from the control matrix when one is given,
    # otherwise from the input matrix itself
    reference = mat if ctrl_mat is None else ctrl_mat

    row_medians = reference.median(axis=1)
    abs_deviations = reference.subtract(row_medians, axis=0).abs()

    # Threshold the MADs from below so near-constant rows do not explode
    row_mads = abs_deviations.median(axis=1).clip(lower=min_mad)

    centered = mat.subtract(row_medians, axis='index')

    # Must multiply values by 1.4826 to make MAD comparable to SD
    # (https://en.wikipedia.org/wiki/Median_absolute_deviation)
    scaled = centered.divide(row_mads * 1.4826, axis='index')

    return scaled.round(rounding_precision)


--------------------------------------------------------------------------------
/cmapPy/math/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/math/tests/__init__.py


--------------------------------------------------------------------------------
/cmapPy/math/tests/test_agg_wt_avg.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.math.agg_wt_avg as agg_wt_avg
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | 
 9 | test_mat = pd.DataFrame({'A':[1,2,3], 'B': [2,8,6], 'C': [6,8,9]})
10 | test_mat_corr = test_mat.corr()
11 | 
12 | 
class TestAggWtAvg(unittest.TestCase):
    """Unit tests for the functions in cmapPy.math.agg_wt_avg."""

    def test_calculate_weights(self):
        # happy path
        raw_weights, weights = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.1)
        # Compare the length itself; taking len() of the elementwise
        # comparison `weights == 3` would pass for any non-empty result.
        self.assertEqual(len(weights), 3)
        self.assertEqual(raw_weights.tolist(), [0.8183, 0.7202, 0.8838])
        self.assertEqual(weights.tolist(), [0.3378, 0.2973, 0.3649])

        # test that min_wt works
        raw_weights2, weights2 = agg_wt_avg.calculate_weights(test_mat_corr, min_wt=0.85)
        self.assertEqual(raw_weights2[1], 0.85)

    def test_get_upper_triangle(self):
        # happy path
        upper_tri_df = agg_wt_avg.get_upper_triangle(test_mat_corr)
        self.assertEqual(upper_tri_df['corr'].tolist(), [0.6547, 0.982, 0.7857])
        self.assertEqual(upper_tri_df['rid'].tolist(), ['B', 'C', 'C'])
        self.assertEqual(upper_tri_df['index'].tolist(), ['A', 'A', 'B'])

    def test_agg_wt_avg(self):
        # use spearman
        out_sig, upper_tri_df, raw_weights, weights = agg_wt_avg.agg_wt_avg(test_mat)
        self.assertEqual(out_sig.tolist(), [3.125, 5.75, 6.0])
        self.assertAlmostEqual(upper_tri_df.loc[upper_tri_df.index[0], "corr"], 0.5)
        self.assertAlmostEqual(raw_weights[0], 0.75)
        self.assertAlmostEqual(weights[0], 0.375)

        # test on a single signature
        out_sig2, _, _, _ = agg_wt_avg.agg_wt_avg(test_mat[["C"]])
        # pd.testing is the public home of assert_frame_equal;
        # pd.util.testing is deprecated and absent from pandas >= 2.0
        pd.testing.assert_frame_equal(out_sig2, test_mat[["C"]])

        # should break if empty input
        with self.assertRaises(AssertionError) as e:
            agg_wt_avg.agg_wt_avg(test_mat[[]])
        self.assertIn("mat is empty!", str(e.exception))
48 | 
49 | if __name__ == "__main__":
50 |     setup_logger.setup(verbose=True)
51 |     unittest.main()
52 | 
53 | 


--------------------------------------------------------------------------------
/cmapPy/math/tests/test_robust_zscore.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.math.robust_zscore as robust_zscore
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | 
 9 | test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9], 'D': [5,2,1]})
10 | test_ctl_mat = pd.DataFrame({'E':[8,8,6], 'F': [7,6,6]})
11 | test_ctl_mat2 = pd.DataFrame({'E':[8,8,6], 'F': [8,6,6]})
12 | 
13 | 
class TestRobustZscore(unittest.TestCase):
    """Unit tests for cmapPy.math.robust_zscore.robust_zscore."""

    def test_zscore_pc(self):
        # "plate control": medians and MADs come from the input matrix itself
        pc_zscores = robust_zscore.robust_zscore(test_mat)
        self.assertEqual(pc_zscores.shape, (3, 4))

        # pd.testing is the public home of assert_frame_equal;
        # pd.util.testing is deprecated and absent from pandas >= 2.0
        pd.testing.assert_frame_equal(pc_zscores, pd.DataFrame(
            {'A': [-0.3372, -0.6745, -0.4047],
             'B': [-1.6862, 2.0235, 0.4047],
             'C': [1.0117, 0.6745, 1.2141],
             'D': [0.3372, -0.6745, -0.9443]}))

    def test_zscore_vc(self):
        # "vehicle control": medians and MADs come from a separate control matrix
        vc_zscores = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat)
        self.assertEqual(vc_zscores.shape, (3, 4))
        pd.testing.assert_frame_equal(vc_zscores, pd.DataFrame(
            {'A': [-4.7214, -3.3725, -20.2347],
             'B': [-7.4194, 0.6745, 0.0],
             'C': [-2.0235, -1.349, 20.2347],
             'D': [-3.3725, -3.3725, -33.7245]}))

        # check that min_mad works
        vc_zscores2 = robust_zscore.robust_zscore(test_mat, ctrl_mat=test_ctl_mat2)
        self.assertEqual(vc_zscores2.iloc[0, 0], -26.9796)
        self.assertEqual(vc_zscores2.iloc[1, 1], 0.6745)
38 | 
39 | if __name__ == "__main__":
40 |     setup_logger.setup(verbose=True)
41 |     unittest.main()
42 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/README.rst:
--------------------------------------------------------------------------------
 1 | pandasGEXpress library
 2 | ======================
 3 | 
 4 | This is a package of Python scripts that enable reading, writing, and
 5 | basic modifications (subsetting, concatenation) of .gct and .gctx files.
 6 | 
 7 | Installation instructions and documentation can be found  `on the package's ReadTheDocs page <https://clue.io/cmapPy/index.html>`_. 
 8 | 
 9 | Questions/issues
10 | ======================
11 | 
12 | Please add an issue to the cmapPy repository. We would appreciate it if your issue included sample code/files (as appropriate) so that we can reproduce your bug/issue.
13 | 
14 | Contributing
15 | ======================
16 | 
17 | We welcome contributors! For your pull requests, please include the following:
18 | 
19 | * Sample code/file that reproducibly causes the bug/issue
20 | * Documented code providing fix
21 | * Unit tests evaluating added/modified methods. 
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/__init__.py


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/concat_gctoo.py:
--------------------------------------------------------------------------------
# This module is a stub: executing it (including importing it) always raises,
# pointing users at its replacement.
msg = "concat_gctoo.py is deprecated. Please use concat.py instead."
raise DeprecationWarning(msg)


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/diff_gctoo.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | diff_gctoo.py
 3 | 
 4 | Converts a matrix of values (e.g. gene expression, viability, etc.) into a
 5 | matrix of differential values. Values can be made differential relative to all
 6 | samples in the dataset ("plate-control") or relative to just negative control
 7 | samples ("vehicle-control"). The method of computing the differential can be
 8 | either a robust z-score ("robust_z") or simply median normalization
 9 | ("median_norm").
10 | 
11 | '''
12 | import cmapPy.math.robust_zscore as robust_zscore
13 | import cmapPy.pandasGEXpress.GCToo as GCToo
14 | 
15 | possible_diff_methods = ["robust_z", "median_norm"]
16 | 
17 | 
def diff_gctoo(gctoo, plate_control=True, group_field='pert_type', group_val='ctl_vehicle',
               diff_method="robust_z", upper_diff_thresh=10, lower_diff_thresh=-10):
    ''' Converts a matrix of values (e.g. gene expression, viability, etc.)
    into a matrix of differential values.

    Args:
    gctoo (GCToo object): GCToo whose data_df is made differential; its
        col_metadata_df is consulted only for vehicle control
    plate_control (bool): True means calculate diff_gctoo using plate control.
        False means vehicle control.
    group_field (string): Metadata field in which to find group_val
    group_val (string): Value in group_field that indicates use in vehicle control
    diff_method (string): Method of computing differential data; currently only
        support either "robust_z" or "median_norm"
    upper_diff_thresh (float): Maximum value for diff data
    lower_diff_thresh (float): Minimum value for diff data

    Returns:
    out_gctoo (GCToo object): GCToo with differential data values; row and
        column metadata are taken unchanged from the input

    Raises:
    AssertionError: if diff_method is not supported, or (vehicle control only)
        if group_field is not a column-metadata field or no sample has
        group_val in that field
    '''
    assert diff_method in possible_diff_methods, (
        "possible_diff_methods: {}, diff_method: {}".format(
            possible_diff_methods, diff_method))

    # Compute median and MAD using all samples in the dataset
    if plate_control:

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df)

        elif diff_method == "median_norm":
            medians = gctoo.data_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Compute median and MAD from negative controls, rather than all samples
    else:

        assert group_field in gctoo.col_metadata_df.columns.values, (
            "group_field {} not present in column metadata. " +
            "gctoo.col_metadata_df.columns.values: {}").format(
            group_field, gctoo.col_metadata_df.columns.values)

        is_neg_ctl = gctoo.col_metadata_df[group_field] == group_val
        assert is_neg_ctl.any(), (
            "group_val {} not present in the {} column.").format(
            group_val, group_field)

        # Find negative control samples
        neg_ctl_samples = gctoo.col_metadata_df.index[is_neg_ctl]
        neg_ctl_df = gctoo.data_df[neg_ctl_samples]

        # Compute differential data
        if diff_method == "robust_z":
            diff_data = robust_zscore.robust_zscore(gctoo.data_df, neg_ctl_df)

        elif diff_method == "median_norm":
            # Medians must come from the negative controls; taking them from
            # all samples would make vehicle control identical to plate control
            medians = neg_ctl_df.median(axis=1)
            diff_data = gctoo.data_df.subtract(medians, axis='index')

    # Threshold differential data before returning
    diff_data = diff_data.clip(lower=lower_diff_thresh, upper=upper_diff_thresh)

    # Construct output GCToo object
    out_gctoo = GCToo.GCToo(data_df=diff_data,
                            row_metadata_df=gctoo.row_metadata_df,
                            col_metadata_df=gctoo.col_metadata_df)

    return out_gctoo
85 | 
86 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/gct2gctx.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Command-line script to convert a .gct file to .gctx. 
 3 | 
 4 | Main method takes in a .gct file path (and, optionally, an 
 5 | 	out path and/or name to which to save the equivalent .gctx)
 6 | 	and saves the enclosed content to a .gctx file. 
 7 | 
 8 | Note: Only supports v1.3 .gct files. 
 9 | """
10 | import sys
11 | import logging
12 | import argparse
13 | import os.path
14 | import pandas as pd
15 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
16 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
17 | import cmapPy.pandasGEXpress.write_gctx as write_gctx
18 | 
19 | __author__ = "Oana Enache"
20 | __email__ = "oana@broadinstitute.org"
21 | 
22 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
23 | 
24 | 
def build_parser():
    """Build the argparse parser for the .gct -> .gctx command-line tool.

    The module docstring is used as the program description.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # required
    cli.add_argument(
        "-filename", "-f",
        required=True,
        help=".gct file that you would like to convert to .gctx")

    # optional
    cli.add_argument(
        "-output_filepath", "-o",
        default=None,
        help=("out path/name for output gctx file. "
              "Default is just to modify the extension"))
    cli.add_argument(
        "-verbose", "-v",
        action="store_true",
        default=False,
        help="Whether to print a bunch of output.")
    cli.add_argument("-row_annot_path", help="Path to annotations file for rows")
    cli.add_argument("-col_annot_path", help="Path to annotations file for columns")

    return cli
40 | 
41 | 
def main():
    """Command-line entry point: parse sys.argv, configure logging, run the conversion."""
    cli_args = sys.argv[1:]
    args = build_parser().parse_args(cli_args)

    setup_logger.setup(verbose=args.verbose)

    gct2gctx_main(args)
46 | 
47 | 
def gct2gctx_main(args):
    """ Convert the .gct file named in args to .gctx.

    Separate from main() in order to make command-line tool.

    Reads: args.filename, args.output_filepath, args.row_annot_path,
    args.col_annot_path. When no output path is given, the output name is the
    input file's base name with its extension replaced by ".gctx".

    Raises AssertionError if an annotations file is supplied but does not
    cover every row (or column) id of the data matrix.
    """
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        stem, _ = os.path.splitext(os.path.basename(args.filename))
        out_name = stem + ".gctx"

    # If annotations are supplied, parse table and set metadata_df
    if args.row_annot_path is not None:
        row_metadata = pd.read_csv(args.row_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(in_gctoo.data_df.index.isin(row_metadata.index)), \
            "Row ids in matrix missing from annotations file"
        # keep only the annotation rows that correspond to rows of the matrix
        keep_rows = row_metadata.index.isin(in_gctoo.data_df.index)
        in_gctoo.row_metadata_df = row_metadata.loc[keep_rows]

    if args.col_annot_path is not None:
        col_metadata = pd.read_csv(args.col_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(in_gctoo.data_df.columns.isin(col_metadata.index)), \
            "Column ids in matrix missing from annotations file"
        # keep only the annotation rows that correspond to columns of the matrix
        keep_cols = col_metadata.index.isin(in_gctoo.data_df.columns)
        in_gctoo.col_metadata_df = col_metadata.loc[keep_cols]

    write_gctx.write(in_gctoo, out_name)
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     main()
81 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/gctx2gct.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Command-line script to convert a .gctx file to .gct. 
 3 | 
 4 | Main method takes in a .gctx file path (and, optionally, an 
 5 | 	out path and/or name to which to save the equivalent .gct)
 6 | 	and saves the enclosed content to a .gct file. 
 7 | 
 8 | Note: Only supports v1.0 .gctx files. 
 9 | """
10 | import sys
11 | import logging
12 | import argparse
13 | import os.path
14 | import pandas as pd
15 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
16 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
17 | import cmapPy.pandasGEXpress.write_gct as write_gct
18 | 
19 | __author__ = "Oana Enache"
20 | __email__ = "oana@broadinstitute.org"
21 | 
22 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
23 | 
24 | 
def build_parser():
    """Build the argparse parser for the .gctx -> .gct command-line tool.

    The module docstring is used as the program description.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # required
    cli.add_argument(
        "-filename", "-f",
        required=True,
        help=".gctx file that you would like to converted to .gct")

    # optional
    cli.add_argument(
        "-output_filepath", "-o",
        default=None,
        help=("out path/name for output gct file. "
              "Default is just to modify the extension"))
    cli.add_argument(
        "-verbose", "-v",
        action="store_true",
        default=False,
        help="Whether to print a bunch of output.")
    cli.add_argument("-row_annot_path", help="Path to annotations file for rows")
    cli.add_argument("-col_annot_path", help="Path to annotations file for columns")

    return cli
40 | 
41 | 
def main():
    """Command-line entry point: parse sys.argv, configure logging, run the conversion."""
    cli_args = sys.argv[1:]
    args = build_parser().parse_args(cli_args)

    setup_logger.setup(verbose=args.verbose)

    gctx2gct_main(args)
46 | 
47 | 
def gctx2gct_main(args):
    """ Convert the .gctx file named in args to .gct.

    Separate from main() in order to make command-line tool.

    Reads: args.filename, args.output_filepath, args.row_annot_path,
    args.col_annot_path. When no output path is given, the output name is the
    input file's base name with its extension replaced by ".gct".

    Raises AssertionError if an annotations file is supplied but does not
    cover every row (or column) id of the data matrix.
    """
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        stem, _ = os.path.splitext(os.path.basename(args.filename))
        out_name = stem + ".gct"

    # If annotations are supplied, parse table and set metadata_df
    if args.row_annot_path is not None:
        row_metadata = pd.read_csv(args.row_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(in_gctoo.data_df.index.isin(row_metadata.index)), \
            "Row ids in matrix missing from annotations file"
        # keep only the annotation rows that correspond to rows of the matrix
        keep_rows = row_metadata.index.isin(in_gctoo.data_df.index)
        in_gctoo.row_metadata_df = row_metadata.loc[keep_rows]

    if args.col_annot_path is not None:
        col_metadata = pd.read_csv(args.col_annot_path, sep='\t', index_col=0, header=0, low_memory=False)
        assert all(in_gctoo.data_df.columns.isin(col_metadata.index)), \
            "Column ids in matrix missing from annotations file"
        # keep only the annotation rows that correspond to columns of the matrix
        keep_cols = col_metadata.index.isin(in_gctoo.data_df.columns)
        in_gctoo.col_metadata_df = col_metadata.loc[keep_cols]

    write_gct.write(in_gctoo, out_name)
77 | 
78 | 
79 | if __name__ == "__main__":
80 |     main()
81 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/mini_gctoo_for_testing.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Creates a small GCToo instance (with representative examples of typically found fields); can use for testing.
 3 | 
 4 | ex:
 5 |     import mini_gctoo_for_testing
 6 |     my_mini_gctoo = mini_gctoo_for_testing.make()
 7 | """
 8 | import logging
 9 | import pandas
10 | import numpy
11 | import cmapPy.pandasGEXpress.GCToo as GCToo
12 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
13 | 
14 | __author__ = 'Oana Enache'
15 | __email__ = 'oana@broadinstitute.org'
16 | 
17 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
18 | 
19 | 
def make(convert_neg_666=True):
    """
    Creates a small GCToo instance (with representative examples of typically found fields); can use for testing.

    Input:
        - convert_neg_666 (bool): if True, -666 / "-666" metadata values are replaced with numpy.nan
            (and metadata columns that become entirely null are cast to float); if False, numeric
            -666 values are replaced with the string "-666" instead.

    Output:
        - mini_gctoo (GCToo): 6x6 float32 data matrix whose row and column metadata hold the same values.
    """
    # metadata examples; should be one of each type reasonable to find
    ids = ["LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33", "MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33",
           "LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10", "LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10",
           "LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666", "LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10"]

    meta_values = {
        "id": ids,
        "count_cv": ["14|15|14", "13|14|13",
                     "13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13",
                     "13", "13", "14"],
        "distil_ss": [9.822065353, 6.8915205, 1.35840559, 5.548898697, 3.355231762, 4.837643147],
        "zmad_ref": ["population", "population", "population", "population", "population", "population"],
        "distil_nsample": [3, 3, 66, 2, 9, 111111],
        "mfc_plate_id": ["-666", "-666", "-666", "-666", "-666", "-666"],
    }

    # build metadata dataframe (column order is fixed explicitly)
    row_meta = pandas.DataFrame(
        meta_values,
        columns=['id', 'count_cv', 'distil_nsample', 'distil_ss', 'mfc_plate_id', 'zmad_ref'])

    if convert_neg_666:
        row_meta = row_meta.replace([-666, "-666", -666.0], [numpy.nan, numpy.nan, numpy.nan])
        # if all values in a column are nan, convert the dtype of that column to float
        all_nan_columns = (row_meta.isnull().sum() == numpy.array(row_meta.shape[0])).to_numpy().nonzero()[0]
        row_meta = row_meta.astype({d: 'float' for d in row_meta.columns[all_nan_columns.tolist()]})
    else:
        row_meta = row_meta.replace([-666, -666.0], ["-666", "-666"])

    # for now (at least) col and row metadata are the same
    col_meta = row_meta.copy()

    # data example values, one inner list per row
    data_rows = [
        [1, 2, 3, 4, 5, 6],
        [4.3, 4.5, 4.3, 4.3, 4.3, 4.3],
        [7, 8, 9, 0, 1.23476, 9.758320],
        [0.11, 3.3456356, 2.345667, 9.822065353, 4.78865099, 4.7886],
        [-0.11, -3.3456356, -2.345667, -9.822065353, -4.78865099, -4.7886],
        [1, -2, 3, -4, 5, -6],
    ]

    # build data dataframe; rows and columns share the same ids
    data_df = pandas.DataFrame(data_rows, dtype=numpy.float32)
    data_df.index = ids
    data_df.columns = ids

    # index each metadata frame by id and label its axes
    row_meta.set_index("id", inplace=True, drop=True)
    row_meta.index.name = "rid"
    row_meta.columns.name = "rhd"

    col_meta.set_index("id", inplace=True, drop=True)
    col_meta.index.name = "cid"
    col_meta.columns.name = "chd"

    data_df.index.name = "rid"
    data_df.columns.name = "cid"

    # instantiate & assign attributes of GCToo instance
    logger.debug("Making mini_gctoo instance...")
    return GCToo.GCToo(data_df=data_df, row_metadata_df=row_meta,
                       col_metadata_df=col_meta, src="mini_gctoo.gctx", version="GCTX1.0")
94 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/parse.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Generic parse method to parse either a .gct or a .gctx. 
 3 | 
 4 | Takes in a file path corresponding to either a .gct or .gctx, 
 5 |     and parses to a GCToo instance accordingly.
 6 | 
 7 | Note: Supports GCT1.2, GCT1.3, and GCTX1.0 files. 
 8 | """
 9 | import logging
10 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
11 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
12 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
13 | 
14 | import numpy
15 | 
16 | __author__ = "Oana Enache"
17 | __email__ = "oana@broadinstitute.org"
18 | 
19 | # instantiate logger
20 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
21 | 
22 | 
def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=None,
          row_meta_only=False, col_meta_only=False, make_multiindex=False,
          gct_data_type=numpy.float32):
    """
    Dispatches to the .gctx or .gct parser based on the extension of file_path.
    A path ending in ".gctx" goes to parse_gctx; anything else (".gct" or an
    unknown extension) is attempted as gct via parse_gct.

    Input:
        Mandatory:
        - file_path (str): full path to gct(x) file you want to parse.

        Optional:
        - convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
            (see Note below for more details on this). Default = True.
        - rid (list of strings): list of row ids to specifically keep. Default=None.
        - cid (list of strings): list of col ids to specifically keep. Default=None.
        - ridx (list of integers): only read the rows corresponding to this
            list of integer ids. Default=None.
        - cidx (list of integers): only read the columns corresponding to this
            list of integer ids. Default=None.
        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
            as pandas DataFrame
        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
            as pandas DataFrame
        - make_multiindex (bool): whether to create a multi-index df combining
            the 3 component dfs
        - gct_data_type (numpy datatype): if loading a gct file, what data type the matrix should be
            converted into i.e. default is numpy float32. Not passed along for gctx files.

    Output:
        - out (GCToo object or pandas df): if row_meta_only or col_meta_only, then
            out is a metadata df; otherwise, it's a GCToo instance containing
            content of parsed gct(x) file

    Note: why does convert_neg_666 exist?
        - In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
        for metadata. However (so that users can take full advantage of pandas' methods,
        including those for filtering nan's etc) we provide the option of converting these
        into numpy.NaN values, the pandas default.
    """
    # options accepted by both parsers
    shared_options = dict(convert_neg_666=convert_neg_666,
                          rid=rid, cid=cid, ridx=ridx, cidx=cidx,
                          row_meta_only=row_meta_only, col_meta_only=col_meta_only,
                          make_multiindex=make_multiindex)

    if file_path.endswith(".gctx"):
        return parse_gctx.parse(file_path, **shared_options)

    if file_path.endswith(".gct"):
        logger.info("parsing gct file")
    else:
        logger.info("parsing file of unknown extension, assuming/trying gct format")

    return parse_gct.parse(file_path, data_type=gct_data_type, **shared_options)
81 | 
82 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/random_slice.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Slices a random subset of a GCToo instance of a user-specified size. 
 3 | """
 4 | import logging
 5 | import numpy
 6 | import cmapPy.pandasGEXpress.GCToo as GCToo
 7 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 8 | 
 9 | __author__ = "Oana Enache"
10 | __email__ = "oana@broadinstitute.org"
11 | 
12 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
13 | 
14 | 
def make_specified_size_gctoo(og_gctoo, num_entries, dim):
    """
    Randomly subsets a GCToo instance along either rows or columns to obtain a specified size.
    The ids along the chosen dimension are shuffled with numpy.random.shuffle and the
    first num_entries of them are kept; the other dimension is left whole.

    Input:
        - og_gctoo (GCToo): a GCToo instance
        - num_entries (int): the number of entries to keep
        - dim (str): the dimension along which to subset. Must be "row" or "col"

    Output:
        - new_gctoo (GCToo): the GCToo instance subsetted as specified.
    """
    assert dim in ["row", "col"], "dim specified must be either 'row' or 'col'"

    dim_index = 0 if "row" == dim else 1
    assert num_entries <= og_gctoo.data_df.shape[dim_index], ("number of entries must be smaller than dimension being "
            "subsetted - num_entries:  {}  dim:  {}  dim_index:  {}  og_gctoo.data_df.shape[dim_index]:  {}".format(
        num_entries, dim, dim_index, og_gctoo.data_df.shape[dim_index]))

    source_df = og_gctoo.data_df
    if dim == "col":
        shuffled_ids = list(source_df.columns.values)
        numpy.random.shuffle(shuffled_ids)
        columns = shuffled_ids[:num_entries]
        rows = source_df.index.values
    else:
        shuffled_ids = list(source_df.index.values)
        numpy.random.shuffle(shuffled_ids)
        rows = shuffled_ids[:num_entries]
        columns = source_df.columns.values

    # slice the data and both metadata frames with the same ids
    new_data_df = source_df.loc[rows, columns]
    new_row_meta = og_gctoo.row_metadata_df.loc[rows]
    new_col_meta = og_gctoo.col_metadata_df.loc[columns]

    logger.debug(
        "after slice - new_col_meta.shape: {}  new_row_meta.shape:  {}".format(new_col_meta.shape, new_row_meta.shape))

    # make & return new gctoo instance
    return GCToo.GCToo(data_df=new_data_df, row_metadata_df=new_row_meta, col_metadata_df=new_col_meta)
56 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/setup_GCToo_logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import logging.handlers
 3 | 
 4 | 
 5 | __author__ = "David Lahr"
 6 | __email__ = "dlahr@broadinstitute.org"
 7 | 
 8 | LOGGER_NAME = "cmap_logger"
 9 | 
10 | _LOG_FORMAT = "%(levelname)s %(asctime)s %(module)s %(funcName)s %(message)s"
11 | _LOG_FILE_MAX_BYTES = 10000000
12 | _LOG_FILE_BACKUP_COUNT = 5
13 | 
14 | 
15 | def setup(verbose=False, log_file=None):
16 |     logger = logging.getLogger(LOGGER_NAME)
17 | 
18 |     level = (logging.DEBUG if verbose else logging.INFO)
19 | 
20 |     if log_file is None:
21 |         logging.basicConfig(level=level, format=_LOG_FORMAT)
22 |     else:
23 |         logger.setLevel(level)
24 |         handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=_LOG_FILE_MAX_BYTES,
25 |                                                        backupCount=_LOG_FILE_BACKUP_COUNT)
26 |         handler.setFormatter(logging.Formatter(fmt=_LOG_FORMAT))
27 |         logger.addHandler(handler)
28 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/simple_GCT_to_GCToo_figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/simple_GCT_to_GCToo_figure.png


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/slice_gct.py:
--------------------------------------------------------------------------------
# This module is a stub: executing or importing it raises immediately with
# a message naming the replacement module.
msg = "slice_gct.py is deprecated. Please use subset.py instead."
raise DeprecationWarning(msg)


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/slice_gctoo.py:
--------------------------------------------------------------------------------
# This module is a stub: executing or importing it raises immediately with
# a message naming the replacement module.
msg = "slice_gctoo.py is deprecated. Please use subset_gctoo.py instead."
raise DeprecationWarning(msg)


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/subset.py:
--------------------------------------------------------------------------------
  1 | """
  2 | subset.py
  3 | 
  4 | Extract a subset of data from a GCT(x) file using the command line. ids can
  5 | be provided as a list or as a path to a grp file. See subset_gctoo for the
  6 | equivalent method to be used on GCToo objects.
  7 | 
  8 | """
  9 | import logging
 10 | import sys
 11 | import os
 12 | import argparse
 13 | 
 14 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 15 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
 16 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
 17 | import cmapPy.pandasGEXpress.subset_gctoo as sg
 18 | import cmapPy.pandasGEXpress.write_gct as wg
 19 | import cmapPy.pandasGEXpress.write_gct as wgx
 20 | import cmapPy.set_io.grp as grp
 21 | 
 22 | __author__ = "Lev Litichevskiy"
 23 | __email__ = "lev@broadinstitute.org"
 24 | 
 25 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 26 | 
 27 | 
 28 | def build_parser():
 29 |     """Build argument parser."""
 30 | 
 31 |     parser = argparse.ArgumentParser(description=__doc__,
 32 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 33 | 
 34 |     # Required args
 35 |     parser.add_argument("--in_path", "-i", required=True,
 36 |                         help="file path to input GCT(x) file")
 37 | 
 38 |     parser.add_argument("--rid", nargs="+", help="filepath to grp file or string array for including rows")
 39 |     parser.add_argument("--cid", nargs="+", help="filepath to grp file or string array for including cols")
 40 |     parser.add_argument("--exclude_rid", "-er", nargs="+", help="filepath to grp file or string array for excluding rows")
 41 |     parser.add_argument("--exclude_cid", "-ec", nargs="+", help="filepath to grp file or string array for excluding cols")
 42 |     parser.add_argument("--out_name", "-o", default="ds_subsetted.gct",
 43 |                         help="what to name the output file")
 44 |     parser.add_argument("--out_type", default="gct", choices=["gct", "gctx"],
 45 |                         help="whether to write output as GCT or GCTx")
 46 |     parser.add_argument("--verbose", "-v", action="store_true", default=False,
 47 |                         help="whether to increase the # of messages reported")
 48 | 
 49 |     return parser
 50 | 
 51 | 
 52 | def main():
 53 |     # Get args
 54 |     args = build_parser().parse_args(sys.argv[1:])
 55 |     setup_logger.setup(verbose=args.verbose)
 56 |     subset_main(args)
 57 | 
 58 | 
 59 | def subset_main(args):
 60 |     """ Separate method from main() in order to make testing easier and to
 61 |     enable command-line access. """
 62 | 
 63 |     # Read in each of the command line arguments
 64 |     rid = _read_arg(args.rid)
 65 |     cid = _read_arg(args.cid)
 66 |     exclude_rid = _read_arg(args.exclude_rid)
 67 |     exclude_cid = _read_arg(args.exclude_cid)
 68 | 
 69 |     # If GCT, use subset_gctoo
 70 |     if args.in_path.endswith(".gct"):
 71 | 
 72 |         in_gct = parse_gct.parse(args.in_path)
 73 |         out_gct = sg.subset_gctoo(in_gct, rid=rid, cid=cid,
 74 |                                  exclude_rid=exclude_rid,
 75 |                                  exclude_cid=exclude_cid)
 76 | 
 77 |     # If GCTx, use parse_gctx
 78 |     else:
 79 | 
 80 |         if (exclude_rid is not None) or (exclude_cid is not None):
 81 |             msg = "exclude_{rid,cid} args not currently supported for parse_gctx."
 82 |             raise(Exception(msg))
 83 | 
 84 |         logger.info("Using hyperslab selection functionality of parse_gctx...")
 85 |         out_gct = parse_gctx.parse(args.in_path, rid=rid, cid=cid)
 86 | 
 87 |     # Write the output gct
 88 |     if args.out_type == "gctx":
 89 |         wgx.write(out_gct, args.out_name)
 90 |     else:
 91 |         wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")
 92 | 
 93 | 
 94 | def _read_arg(arg):
 95 |     """
 96 |     If arg is a list with 1 element that corresponds to a valid file path, use
 97 |     set_io.grp to read the grp file. Otherwise, check that arg is a list of strings.
 98 | 
 99 |     Args:
100 |         arg (list or None)
101 | 
102 |     Returns:
103 |         arg_out (list or None)
104 |     """
105 | 
106 |     # If arg is None, just return it back
107 |     if arg is None:
108 |         arg_out = arg
109 | 
110 |     else:
111 |         # If len(arg) == 1 and arg[0] is a valid filepath, read it as a grp file
112 |         if len(arg) == 1 and os.path.exists(arg[0]):
113 |             arg_out = grp.read(arg[0])
114 |         else:
115 |             arg_out = arg
116 | 
117 |         # Make sure that arg_out is a list of strings
118 |         assert isinstance(arg_out, list), "arg_out must be a list."
119 |         assert type(arg_out[0]) == str, "arg_out must be a list of strings."
120 | 
121 |     return arg_out
122 | 
123 | 
124 | if __name__ == "__main__":
125 |     main()
126 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/__init__.py


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/both_metadata_example_n1476x978.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/both_metadata_example_n1476x978.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/col_meta_only_example_n355x355.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/col_meta_only_example_n355x355.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/concated.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/concated.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/metadata_writer_test.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/metadata_writer_test.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/both_metadata_example_n1476x978.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/both_metadata_example_n1476x978.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/col_meta_only_example_n355x355.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/col_meta_only_example_n355x355.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/row_meta_only_example_n2x1203.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/row_meta_only_example_n2x1203.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/tsne_n2x1203.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_folder/tsne_n2x1203.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_data_matrix.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_data_matrix.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 6	6	5	5
 3 | id	count_cv	distil_nsample	distil_ss	mfc_plate_id	zmad_ref	LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10
 4 | count_cv	-666	-666	-666	-666	-666	14|15|14	13|14|13	13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13	13	13	14
 5 | distil_nsample	-666	-666	-666	-666	-666	3	3	66	2	9	111111
 6 | distil_ss	-666	-666	-666	-666	-666	9.822065353	6.8915205	1.35840559	5.548898697	3.355231762	4.837643147
 7 | mfc_plate_id	-666	-666	-666	-666	-666	-666	-666	-666	-666	-666	-666
 8 | zmad_ref	-666	-666	-666	-666	-666	population	population	population	population	population	population
 9 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	14|15|14	3	9.822065353	-666	population	1.0	2.0	3.0	4.0	5.0	6.0
10 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	13|14|13	3	6.8915205	-666	population	4.300000190734863	4.5	4.300000190734863	4.300000190734863	4.300000190734863	4.300000190734863
11 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13	66	1.35840559	-666	population	7.0	8.0	9.0	0.0	1.234760046005249	9.758319854736328
12 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	13	2	5.548898697	-666	population	0.10999999940395355	3.3456356525421143	2.3456668853759766	9.822065353393555	4.788650989532471	4.788599967956543
13 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	13	9	3.355231762	-666	population	-0.10999999940395355	-3.3456356525421143	-2.3456668853759766	-9.822065353393555	-4.788650989532471	-4.788599967956543
14 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	14	111111	4.837643147	-666	population	1.0	-2.0	3.0	-4.0	5.0	-6.0
15 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 6	6	0	0
 3 | id	LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10
 4 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	1.0000	2.0000	3.0000	4.0000	5.0000	6.0000
 5 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	4.3000	4.5000	4.3000	4.3000	4.3000	4.3000
 6 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	7.0000	8.0000	9.0000	0.0000	1.2348	9.7583
 7 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	0.1100	3.3456	2.3457	9.8221	4.7887	4.7886
 8 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	-0.1100	-3.3456	-2.3457	-9.8221	-4.7887	-4.7886
 9 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	1.0000	-2.0000	3.0000	-4.0000	5.0000	-6.0000
10 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctx_with_metadata_n2x3.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/mini_gctx_with_metadata_n2x3.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/older_version_v1_2.gct:
--------------------------------------------------------------------------------
1 | #1.2
2 | 5	3
3 | NAME	Description	DLBCL.205	DLBCL.206	DLBCL.232
4 | 1007_s_at	U48705 /FEATURE=mRNA	280.53	271.48	113.57
5 | 1053_at	M87338 /FEATURE= /DEFINITION=HUMA	32.13	91.6	117.43
6 | 117_at	X5175 /FEATURE = cds	41.27	61.12	24.1
7 | 121_at	blah blah blah	738.32	330.59	249.37
8 | 1320_at	first/ second /ok	88.45	12.94	18.46


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/row_meta_only_example_n2x1203.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/row_meta_only_example_n2x1203.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt:
--------------------------------------------------------------------------------
1 | cid	count_cv	distil_nsample	distil_ss	mfc_plate_id	zmad_ref
2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	14|15|14	3	9.822065353	-666	population
3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	13|14|13	3	6.8915205	-666	population
4 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13	66	1.35840559	-666	population
5 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	13	2	5.548898697	-666	population
6 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	13	9	3.355231762	-666	population
7 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	14	111111	4.837643147	-666	population
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_concat/test_main/a.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 2	2	1	0
3 | id	rhd1	a	b
4 | rid1	c	1.1	2.2
5 | rid2	d	3.3	4.4
6 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_concat/test_main/b.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 2	2	1	0
3 | id	rhd1	g	f
4 | rid1	e	1.1	2.2
5 | rid2	d	3.3	4.4
6 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_concat_gctoo_test_main_fake_empty_file.gct:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_concat_gctoo_test_main_fake_empty_file.gct


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_l1000_highprecision.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_l1000_highprecision.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merge_bottom.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 2	3	2	2
3 | id	rhd1	rhd2	s1	s3	s2
4 | chd1	NA	NA	s1_1	s3_1	s2_1
5 | chd2	NA	NA	s1_2	s3_2	s2_2
6 | p5	p5_1	p5_2	0.5	NaN	0.1
7 | p4	p4_1	p4_2	0.3	0.9	0.8
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merge_left.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 4	3	2	3
 3 | id	rhd1	rhd2	s1	s2	s3
 4 | chd1	NA	NA	s1_1	s2_1	s3_1
 5 | chd2	NA	NA	s1_2	s2_2	s3_2
 6 | chd3	NA	NA	s1_3	s2_3	s3_3
 7 | p1	p1_1	p1_2	0.1	0.2	0.3
 8 | p2	p2_1	p2_2	0.4	NaN	0.5
 9 | p3	p3_1	p3_2	0.6	0.7	0.8
10 | p4	p4_1	p4_2	0.9	1.0	1.1
11 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merge_right.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 3	3	2	4
 3 | id	rhd1	rhd2	s6	s5	s4
 4 | chd1	NA	NA	s6_1	s5_1	s4_1
 5 | chd2	NA	NA	s6_2	s5_2	s4_2
 6 | chd3	NA	NA	s6_3	s5_3	s4_3
 7 | chd4	NA	NA	s6_4	s5_4	s4_4
 8 | p1	p1_1	p1_2	1.1	1.2	1.3
 9 | p3	p3_1	p3_2	1.4	1.5	NaN
10 | p4	p4_1	p4_2	1.6	1.7	1.8
11 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merge_top.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 2	3	2	2
3 | id	rhd1	rhd2	s1	s2	s3
4 | chd1	NA	NA	s1_1	s2_1	s3_1
5 | chd2	NA	NA	s1_2	s2_2	s3_2
6 | p1	p1_1	p1_2	0.1	0.2	0.3
7 | p2	p2_1	p2_2	0.4	NaN	0.5
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merged_left_right.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 4	6	2	4
 3 | id	rhd1	rhd2	s1	s2	s3	s4	s5	s6
 4 | chd1	NA	NA	s1_1	s2_1	s3_1	s4_1	s5_1	s6_1
 5 | chd2	NA	NA	s1_2	s2_2	s3_2	s4_2	s5_2	s6_2
 6 | chd3	NA	NA	s1_3	s2_3	s3_3	s4_3	s5_3	s6_3
 7 | chd4	NA	NA	NA	NA	NA	s4_4	s5_4	s6_4
 8 | p1	p1_1	p1_2	0.1	0.2	0.3	1.3	1.2	1.1
 9 | p2	p2_1	p2_2	0.4	NaN	0.5	NaN	NaN	NaN
10 | p3	p3_1	p3_2	0.6	0.7	0.8	NaN	1.5	1.4
11 | p4	p4_1	p4_2	0.9	1.0	1.1	1.8	1.7	1.6
12 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_merged_top_bottom.gct:
--------------------------------------------------------------------------------
 1 | #1.3
 2 | 4	3	2	2
 3 | id	rhd1	rhd2	s1	s2	s3
 4 | chd1	NA	NA	s1_1	s2_1	s3_1
 5 | chd2	NA	NA	s1_2	s2_2	s3_2
 6 | p1	p1_1	p1_2	0.1	0.2	0.3
 7 | p2	p2_1	p2_2	0.4	NaN	0.5
 8 | p4	p4_1	p4_2	0.3	0.8	0.9
 9 | p5	p5_1	p5_2	0.5	0.1	NaN
10 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_missing_colmeta.txt:
--------------------------------------------------------------------------------
1 | cid	count_cv	distil_nsample	distil_ss	mfc_plate_id	zmad_ref
2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	14|15|14	3	9.822065353	-666	population
3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	13|14|13	3	6.8915205	-666	population
4 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	13	2	5.548898697	-666	population
5 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	13	9	3.355231762	-666	population
6 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	14	111111	4.837643147	-666	population
7 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_missing_rowmeta.txt:
--------------------------------------------------------------------------------
1 | rid	count_cv	distil_nsample	distil_ss	mfc_plate_id	zmad_ref
2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	14|15|14	3	9.822065353	-666	population
3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	13|14|13	3	6.8915205	-666	population
4 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	13	2	5.548898697	-666	population
5 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	13	9	3.355231762	-666	population
6 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	14	111111	4.837643147	-666	population
7 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gct_int_ids.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 3	2	1	1
3 | id	rhd1	1	2
4 | chd1	-666	a	b
5 | 3	e	5	7
6 | 11	f	13	17
7 | -3	c	-7	-11
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gctx_rid_entrez_id.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/test_parse_gctx_rid_entrez_id.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt:
--------------------------------------------------------------------------------
1 | rid	count_cv	distil_nsample	distil_ss	mfc_plate_id	zmad_ref
2 | LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33	14|15|14	3	9.822065353	-666	population
3 | MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33	13|14|13	3	6.8915205	-666	population
4 | LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10	13|15|14|14|15|14|14|13|14|15|15|14|14|15|14|15|14|14|15|14|15|14|14|14|14|14|14|15|14|14|15|14|14|14|14|13|14|14|14|14|14|14|15|14|13|13|15|14|14|15|14|14|14|15|13|13|15|13|14|13|13|14|14|14|14|13	66	1.35840559	-666	population
5 | LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10	13	2	5.548898697	-666	population
6 | LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666	13	9	3.355231762	-666	population
7 | LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10	14	111111	4.837643147	-666	population
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_subset_expected.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 2	3	2	2
3 | id	rhd1	rhd2	d	e	g
4 | chd1	NA	NA	d1	e1	g1
5 | chd2	NA	NA	d2	e2	g2
6 | a	a1	a2	1	2	5
7 | c	c1	c2	19	23	31
8 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_subset_in.gct:
--------------------------------------------------------------------------------
1 | #1.3
2 | 3	4	2	2
3 | id	rhd1	rhd2	d	e	f	g
4 | chd1	NA	NA	d1	e1	f1	g1
5 | chd2	NA	NA	d2	e2	f2	g2
6 | a	a1	a2	1	2	3	5
7 | b	b1	b2	7	11	13	17
8 | c	c1	c2	19	23	29	31
9 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_subset_rid.grp:
--------------------------------------------------------------------------------
1 | # used by test_subset.py
2 | a
3 | Bb
4 | c
5 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/test_v1point2_n5x10.gct:
--------------------------------------------------------------------------------
 1 | #1.2						
 2 | 10	5					
 3 | Name	Description	LJP005_A375_24H_X1_B19:A03	LJP005_A375_24H_X1_B19:A04	LJP005_A375_24H_X1_B19:A05	LJP005_A375_24H_X1_B19:A06	LJP005_A375_24H_X1_B19:A07
 4 | 200814_at	PSME1	11.3819	11.3336	11.4486	11.3117	11.6321
 5 | 218597_s_at	CISD1	10.445	10.445	10.3658	10.5809	11.0401
 6 | 217140_s_at	VDAC1	6.3682	5.9869	6.0089	6.9966	6.7862
 7 | 209253_at	SORBS3	8.1372	8.2499	8.4592	7.9091	7.5321
 8 | 214404_x_at	SPDEF	4.9227	5.1192	4.95	4.8193	6.0052
 9 | 222103_at	ATF1	7.9259	8.1555	8.0674	8.0616	8.7338
10 | 219888_at	SPAG4	4.028	4.583	4.6234	4.4257	4.0465
11 | 207042_at	E2F2	3.8934	4.1096	3.8643	4.7922	4.2392
12 | 201453_x_at	RHEB	11.4787	11.6041	11.7341	11.5345	11.5706
13 | 203627_at	IGF1R	7.6509	7.5775	7.4636	7.3899	8.1654
14 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/functional_tests/tsne_n2x1203.gctx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/functional_tests/tsne_n2x1203.gctx


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/python2_tests/__init__.py


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_diff_gctoo.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.pandasGEXpress.GCToo as GCToo
 6 | import cmapPy.pandasGEXpress.diff_gctoo as diff_gctoo
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9],
11 |                          'D': [5,2,1], 'E':[8,8,6], 'F': [7,6,6]})
12 | test_col_meta = pd.DataFrame(
13 |     {'pert_type': ['trt_cp', 'trt_cp', 'trt_cp',
14 |                    'trt_cp', 'ctl_vehicle', 'ctl_vehicle'],
15 |      'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']},
16 |     index=['A', 'B', 'C', 'D', 'E', 'F'])
17 | test_gctoo = GCToo.GCToo(data_df=test_mat,
18 |                          col_metadata_df=test_col_meta)
19 | 
20 | 
class TestDifferential(unittest.TestCase):
    """Checks diff_gctoo.diff_gctoo output on the module-level test_gctoo
    against hard-coded expected values.

    Expected frames list their columns in A..F order, so they line up with
    the data columns whether pandas sorts dict keys or keeps insertion order.
    """

    def test_diff_gctoo_pc(self):
        """Plate-control mode: default method, diff_method validation, median norm."""
        pc_zscores = diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, lower_diff_thresh=-2)
        self.assertEqual(pc_zscores.data_df.shape, (3, 6))

        pd.util.testing.assert_frame_equal(pc_zscores.data_df, pd.DataFrame(
            {'A': [-0.6745, -0.9443, -1.349],
             'B': [-1.5738, 0.6745, 0.0],
             'C': [0.2248, -0.1349, 1.349],
             'D': [-0.2248, -0.9443, -2], # last val should be -2 bc of thresholding
             'E': [1.1242, 0.6745, 0.0],
             'F': [0.6745, 0.1349, 0.0]}))

        # test diff_method assertion
        with self.assertRaises(AssertionError) as e:
            diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, diff_method="robust_zs")
        self.assertIn("diff_method: robust_zs", str(e.exception))

        # test median norm
        pc_median_normed_df = diff_gctoo.diff_gctoo(test_gctoo, diff_method="median_norm")
        self.assertEqual(pc_median_normed_df.data_df.iloc[0, 0], -1.5)
        self.assertEqual(pc_median_normed_df.data_df.loc[2, "B"], 0)

    def test_diff_gctoo_vc(self):
        """Vehicle-control mode: default grouping, explicit group, group_val validation."""
        vc_zscores1 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False)
        vc_zscores2 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False,
                                            group_field='pert_iname',
                                            group_val='DMSO')
        self.assertEqual(vc_zscores1.data_df.shape, (3, 6))
        self.assertEqual(vc_zscores2.data_df.shape, (3, 6))

        pd.util.testing.assert_frame_equal(vc_zscores1.data_df, pd.DataFrame(
            {'A': [-4.7214, -3.3725, -10.0], # check for thresholding
             'B': [-7.4194, 0.6745, 0.0],
             'C': [-2.0235, -1.349, 10.0],
             'D': [-3.3725, -3.3725, -10.0],
             'E': [0.6745, 0.6745, 0.0],
             'F': [-0.6745, -0.6745, 0.0]}))

        pd.util.testing.assert_frame_equal(vc_zscores2.data_df, pd.DataFrame(
            {'A': [-2.0235, -0.6745, -0.3372],
             'B': [-4.7214, 2.0235, 0.1686],
             'C': [0.6745, 0.6745, 0.6745],
             'D': [-0.6745, -0.6745, -0.6745],
             'E': [3.3725, 2.0235, 0.1686],
             'F': [2.0235, 1.1242, 0.1686]}))

        # test group_val assertion
        with self.assertRaises(AssertionError) as e:
            diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, group_val="dmso")
        self.assertIn("dmso not present", str(e.exception))
71 | 
72 | 
73 | if __name__ == "__main__":  # runs only when this file is executed directly
74 |     setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
75 |     unittest.main()
76 | 
77 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_gct2gctx.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import os
 5 | import cmapPy.pandasGEXpress.gct2gctx as gct2gctx
 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
13 | class TestGCT2GCTx(unittest.TestCase):
14 | 
15 | 	def test_gct2gctx_main(self):
16 | 
17 | 		in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct"
18 | 		out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out.gctx"
19 | 		args_string = "-f {} -o {}".format(in_name, out_name)
20 | 		args = gct2gctx.build_parser().parse_args(args_string.split())
21 | 
22 | 		gct2gctx.gct2gctx_main(args)
23 | 
24 | 		# Make sure the input is identical to output
25 | 		in_gct = parse_gct.parse(in_name)
26 | 		out_gctx = parse_gctx.parse(out_name)
27 | 
28 | 		pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df)
29 | 		pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df)
30 | 		pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df)
31 | 
32 | 		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
33 | 		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gct2gctx_out_annotated.gctx"
34 | 		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
35 | 		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
36 | 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta)
37 | 		args = gct2gctx.build_parser().parse_args(args_string.split())
38 | 
39 | 		gct2gctx.gct2gctx_main(args)
40 | 
41 | 		annotated_gctx = parse_gctx.parse(added_meta)
42 | 
43 | 		# Check added annotations are the same as original input GCTX
44 | 		pd.util.testing.assert_frame_equal(in_gct.data_df, annotated_gctx.data_df, check_less_precise=3)
45 | 		pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, annotated_gctx.col_metadata_df)
46 | 		pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, annotated_gctx.row_metadata_df)
47 | 
48 | 		# Clean up
49 | 		os.remove(out_name)
50 | 		os.remove(added_meta)
51 | 
52 | 	def test_missing_annotations(self):
53 | 		with self.assertRaises(Exception) as context:
54 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
55 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
56 | 			row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
57 | 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
58 | 			args = gct2gctx.build_parser().parse_args(args_string.split())
59 | 
60 | 			gct2gctx.gct2gctx_main(args)
61 | 
62 | 		self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)
63 | 
64 | 		with self.assertRaises(Exception) as context:
65 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gct"
66 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gctx"
67 | 			col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
68 | 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
69 | 			args = gct2gctx.build_parser().parse_args(args_string.split())
70 | 
71 | 			gct2gctx.gct2gctx_main(args)
72 | 
73 | 		self.assertTrue('Column ids in matrix missing from annotations file' in context.exception)
74 | 
75 | 
76 | if __name__ == "__main__":  # runs only when this file is executed directly
77 | 	setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
78 | 	unittest.main()
79 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_gctx2gct.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import os
 5 | import cmapPy.pandasGEXpress.gctx2gct as gctx2gct
 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
13 | class TestGCTx2GCT(unittest.TestCase):
14 | 
15 | 	def test_gctx2gct_main(self):
16 | 
17 | 		in_name = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx"
18 | 		out_name = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out.gct"
19 | 		args_string = "-f {} -o {}".format(in_name, out_name)
20 | 		args = gctx2gct.build_parser().parse_args(args_string.split())
21 | 
22 | 		gctx2gct.gctx2gct_main(args)
23 | 
24 | 		# Make sure the input is identical to output
25 | 		in_gctx = parse_gctx.parse(in_name)
26 | 		out_gct = parse_gct.parse(out_name)
27 | 
28 | 		pd.util.testing.assert_frame_equal(in_gctx.data_df, out_gct.data_df, check_less_precise=3)
29 | 		pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df)
30 | 		pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df)
31 | 
32 | 		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
33 | 		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
34 | 		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_rowmeta_n6.txt"
35 | 		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_colmeta_n6.txt"
36 | 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta )
37 | 		args = gctx2gct.build_parser().parse_args(args_string.split())
38 | 
39 | 		gctx2gct.gctx2gct_main(args)
40 | 
41 | 		annotated_gct = parse_gct.parse(added_meta)
42 | 
43 | 		# Check added annotations are the same as original input GCTX
44 | 		pd.util.testing.assert_frame_equal(in_gctx.data_df, annotated_gct.data_df, check_less_precise=3)
45 | 		pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, annotated_gct.col_metadata_df)
46 | 		pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, annotated_gct.row_metadata_df)
47 | 
48 | 		# Clean up
49 | 		os.remove(out_name)
50 | 		os.remove(added_meta)
51 | 
52 | 	def test_missing_annotations(self):
53 | 		with self.assertRaises(Exception) as context:
54 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
55 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
56 | 			row_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_rowmeta.txt"
57 | 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
58 | 			args = gctx2gct.build_parser().parse_args(args_string.split())
59 | 
60 | 			gctx2gct.gctx2gct_main(args)
61 | 
62 | 		self.assertTrue('Row ids in matrix missing from annotations file' in context.exception)
63 | 
64 | 		with self.assertRaises(Exception) as context:
65 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing_nometa.gctx"
66 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_gctx2gct_out_annotated.gct"
67 | 			col_meta = "cmapPy/pandasGEXpress/tests/functional_tests//test_missing_colmeta.txt"
68 | 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
69 | 			args = gctx2gct.build_parser().parse_args(args_string.split())
70 | 
71 | 			gctx2gct.gctx2gct_main(args)
72 | 
73 | 		self.assertTrue('Column ids in matrix missing from annotations file' in context.exception)
74 | 
75 | 
76 | if __name__ == "__main__":  # runs only when this file is executed directly
77 | 	setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
78 | 	unittest.main()
79 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_parse.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import logging
 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 4 | import unittest
 5 | import pandas.util.testing as pandas_testing
 6 | import cmapPy.pandasGEXpress.subset_gctoo as subset_gctoo
 7 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing
 8 | import cmapPy.pandasGEXpress.parse as parse
 9 | 
10 | __author__ = "Oana Enache"
11 | __email__ = "oana@broadinstitute.org"
12 | 
13 | FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/"
14 | 
15 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
16 | 
17 | class TestParse(unittest.TestCase):
18 |     def test_gctx_parsing(self):
19 |         # parse in gctx, no other arguments        
20 |         mg1 = mini_gctoo_for_testing.make()
21 |         mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx")
22 | 
23 |         pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
24 |         pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
25 |         pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 
26 | 
27 |         # check convert_neg_666 worked correctly
28 |         self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())
29 | 
30 |         # parse w/o convert_neg_666
31 |         mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", convert_neg_666 = False)
32 |         self.assertFalse(mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all())        
33 | 
34 |         # parsing w/rids & cids specified 
35 |         test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
36 |         test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
37 |         mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
38 |         mg4 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
39 |                     rid=test_rids, cid=test_cids)
40 |         pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
41 |         pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
42 |         pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)
43 | 
44 |         # parsing w/ridx & cidx specified 
45 |         mg5 = subset_gctoo.subset_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
46 |                                       cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
47 |         mg6 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
48 | 
49 |         pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
50 |         pandas_testing.assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
51 |         pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)
52 | 
53 |         # parsing row metadata only
54 |         mg7 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", row_meta_only=True)
55 |         pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df)
56 | 
57 |         # parsing col metadata only
58 |         mg8 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", col_meta_only=True)
59 |         pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df)
60 | 
61 |         # parsing w/multiindex
62 |         mg9 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx", make_multiindex=True)
63 |         self.assertTrue(mg9.multi_index_df is not None)
64 | 
65 |     def test_gct_parsing(self):
66 |         # parse in gct, no other arguments
67 |         mg1 = mini_gctoo_for_testing.make()
68 |         mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct")
69 | 
70 |         pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
71 |         pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
72 |         pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)
73 | 
74 |         # check convert_neg_666 worked correctly
75 |         self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())
76 | 
77 |         # parse w/o convert_neg_666
78 |         mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct", convert_neg_666 = False)
79 |         self.assertItemsEqual(mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(),
80 |                               [-666] * 6)
81 | 
82 |         # parse in gct with subsetting
83 |         my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33"
84 |         mg3 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct",
85 |                           cidx=[0, 2], rid=[my_rid])
86 | 
87 |         self.assertEqual(mg3.data_df.shape, (1, 2))
88 |         self.assertItemsEqual(mg3.data_df.values.flatten().tolist(), [1., 3.])
89 |         self.assertEqual(mg3.row_metadata_df.index[0], my_rid)
90 | 
91 | if __name__ == "__main__":  # runs only when this file is executed directly
92 |     setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
93 |     unittest.main()
94 | 
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_random_slice.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 4 | import cmapPy.pandasGEXpress.random_slice as random_slice
 5 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | 
 9 | 
10 | class TestRandomSlice(unittest.TestCase):
11 |     def test_make_specified_size_gctoo(self):
12 |         mini_gctoo = mini_gctoo_for_testing.make()
13 |         logger.debug("mini gctoo data_df shape: {}".format(mini_gctoo.data_df.shape))
14 |         logger.debug("mini gctoo row_meta shape: {}".format(mini_gctoo.row_metadata_df.shape))
15 |         logger.debug("mini gctoo col_meta shape: {}".format(mini_gctoo.col_metadata_df.shape))
16 | 
17 |         # case 1: dim isn't 'row' or 'col'
18 |         with self.assertRaises(AssertionError) as context:
19 |             random_slice.make_specified_size_gctoo(mini_gctoo, 3, "aaaalll")
20 |         self.assertEqual(str(context.exception), "dim specified must be either 'row' or 'col'")
21 | 
22 |         # case 2: row subsetting - happy
23 |         row_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "row")
24 |         self.assertEqual(row_subset.data_df.shape, (3, 6),
25 |                          "data_df after row slice is incorrect shape: {} vs (3,6)".format(row_subset.data_df.shape))
26 |         self.assertEqual(row_subset.row_metadata_df.shape, (3, 5),
27 |                          "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format(
28 |                              row_subset.row_metadata_df.shape))
29 |         self.assertEqual(row_subset.col_metadata_df.shape, (6, 5),
30 |                          "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format(
31 |                              row_subset.col_metadata_df.shape))
32 | 
33 |         # case 3: row subsetting - sample subset > og # of samples
34 |         with self.assertRaises(AssertionError) as context:
35 |             random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row")
36 |         self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception))
37 | 
38 |         # case 4: col subsetting - happy
39 |         col_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "col")
40 |         self.assertEqual(col_subset.data_df.shape, (6, 3),
41 |                          "data_df after col slice is incorrect shape: {} vs (6,3)".format(col_subset.data_df.shape))
42 |         self.assertEqual(col_subset.row_metadata_df.shape, (6, 5),
43 |                          "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format(
44 |                              col_subset.row_metadata_df.shape))
45 |         self.assertEqual(col_subset.col_metadata_df.shape, (3, 5),
46 |                          "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format(
47 |                              col_subset.col_metadata_df.shape))
48 | 
49 |         # case 5: col subsetting - sample subset > og # of samples
50 |         with self.assertRaises(AssertionError) as context:
51 |             random_slice.make_specified_size_gctoo(mini_gctoo, 7, "col")
52 |         self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception))
53 | 
54 | 
55 | if __name__ == "__main__":  # runs only when this file is executed directly
56 |     setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
57 | 
58 |     unittest.main()
59 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_subset.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import os
 4 | import pandas as pd
 5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 6 | import cmapPy.pandasGEXpress.parse as parse
 7 | import cmapPy.pandasGEXpress.subset as sg
 8 | 
 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
10 | 
11 | 
12 | class TestSubset(unittest.TestCase):
13 | 
14 |     def test_read_arg(self):
15 |         arg_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
16 |         rids = sg._read_arg([arg_path])
17 |         self.assertItemsEqual(rids, ["a", "Bb", "c"])
18 | 
19 |     def test_read_arg_bad(self):
20 |         with self.assertRaises(AssertionError) as e:
21 |             sg._read_arg("a b c")
22 |         self.assertIn("arg_out must be a list", str(e.exception))
23 | 
24 |         with self.assertRaises(AssertionError) as e:
25 |             sg._read_arg([1, 2, 3])
26 |         self.assertIn("arg_out must be a list of strings", str(e.exception))
27 | 
28 |     def test_subset_main(self):
29 | 
30 |         in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct")
31 |         rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
32 |         out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct")
33 |         expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct")
34 | 
35 |         args_string = "-i {} --rid {} -ec {} -o {}".format(
36 |             in_gct_path, rid_grp_path, "f", out_name)
37 |         args = sg.build_parser().parse_args(args_string.split())
38 | 
39 |         # Run main method
40 |         sg.subset_main(args)
41 | 
42 |         # Compare output to expected
43 |         out_gct = parse.parse(out_name)
44 |         expected_gct = parse.parse(expected_out_path)
45 | 
46 |         pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df)
47 |         pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df)
48 |         pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df)
49 | 
50 |         # Clean up
51 |         os.remove(out_name)
52 | 
53 |         # gctx with exclude_rid should fail
54 |         args_string2 = "-i {} --rid {} -ec {} -o {}".format(
55 |             "FAKE.gctx", rid_grp_path, "f", out_name)
56 |         args2 = sg.build_parser().parse_args(args_string2.split())
57 | 
58 |         with self.assertRaises(Exception) as e:
59 |             sg.subset_main(args2)
60 |         self.assertIn("exclude_{rid,cid} args not currently supported",
61 |                       str(e.exception))
62 | 
63 | if __name__ == '__main__':  # runs only when this file is executed directly
64 |     unittest.main()
65 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python2_tests/test_subset_gctoo.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import logging
  3 | import pandas as pd
  4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
  5 | import cmapPy.pandasGEXpress.GCToo as GCToo
  6 | import cmapPy.pandasGEXpress.subset_gctoo as sg
  7 | 
  8 | 
  9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 10 | 
 11 | 
 12 | class TestSubsetGCToo(unittest.TestCase):
 13 | 
 14 |     @classmethod
 15 |     def setUpClass(cls):
 16 |         data_df = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
 17 |                                index=["a", "b", "c", "d"], columns=["e", "f", "g"])
 18 |         row_metadata_df = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
 19 |                                        index=["a", "b", "c", "d"], columns=["rhd1", "rh2"])
 20 |         col_metadata_df = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
 21 |                                        index=["e", "f", "g"], columns=["chd1", "chd2"])
 22 |         cls.in_gct = GCToo.GCToo(data_df, row_metadata_df, col_metadata_df)
 23 | 
 24 |     def test_subset_gctoo(self):
 25 | 
 26 |         # Error if resulting GCT is empty
 27 |         with self.assertRaises(AssertionError) as e:
 28 |             sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"])
 29 |         self.assertIn("Subsetting yielded an", str(e.exception))
 30 | 
 31 |         # cid and col_bool should not both be provided
 32 |         with self.assertRaises(AssertionError) as e:
 33 |             sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"], col_bool=[True, True, False])
 34 |         self.assertIn("Only one of cid,", str(e.exception))
 35 | 
 36 |         # Providing all 3 row inputs is also bad!
 37 |         with self.assertRaises(AssertionError) as e:
 38 |             sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!")
 39 |         self.assertIn("Only one of rid,", str(e.exception))
 40 | 
 41 |         # happy path
 42 |         out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0],
 43 |                                exclude_rid=["a"])
 44 |         pd.util.testing.assert_frame_equal(out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]])
 45 | 
 46 |     def test_get_rows_to_keep(self):
 47 | 
 48 |         # rid must be a list
 49 |         with self.assertRaises(AssertionError) as e:
 50 |             sg.get_rows_to_keep(self.in_gct, rid="bad")
 51 |         self.assertIn("rid must be a list", str(e.exception))
 52 | 
 53 |         # bools
 54 |         out_rows = sg.get_rows_to_keep(self.in_gct, row_bool=[True, True, True, False])
 55 |         self.assertItemsEqual(out_rows, ["a", "b", "c"])
 56 | 
 57 |         # rid and exclude_rid
 58 |         out_rows2 = sg.get_rows_to_keep(self.in_gct, rid=["a", "c", "d"], exclude_rid=["d"])
 59 |         self.assertItemsEqual(out_rows2, ["a", "c"])
 60 | 
 61 |         # keep all rows
 62 |         out_rows3 = sg.get_rows_to_keep(self.in_gct)
 63 |         self.assertItemsEqual(out_rows3, ["a", "b", "c", "d"])
 64 | 
 65 |         with self.assertRaises(AssertionError) as e:
 66 |             sg.get_rows_to_keep(self.in_gct, row_bool=[True, False, True])
 67 |         self.assertIn("row_bool must have length", str(e.exception))
 68 | 
 69 |         with self.assertRaises(AssertionError) as e:
 70 |             sg.get_rows_to_keep(self.in_gct, ridx=[True, False, True])
 71 |         self.assertIn("ridx must be a list of integers", str(e.exception))
 72 | 
 73 |         with self.assertRaises(AssertionError) as e:
 74 |             sg.get_rows_to_keep(self.in_gct, ridx=[0, 2, 5])
 75 |         self.assertIn("ridx contains an integer", str(e.exception))
 76 | 
 77 |     def test_get_cols_to_keep(self):
 78 |         # N.B. annoying that we have two extremely similar but separate methods
 79 |         # for rows and columns, but I think it's worth it to have clear error
 80 |         # messages
 81 | 
 82 |         # cid must be a list
 83 |         with self.assertRaises(AssertionError) as e:
 84 |             sg.get_cols_to_keep(self.in_gct, cid="real_bad")
 85 |         self.assertIn("cid must be a list", str(e.exception))
 86 | 
 87 |         # bools
 88 |         out_cols = sg.get_cols_to_keep(self.in_gct, col_bool=[False, True, True])
 89 |         self.assertItemsEqual(out_cols, ["f", "g"])
 90 | 
 91 |         # cid and exclude_cid
 92 |         out_cols2 = sg.get_cols_to_keep(self.in_gct, cid=["g", "e", "f"], exclude_cid=["f"], cidx=None)
 93 |         self.assertItemsEqual(out_cols2, ["g", "e"])
 94 | 
 95 |         # keep all cols
 96 |         out_cols3 = sg.get_cols_to_keep(self.in_gct)
 97 |         self.assertItemsEqual(out_cols3, ["e", "f", "g"])
 98 | 
 99 |         with self.assertRaises(AssertionError) as e:
100 |             sg.get_cols_to_keep(self.in_gct, col_bool=[True, False, True, True])
101 |         self.assertIn("col_bool must have length", str(e.exception))
102 | 
103 |         with self.assertRaises(AssertionError) as e:
104 |             sg.get_cols_to_keep(self.in_gct, cidx=[True, False, True])
105 |         self.assertIn("cidx must be a list of integers", str(e.exception))
106 | 
107 |         with self.assertRaises(AssertionError) as e:
108 |             sg.get_cols_to_keep(self.in_gct, cidx=[10])
109 |         self.assertIn("cidx contains an integer", str(e.exception))
110 | 
111 | if __name__ == '__main__':  # runs only when this file is executed directly
112 |     setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
113 |     unittest.main()
114 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/pandasGEXpress/tests/python3_tests/__init__.py


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_diff_gctoo.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.pandasGEXpress.GCToo as GCToo
 6 | import cmapPy.pandasGEXpress.diff_gctoo as diff_gctoo
 7 | 
 8 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 9 | 
10 | test_mat = pd.DataFrame({'A':[4,2,3], 'B': [2,8,6], 'C': [6,5,9],  # 3 rows, columns A-F
11 |                          'D': [5,2,1], 'E':[8,8,6], 'F': [7,6,6]},
12 |                         columns=['A','C','B','E','D','F'])  # explicit, non-alphabetical column order
13 | test_col_meta = pd.DataFrame(  # per-column metadata indexed A-F (order differs from test_mat's columns)
14 |     {'pert_type': ['trt_cp', 'trt_cp', 'trt_cp',
15 |                    'trt_cp', 'ctl_vehicle', 'ctl_vehicle'],
16 |      'pert_iname': ['bort', 'bort', 'DMSO', 'DMSO', 'bort', 'bort']},
17 |     index=['A', 'B', 'C', 'D', 'E', 'F'])
18 | test_gctoo = GCToo.GCToo(data_df=test_mat,  # module-level fixture used by the tests below; no row metadata is passed
19 |                          col_metadata_df=test_col_meta)
20 | 
21 | 
22 | class TestDifferential(unittest.TestCase):
23 |     def test_diff_gctoo_pc(self):
24 |         pc_zscores = diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, lower_diff_thresh=-2)
25 |         self.assertTrue(pc_zscores.data_df.shape == (3, 6))
26 | 
27 |         pd.testing.assert_frame_equal(pc_zscores.data_df, pd.DataFrame(
28 |             {'A': [-0.6745, -0.9443, -1.349],
29 |              'C': [0.2248, -0.1349, 1.349],
30 |              'B': [-1.5738, 0.6745, 0.0], 'E': [1.1242, 0.6745, 0.0],
31 |              'D': [-0.2248, -0.9443, -2], # last val should be -2 bc of thresholding
32 |              'F': [0.6745, 0.1349, 0.0]},
33 |             columns=['A', 'C', 'B', 'E', 'D', 'F']))
34 | 
35 |         # test diff_method assertion
36 |         with self.assertRaises(AssertionError) as e:
37 |             diff_gctoo.diff_gctoo(test_gctoo, plate_control=True, diff_method="robust_zs")
38 |         self.assertIn("diff_method: robust_zs", str(e.exception))
39 | 
40 |         # test median norm
41 |         pc_median_normed_df = diff_gctoo.diff_gctoo(test_gctoo, diff_method="median_norm")
42 |         self.assertEqual(pc_median_normed_df.data_df.iloc[0, 0], -1.5)
43 |         self.assertEqual(pc_median_normed_df.data_df.loc[2, "B"], 0)
44 | 
45 |     def test_diff_gctoo_vc(self):
46 |         vc_zscores1 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False)
47 |         vc_zscores2 = diff_gctoo.diff_gctoo(test_gctoo, plate_control=False,
48 |                                             group_field='pert_iname',
49 |                                             group_val='DMSO')
50 |         self.assertTrue(vc_zscores1.data_df.shape == (3, 6))
51 |         self.assertTrue(vc_zscores2.data_df.shape == (3, 6))
52 | 
53 |         pd.testing.assert_frame_equal(vc_zscores1.data_df, pd.DataFrame(
54 |             {'A': [-4.7214, -3.3725, -10.0], # check for thresholding
55 |              'C': [-2.0235, -1.349, 10.0],
56 |              'B': [-7.4194, 0.6745, 0.0],
57 |              'E': [0.6745, 0.6745, 0.0],
58 |              'D': [-3.3725, -3.3725, -10.0],
59 |              'F': [-0.6745, -0.6745, 0.0]},
60 |             columns=['A', 'C', 'B', 'E', 'D', 'F']))
61 | 
62 |         pd.testing.assert_frame_equal(vc_zscores2.data_df, pd.DataFrame(
63 |             {'A': [-2.0235, -0.6745, -0.3372],
64 |              'C': [0.6745, 0.6745, 0.6745],
65 |              'B': [-4.7214, 2.0235, 0.1686],
66 |              'E': [3.3725, 2.0235, 0.1686],
67 |              'D': [-0.6745, -0.6745, -0.6745],
68 |              'F': [2.0235, 1.1242, 0.1686]},
69 |             columns=['A', 'C', 'B', 'E', 'D', 'F']))
70 | 
71 |         # test group_val assertion
72 |         with self.assertRaises(AssertionError) as e:
73 |             diff_gctoo.diff_gctoo(test_gctoo, plate_control=False, group_val="dmso")
74 |         self.assertIn("dmso not present", str(e.exception))
75 | 
76 | 
77 | if __name__ == "__main__":  # runs only when this file is executed directly
78 |     setup_logger.setup(verbose=True)  # NOTE(review): presumably enables verbose logging; setup() is defined elsewhere
79 |     unittest.main()
80 | 
81 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_gct2gctx.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import os
 5 | import cmapPy.pandasGEXpress.gct2gctx as gct2gctx
 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
13 | class TestGCT2GCTx(unittest.TestCase):
14 | 
15 | 	def test_gct2gctx_main(self):
16 | 
17 | 		in_name = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct"
18 | 		out_name = "cmapPy/pandasGEXpress/tests/functional_tests/test_gct2gctx_out.gctx"
19 | 		args_string = "-f {} -o {}".format(in_name, out_name)
20 | 		args = gct2gctx.build_parser().parse_args(args_string.split())
21 | 
22 | 		gct2gctx.gct2gctx_main(args)
23 | 
24 | 		# Make sure the input is identical to output
25 | 		in_gct = parse_gct.parse(in_name)
26 | 		out_gctx = parse_gctx.parse(out_name)
27 | 
28 | 		pd.util.testing.assert_frame_equal(in_gct.data_df, out_gctx.data_df)
29 | 		pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, out_gctx.col_metadata_df)
30 | 		pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, out_gctx.row_metadata_df)
31 | 
32 | 		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gct"
33 | 		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gct2gctx_out_annotated.gctx"
34 | 		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt"
35 | 		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt"
36 | 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta)
37 | 		args = gct2gctx.build_parser().parse_args(args_string.split())
38 | 
39 | 		gct2gctx.gct2gctx_main(args)
40 | 
41 | 		annotated_gctx = parse_gctx.parse(added_meta)
42 | 
43 | 		# Check added annotations are the same as original input GCTX
44 | 		pd.util.testing.assert_frame_equal(in_gct.data_df, annotated_gctx.data_df, check_less_precise=3)
45 | 		pd.util.testing.assert_frame_equal(in_gct.col_metadata_df, annotated_gctx.col_metadata_df)
46 | 		pd.util.testing.assert_frame_equal(in_gct.row_metadata_df, annotated_gctx.row_metadata_df)
47 | 
48 | 		# Clean up
49 | 		os.remove(out_name)
50 | 		os.remove(added_meta)
51 | 
52 | 	def test_missing_annotations(self):
53 | 		with self.assertRaises(Exception) as context:
54 | 			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
55 | 			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
56 | 			row_meta = "../functional_tests/test_missing_rowmeta.txt"
57 | 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
58 | 			args = gct2gctx.build_parser().parse_args(args_string.split())
59 | 
60 | 			gct2gctx.gct2gctx_main(args)
61 | 
62 | 		self.assertTrue('Row ids in matrix missing from annotations file', context.exception)
63 | 
64 | 		with self.assertRaises(Exception) as context:
65 | 			no_meta = "../functional_tests/mini_gctoo_for_testing_nometa.gct"
66 | 			added_meta = "../functional_tests/test_gctx2gct_out_annotated.gctx"
67 | 			col_meta = "../functional_tests/test_missing_colmeta.txt"
68 | 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
69 | 			args = gct2gctx.build_parser().parse_args(args_string.split())
70 | 
71 | 			gct2gctx.gct2gctx_main(args)
72 | 
73 | 		self.assertTrue('Column ids in matrix missing from annotations file', context.exception)
74 | 
75 | 
76 | if __name__ == "__main__":  # only when this file is executed directly
77 | 	setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
78 | 	unittest.main()  # discover and run the tests defined in this module
79 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_gctx2gct.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import pandas as pd
 4 | import os
 5 | import cmapPy.pandasGEXpress.gctx2gct as gctx2gct
 6 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 7 | import cmapPy.pandasGEXpress.parse_gct as parse_gct
 8 | import cmapPy.pandasGEXpress.parse_gctx as parse_gctx
 9 | 
10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
11 | 
12 | 
13 | class TestGCTx2GCT(unittest.TestCase):
14 | 
15 | 	def test_gctx2gct_main(self):
16 | 
17 | 		in_name = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
18 | 		out_name = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out.gct"
19 | 		args_string = "-f {} -o {}".format(in_name, out_name)
20 | 		args = gctx2gct.build_parser().parse_args(args_string.split())
21 | 
22 | 		gctx2gct.gctx2gct_main(args)
23 | 
24 | 		# Make sure the input is identical to output
25 | 		in_gctx = parse_gctx.parse(in_name)
26 | 		out_gct = parse_gct.parse(out_name)
27 | 
28 | 		pd.util.testing.assert_frame_equal(in_gctx.data_df, out_gct.data_df, check_less_precise=3)
29 | 		pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, out_gct.col_metadata_df)
30 | 		pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, out_gct.row_metadata_df)
31 | 
32 | 		no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx"
33 | 		added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct"
34 | 		row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_rowmeta_n6.txt"
35 | 		col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_colmeta_n6.txt"
36 | 		args_string = "-f {} -o {} -row_annot_path {} -col_annot_path {}".format(no_meta, added_meta, row_meta, col_meta )
37 | 		args = gctx2gct.build_parser().parse_args(args_string.split())
38 | 
39 | 		gctx2gct.gctx2gct_main(args)
40 | 
41 | 		annotated_gct = parse_gct.parse(added_meta)
42 | 
43 | 		# Check added annotations are the same as original input GCTX
44 | 		pd.util.testing.assert_frame_equal(in_gctx.data_df, annotated_gct.data_df, check_less_precise=3)
45 | 		pd.util.testing.assert_frame_equal(in_gctx.col_metadata_df, annotated_gct.col_metadata_df)
46 | 		pd.util.testing.assert_frame_equal(in_gctx.row_metadata_df, annotated_gct.row_metadata_df)
47 | 
48 | 		# Clean up
49 | 		os.remove(out_name)
50 | 		os.remove(added_meta)
51 | 
52 | 	def test_missing_annotations(self):
53 | 		with self.assertRaises(Exception) as context:
54 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx"
55 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct"
56 | 			row_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_missing_rowmeta.txt"
57 | 			args_string = "-f {} -o {} -row_annot_path {}".format(no_meta, added_meta, row_meta)
58 | 			args = gctx2gct.build_parser().parse_args(args_string.split())
59 | 
60 | 			gctx2gct.gctx2gct_main(args)
61 | 
62 | 		print(context.exception)
63 | 		self.assertTrue('Row ids in matrix missing from annotations file', context.exception)
64 | 
65 | 		with self.assertRaises(Exception) as context:
66 | 			no_meta = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing_nometa.gctx"
67 | 			added_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_gctx2gct_out_annotated.gct"
68 | 			col_meta = "cmapPy/pandasGEXpress/tests/functional_tests/test_missing_colmeta.txt"
69 | 			args_string = "-f {} -o {} -col_annot_path {}".format(no_meta, added_meta, col_meta)
70 | 			args = gctx2gct.build_parser().parse_args(args_string.split())
71 | 
72 | 			gctx2gct.gctx2gct_main(args)
73 | 
74 | 		self.assertTrue('Column ids in matrix missing from annotations file', context.exception)
75 | 
76 | 
77 | if __name__ == "__main__":  # only when this file is executed directly
78 | 	setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
79 | 	unittest.main()  # discover and run the tests defined in this module
80 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_parse.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 3 | import unittest
 4 | import pandas.util.testing as pandas_testing
 5 | import cmapPy.pandasGEXpress.subset_gctoo as subset_gctoo
 6 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing
 7 | import cmapPy.pandasGEXpress.parse as parse
 8 | 
 9 | __author__ = "Oana Enache"
10 | __email__ = "oana@broadinstitute.org"
11 | 
12 | FUNCTIONAL_TESTS_PATH = "cmapPy/pandasGEXpress/tests/functional_tests/"
13 | 
14 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
15 | 
16 | class TestParse(unittest.TestCase):
17 |     def test_gctx_parsing(self):
18 |         # parse in gctx, no other arguments        
19 |         mg1 = mini_gctoo_for_testing.make()
20 |         mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx")
21 | 
22 |         pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
23 |         pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
24 |         pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df) 
25 | 
26 |         # check convert_neg_666 worked correctly
27 |         self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())
28 | 
29 |         # parse w/o convert_neg_666
30 |         mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", convert_neg_666 = False)
31 |         self.assertFalse(mg2_alt.col_metadata_df["mfc_plate_id"].isnull().all())        
32 | 
33 |         # parsing w/rids & cids specified 
34 |         test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
35 |         test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
36 |         mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
37 |         mg4 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
38 |                     rid=test_rids, cid=test_cids)
39 |         pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
40 |         pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
41 |         pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)
42 | 
43 |         # parsing w/ridx & cidx specified 
44 |         mg5 = subset_gctoo.subset_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
45 |                                       cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
46 |         mg6 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
47 | 
48 |         pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
49 |         pandas_testing.assert_frame_equal(mg5.row_metadata_df, mg6.row_metadata_df)
50 |         pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)
51 | 
52 |         # parsing row metadata only
53 |         mg7 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", row_meta_only=True)
54 |         pandas_testing.assert_frame_equal(mg7, mg1.row_metadata_df)
55 | 
56 |         # parsing col metadata only
57 |         mg8 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", col_meta_only=True)
58 |         pandas_testing.assert_frame_equal(mg8, mg1.col_metadata_df)
59 | 
60 |         # parsing w/multiindex
61 |         mg9 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx", make_multiindex=True)
62 |         self.assertTrue(mg9.multi_index_df is not None)
63 | 
64 |     def test_gct_parsing(self):
65 |         # parse in gct, no other arguments
66 |         mg1 = mini_gctoo_for_testing.make()
67 |         mg2 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct")
68 | 
69 |         pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
70 |         pandas_testing.assert_frame_equal(mg1.row_metadata_df, mg2.row_metadata_df)
71 |         pandas_testing.assert_frame_equal(mg1.col_metadata_df, mg2.col_metadata_df)
72 | 
73 |         # check convert_neg_666 worked correctly
74 |         self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())
75 | 
76 |         # parse w/o convert_neg_666
77 |         mg2_alt = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct", convert_neg_666 = False)
78 |         self.assertCountEqual(mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(),
79 |                               [-666] * 6)
80 | 
81 |         # parse in gct with subsetting
82 |         my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33"
83 |         mg3 = parse.parse("cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct",
84 |                           cidx=[0, 2], rid=[my_rid])
85 | 
86 |         self.assertEqual(mg3.data_df.shape, (1, 2))
87 |         self.assertCountEqual(mg3.data_df.values.flatten().tolist(), [1., 3.])
88 |         self.assertEqual(mg3.row_metadata_df.index[0], my_rid)
89 | 
90 | if __name__ == "__main__":  # only when this file is executed directly
91 |     setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
92 |     unittest.main()  # discover and run the tests defined in this module
93 | 
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_random_slice.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 4 | import cmapPy.pandasGEXpress.random_slice as random_slice
 5 | import cmapPy.pandasGEXpress.mini_gctoo_for_testing as mini_gctoo_for_testing
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | 
 9 | 
10 | class TestRandomSlice(unittest.TestCase):
11 |     def test_make_specified_size_gctoo(self):
12 |         mini_gctoo = mini_gctoo_for_testing.make()
13 |         logger.debug("mini gctoo data_df shape: {}".format(mini_gctoo.data_df.shape))
14 |         logger.debug("mini gctoo row_meta shape: {}".format(mini_gctoo.row_metadata_df.shape))
15 |         logger.debug("mini gctoo col_meta shape: {}".format(mini_gctoo.col_metadata_df.shape))
16 | 
17 |         # case 1: dim isn't 'row' or 'col'
18 |         with self.assertRaises(AssertionError) as context:
19 |             random_slice.make_specified_size_gctoo(mini_gctoo, 3, "aaaalll")
20 |         self.assertEqual(str(context.exception), "dim specified must be either 'row' or 'col'")
21 | 
22 |         # case 2: row subsetting - happy
23 |         row_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "row")
24 |         self.assertEqual(row_subset.data_df.shape, (3, 6),
25 |                          "data_df after row slice is incorrect shape: {} vs (3,6)".format(row_subset.data_df.shape))
26 |         self.assertEqual(row_subset.row_metadata_df.shape, (3, 5),
27 |                          "row_metadata_df after row slice is incorrect shape: {} vs (3,5)".format(
28 |                              row_subset.row_metadata_df.shape))
29 |         self.assertEqual(row_subset.col_metadata_df.shape, (6, 5),
30 |                          "col_metadata_df after row slice is incorrect shape: {} vs (6,5)".format(
31 |                              row_subset.col_metadata_df.shape))
32 | 
33 |         # case 3: row subsetting - sample subset > og # of samples
34 |         with self.assertRaises(AssertionError) as context:
35 |             random_slice.make_specified_size_gctoo(mini_gctoo, 30, "row")
36 |         self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception))
37 | 
38 |         # case 4: col subsetting - happy
39 |         col_subset = random_slice.make_specified_size_gctoo(mini_gctoo, 3, "col")
40 |         self.assertEqual(col_subset.data_df.shape, (6, 3),
41 |                          "data_df after col slice is incorrect shape: {} vs (6,3)".format(col_subset.data_df.shape))
42 |         self.assertEqual(col_subset.row_metadata_df.shape, (6, 5),
43 |                          "row_metadata_df after col slice is incorrect shape: {} vs (6, 5)".format(
44 |                              col_subset.row_metadata_df.shape))
45 |         self.assertEqual(col_subset.col_metadata_df.shape, (3, 5),
46 |                          "col_metadata_df after col slice is incorrect shape: {} vs (3,5)".format(
47 |                              col_subset.col_metadata_df.shape))
48 | 
49 |         # case 5: col subsetting - sample subset > og # of samples
50 |         with self.assertRaises(AssertionError) as context:
51 |             random_slice.make_specified_size_gctoo(mini_gctoo, 7, "col")
52 |         self.assertTrue("number of entries must be smaller than dimension being subsetted " in str(context.exception))
53 | 
54 | 
55 | if __name__ == "__main__":  # only when this file is executed directly
56 |     setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
57 | 
58 |     unittest.main()  # discover and run the tests defined in this module
59 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_subset.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import logging
 3 | import os
 4 | import pandas as pd
 5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 6 | import cmapPy.pandasGEXpress.parse as parse
 7 | import cmapPy.pandasGEXpress.subset as sg
 8 | 
 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
10 | 
11 | 
12 | class TestSubset(unittest.TestCase):
13 | 
14 |     def test_read_arg(self):
15 |         arg_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
16 |         rids = sg._read_arg([arg_path])
17 |         self.assertCountEqual(rids, ["a", "Bb", "c"])
18 | 
19 |     def test_read_arg_bad(self):
20 |         with self.assertRaises(AssertionError) as e:
21 |             sg._read_arg("a b c")
22 |         self.assertIn("arg_out must be a list", str(e.exception))
23 | 
24 |         with self.assertRaises(AssertionError) as e:
25 |             sg._read_arg([1, 2, 3])
26 |         self.assertIn("arg_out must be a list of strings", str(e.exception))
27 | 
28 |     def test_subset_main(self):
29 | 
30 |         in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct")
31 |         rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
32 |         out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct")
33 |         expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct")
34 | 
35 |         args_string = "-i {} --rid {} -ec {} -o {}".format(
36 |             in_gct_path, rid_grp_path, "f", out_name)
37 |         args = sg.build_parser().parse_args(args_string.split())
38 | 
39 |         # Run main method
40 |         sg.subset_main(args)
41 | 
42 |         # Compare output to expected
43 |         out_gct = parse.parse(out_name)
44 |         expected_gct = parse.parse(expected_out_path)
45 | 
46 |         pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df)
47 |         pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df)
48 |         pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df)
49 | 
50 |         # Clean up
51 |         os.remove(out_name)
52 | 
53 |         # gctx with exclude_rid should fail
54 |         args_string2 = "-i {} --rid {} -ec {} -o {}".format(
55 |             "FAKE.gctx", rid_grp_path, "f", out_name)
56 |         args2 = sg.build_parser().parse_args(args_string2.split())
57 | 
58 |         with self.assertRaises(Exception) as e:
59 |             sg.subset_main(args2)
60 |         self.assertIn("exclude_{rid,cid} args not currently supported",
61 |                       str(e.exception))
62 | 
63 | if __name__ == '__main__':  # only when this file is executed directly
64 |     unittest.main()  # discover and run the tests defined in this module
65 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_subset_gctoo.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import logging
  3 | import pandas as pd
  4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
  5 | import cmapPy.pandasGEXpress.GCToo as GCToo
  6 | import cmapPy.pandasGEXpress.subset_gctoo as sg
  7 | 
  8 | 
  9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 10 | 
 11 | 
 12 | class TestSubsetGCToo(unittest.TestCase):
 13 | 
 14 |     @classmethod
 15 |     def setUpClass(cls):
 16 |         data_df = pd.DataFrame([[1, 2, 3], [5, 7, 11], [13, 17, 19], [23, 29, 31]],
 17 |                                index=["a", "b", "c", "d"], columns=["e", "f", "g"])
 18 |         row_metadata_df = pd.DataFrame([["rm1", "rm2"], ["rm3", "rm4"], ["rm5", "rm6"], ["rm7", "rm8"]],
 19 |                                        index=["a", "b", "c", "d"], columns=["rhd1", "rh2"])
 20 |         col_metadata_df = pd.DataFrame([["cm1", "cm2"], ["cm3", "cm4"], ["cm5", "cm6"]],
 21 |                                        index=["e", "f", "g"], columns=["chd1", "chd2"])
 22 |         cls.in_gct = GCToo.GCToo(data_df, row_metadata_df, col_metadata_df)
 23 | 
 24 |     def test_subset_gctoo(self):
 25 | 
 26 |         # Error if resulting GCT is empty
 27 |         with self.assertRaises(AssertionError) as e:
 28 |             sg.subset_gctoo(self.in_gct, rid=["bad"], cid=["x", "y"])
 29 |         self.assertIn("Subsetting yielded an", str(e.exception))
 30 | 
 31 |         # cid and col_bool should not both be provided
 32 |         with self.assertRaises(AssertionError) as e:
 33 |             sg.subset_gctoo(self.in_gct, cid=["e", "f", "g"], col_bool=[True, True, False])
 34 |         self.assertIn("Only one of cid,", str(e.exception))
 35 | 
 36 |         # Providing all 3 row inputs is also bad!
 37 |         with self.assertRaises(AssertionError) as e:
 38 |             sg.subset_gctoo(self.in_gct, rid="blah", ridx="bloop", row_bool="no!")
 39 |         self.assertIn("Only one of rid,", str(e.exception))
 40 | 
 41 |         # happy path
 42 |         out_g = sg.subset_gctoo(self.in_gct, rid=["d", "a", "b"], cidx=[0],
 43 |                                exclude_rid=["a"])
 44 |         pd.util.testing.assert_frame_equal(out_g.data_df, self.in_gct.data_df.iloc[[1, 3], [0]])
 45 | 
 46 |     def test_get_rows_to_keep(self):
 47 | 
 48 |         # rid must be a list
 49 |         with self.assertRaises(AssertionError) as e:
 50 |             sg.get_rows_to_keep(self.in_gct, rid="bad")
 51 |         self.assertIn("rid must be a list", str(e.exception))
 52 | 
 53 |         # bools
 54 |         out_rows = sg.get_rows_to_keep(self.in_gct, row_bool=[True, True, True, False])
 55 |         self.assertCountEqual(out_rows, ["a", "b", "c"])
 56 | 
 57 |         # rid and exclude_rid
 58 |         out_rows2 = sg.get_rows_to_keep(self.in_gct, rid=["a", "c", "d"], exclude_rid=["d"])
 59 |         self.assertCountEqual(out_rows2, ["a", "c"])
 60 | 
 61 |         # keep all rows
 62 |         out_rows3 = sg.get_rows_to_keep(self.in_gct)
 63 |         self.assertCountEqual(out_rows3, ["a", "b", "c", "d"])
 64 | 
 65 |         with self.assertRaises(AssertionError) as e:
 66 |             sg.get_rows_to_keep(self.in_gct, row_bool=[True, False, True])
 67 |         self.assertIn("row_bool must have length", str(e.exception))
 68 | 
 69 |         with self.assertRaises(AssertionError) as e:
 70 |             sg.get_rows_to_keep(self.in_gct, ridx=[True, False, True])
 71 |         self.assertIn("ridx must be a list of integers", str(e.exception))
 72 | 
 73 |         with self.assertRaises(AssertionError) as e:
 74 |             sg.get_rows_to_keep(self.in_gct, ridx=[0, 2, 5])
 75 |         self.assertIn("ridx contains an integer", str(e.exception))
 76 | 
 77 |     def test_get_cols_to_keep(self):
 78 |         # N.B. annoying that we have two extremely similar but separate methods
 79 |         # for rows and columns, but I think it's worth it to have clear error
 80 |         # messages
 81 | 
 82 |         # cid must be a list
 83 |         with self.assertRaises(AssertionError) as e:
 84 |             sg.get_cols_to_keep(self.in_gct, cid="real_bad")
 85 |         self.assertIn("cid must be a list", str(e.exception))
 86 | 
 87 |         # bools
 88 |         out_cols = sg.get_cols_to_keep(self.in_gct, col_bool=[False, True, True])
 89 |         self.assertCountEqual(out_cols, ["f", "g"])
 90 | 
 91 |         # cid and exclude_cid
 92 |         out_cols2 = sg.get_cols_to_keep(self.in_gct, cid=["g", "e", "f"], exclude_cid=["f"], cidx=None)
 93 |         self.assertCountEqual(out_cols2, ["g", "e"])
 94 | 
 95 |         # keep all cols
 96 |         out_cols3 = sg.get_cols_to_keep(self.in_gct)
 97 |         self.assertCountEqual(out_cols3, ["e", "f", "g"])
 98 | 
 99 |         with self.assertRaises(AssertionError) as e:
100 |             sg.get_cols_to_keep(self.in_gct, col_bool=[True, False, True, True])
101 |         self.assertIn("col_bool must have length", str(e.exception))
102 | 
103 |         with self.assertRaises(AssertionError) as e:
104 |             sg.get_cols_to_keep(self.in_gct, cidx=[True, False, True])
105 |         self.assertIn("cidx must be a list of integers", str(e.exception))
106 | 
107 |         with self.assertRaises(AssertionError) as e:
108 |             sg.get_cols_to_keep(self.in_gct, cidx=[10])
109 |         self.assertIn("cidx contains an integer", str(e.exception))
110 | 
111 | if __name__ == '__main__':  # only when this file is executed directly
112 |     setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
113 |     unittest.main()  # discover and run the tests defined in this module
114 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/tests/python3_tests/test_transform_gctoo.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import unittest
 3 | import logging
 4 | import pandas
 5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 6 | import cmapPy.pandasGEXpress.GCToo as GCToo
 7 | import cmapPy.pandasGEXpress.transform_gctoo as tg
 8 | 
 9 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
10 | 
11 | 
12 | class TestSubset(unittest.TestCase):
13 |     def test_transpose(self):
14 |         data_df = pandas.DataFrame({"a":range(2,5), "b":range(7,10)})
15 |         logger.debug("happy path - data_df:\n{}".format(data_df))
16 | 
17 |         row_metadata_df = pandas.DataFrame({"rm1":range(3)}, index=data_df.index)
18 |         logger.debug("row_metadata_df:\n{}".format(row_metadata_df))
19 | 
20 |         col_metadata_df = pandas.DataFrame({"cm1":range(2), "cm2":range(3,5)}, index=data_df.columns)
21 |         logger.debug("col_metadata_df:\n{}".format(col_metadata_df))
22 | 
23 |         my_gctoo = GCToo.GCToo(data_df, row_metadata_df=row_metadata_df, col_metadata_df=col_metadata_df)
24 |         logger.debug("my_gctoo:\n{}".format(my_gctoo))
25 | 
26 |         r = tg.transpose(my_gctoo)
27 |         logger.debug("result r:\n{}".format(r))
28 | 
29 |         logger.debug("r.data_df:\n{}".format(r.data_df))
30 |         self.assertTrue(data_df.equals(r.data_df.T))
31 | 
32 |         logger.debug("r.row_metadata_df:\n{}".format(row_metadata_df))
33 |         self.assertTrue(col_metadata_df.equals(r.row_metadata_df))
34 | 
35 |         logger.debug("r.col_metadata_df:\n{}".format(r.col_metadata_df))
36 |         self.assertTrue(row_metadata_df.equals(r.col_metadata_df))
37 | 
38 | 
39 | if __name__ == '__main__':  # only when this file is executed directly
40 |     setup_logger.setup(verbose=True)  # call the project's logger setup, requesting verbose output
41 |     unittest.main()  # discover and run the tests defined in this module
42 | 


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/transform_gctoo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | transform_gctoo.py
 3 | 
 4 | module to contain various transformations of GCToo objects.  Initially just transpose.
 5 | 
 6 | """
 7 | import logging
 8 | 
 9 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
10 | import cmapPy.pandasGEXpress.GCToo as GCToo
11 | 
12 | 
13 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
14 | 
15 | def transpose(my_gctoo):
16 |     new_gctoo = GCToo.GCToo(
17 |         data_df=my_gctoo.data_df.T,
18 |         row_metadata_df=my_gctoo.col_metadata_df,
19 |         col_metadata_df=my_gctoo.row_metadata_df
20 |     )
21 | 
22 |     return new_gctoo


--------------------------------------------------------------------------------
/cmapPy/pandasGEXpress/write_gct.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import pandas as pd
  3 | import numpy as np
  4 | import os
  5 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
  6 | 
  7 | __author__ = "Lev Litichevskiy"
  8 | __email__ = "lev@broadinstitute.org"
  9 | 
 10 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 11 | 
 12 | # Only writes GCT1.3
 13 | VERSION = "1.3"
 14 | 
 15 | 
 16 | def write(gctoo, out_fname, data_null="NaN", metadata_null="-666", filler_null="-666", data_float_format="%.4f"):
 17 |     """Write a gctoo object to a gct file.
 18 | 
 19 |     Args:
 20 |         gctoo (gctoo object)
 21 |         out_fname (string): filename for output gct file
 22 |         data_null (string): how to represent missing values in the data (default = "NaN")
 23 |         metadata_null (string): how to represent missing values in the metadata (default = "-666")
 24 |         filler_null (string): what value to fill the top-left filler block with (default = "-666")
 25 |         data_float_format (string): how many decimal points to keep in representing data
 26 |             (default = 4 digits; None will keep all digits)
 27 | 
 28 |     Returns:
 29 |         None
 30 | 
 31 |     """
 32 |     # Create handle for output file
 33 |     if not out_fname.endswith(".gct"):
 34 |         out_fname += ".gct"
 35 |     f = open(out_fname, "w")
 36 | 
 37 |     # Write first two lines
 38 |     dims = [str(gctoo.data_df.shape[0]), str(gctoo.data_df.shape[1]),
 39 |             str(gctoo.row_metadata_df.shape[1]), str(gctoo.col_metadata_df.shape[1])]
 40 |     write_version_and_dims(VERSION, dims, f)
 41 | 
 42 |     # Write top half of the gct
 43 |     write_top_half(f, gctoo.row_metadata_df, gctoo.col_metadata_df,
 44 |                    metadata_null, filler_null)
 45 | 
 46 |     # Write bottom half of the gct
 47 |     write_bottom_half(f, gctoo.row_metadata_df, gctoo.data_df,
 48 |                       data_null, data_float_format, metadata_null)
 49 | 
 50 |     f.close()
 51 |     logger.info("GCT has been written to {}".format(out_fname))
 52 | 
 53 | 
 54 | def write_version_and_dims(version, dims, f):
 55 |     """Write first two lines of gct file.
 56 | 
 57 |     Args:
 58 |         version (string): 1.3 by default
 59 |         dims (list of strings): length = 4
 60 |         f (file handle): handle of output file
 61 |     Returns:
 62 |         nothing
 63 |     """
 64 |     f.write(("#" + version + "\n"))
 65 |     f.write((dims[0] + "\t" + dims[1] + "\t" + dims[2] + "\t" + dims[3] + "\n"))
 66 | 
 67 | 
 68 | def write_top_half(f, row_metadata_df, col_metadata_df, metadata_null, filler_null):
 69 |     """ Write the top half of the gct file: top-left filler values, row metadata
 70 |     headers, and top-right column metadata.
 71 | 
 72 |     Args:
 73 |         f (file handle): handle for output file
 74 |         row_metadata_df (pandas df)
 75 |         col_metadata_df (pandas df)
 76 |         metadata_null (string): how to represent missing values in the metadata
 77 |         filler_null (string): what value to fill the top-left filler block with
 78 | 
 79 |     Returns:
 80 |         None
 81 |     """
 82 |     # Initialize the top half of the gct including the third line
 83 |     size_of_top_half_df = (1 + col_metadata_df.shape[1],
 84 |                            1 + row_metadata_df.shape[1] + col_metadata_df.shape[0])
 85 | 
 86 |     top_half_df = pd.DataFrame(np.full(size_of_top_half_df, filler_null, dtype=object))
 87 | 
 88 |     # Assemble the third line of the gct: "id", then rhds, then cids
 89 |     top_half_df.iloc[0, :] = np.hstack(("id", row_metadata_df.columns.values, col_metadata_df.index.values))
 90 | 
 91 |     # Insert the chds
 92 |     top_half_df.iloc[range(1, top_half_df.shape[0]), 0] = col_metadata_df.columns.values
 93 | 
 94 |     # Insert the column metadata, but first convert to strings and replace NaNs
 95 |     col_metadata_indices = (range(1, top_half_df.shape[0]),
 96 |                             range(1 + row_metadata_df.shape[1], top_half_df.shape[1]))
 97 |     # pd.DataFrame.loc to insert into dataframe(python3)
 98 |     top_half_df.loc[col_metadata_indices[0], col_metadata_indices[1]] = (
 99 |         col_metadata_df.astype(str).replace("nan", value=metadata_null).T.values)
100 | 
101 |     # Write top_half_df to file
102 |     top_half_df.to_csv(f, header=False, index=False, sep="\t")
103 | 
104 | 
def write_bottom_half(f, row_metadata_df, data_df, data_null, data_float_format, metadata_null):
    """ Write the lower part of the gct body: one line per row holding the
    row id, the row metadata values and the data values.

    Args:
        f (file handle): handle for output file
        row_metadata_df (pandas df)
        data_df (pandas df)
        data_null (string): how to represent missing values in the data
        data_float_format (string): float format passed to to_csv for the data
        metadata_null (string): how to represent missing values in the metadata

    Returns:
        None
    """
    num_rows = row_metadata_df.shape[0]
    num_rhd = row_metadata_df.shape[1]

    # Left block (rids + row metadata), pre-filled with the metadata null
    left_block = pd.DataFrame(np.full((num_rows, 1 + num_rhd), metadata_null, dtype=object))

    # Glue the data matrix on the right; positional column labels keep the
    # iloc-based insertions below simple
    data_block = data_df.reset_index(drop=True)
    bottom = pd.concat([left_block, data_block], axis=1)
    bottom.columns = range(bottom.shape[1])

    # First column: the rids
    bottom.iloc[:, 0] = row_metadata_df.index.values

    # Next columns: row metadata, stringified with "nan" replaced
    rhd_cols = range(1, 1 + num_rhd)
    row_meta_as_str = row_metadata_df.astype(str).replace("nan", value=metadata_null)
    bottom.iloc[:, rhd_cols] = row_meta_as_str.values

    bottom.to_csv(f, header=False, index=False, sep="\t",
                  na_rep=data_null,
                  float_format=data_float_format)
140 | 
141 | 
def append_dims_and_file_extension(fname, data_df):
    """Build the output filename "<stem>_n<cols>x<rows>.gct".
    N.B. Dimensions are cols x rows.

    Args:
        fname (string): output filename
        data_df (pandas df)
    Returns:
        out_fname (string): output filename with matrix dims and .gct appended
    """
    # A trailing .gct is dropped so it ends up after the dims, not before them
    if fname.endswith(".gct"):
        stem = os.path.splitext(fname)[0]
    else:
        stem = fname

    num_rows, num_cols = data_df.shape[0], data_df.shape[1]
    return '{0}_n{1}x{2}.gct'.format(stem, num_cols, num_rows)
162 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/set_io/__init__.py


--------------------------------------------------------------------------------
/cmapPy/set_io/gmt.py:
--------------------------------------------------------------------------------
  1 | """
  2 | gmt.py
  3 | 
  4 | IO methods for handling GMT files.
  5 | 
  6 | A GMT is stored as a list of dictionaries.
  7 | Each line is its own dictionary.
  8 | Each dictionary has the following keys:
  9 |     - head (string): identifier for the set
 10 |     - desc (string): longer description of the set
 11 |     - entry (list): members of the set
 12 | 
 13 | AUTHOR: Corey Flynn, Broad Institute, 2012
 14 | MODIFIED: Lev Litichevskiy, 2017
 15 | 
 16 | """
 17 | import os
 18 | 
 19 | SET_IDENTIFIER_FIELD = "head"
 20 | SET_DESC_FIELD = "desc"
 21 | SET_MEMBERS_FIELD = "entry"
 22 | 
 23 | 
 24 | def read(file_path):
 25 |     """ Read a gmt file at the path specified by file_path.
 26 | 
 27 |     Args:
 28 |         file_path (string): path to gmt file
 29 | 
 30 |     Returns:
 31 |         gmt (GMT object): list of dicts, where each dict corresponds to one
 32 |             line of the GMT file
 33 | 
 34 |     """
 35 |     # Read in file
 36 |     actual_file_path = os.path.expanduser(file_path)
 37 |     with open(actual_file_path, 'r') as f:
 38 |         lines = f.readlines()
 39 |     
 40 |     # Create GMT object
 41 |     gmt = []
 42 |     
 43 |     # Iterate over each line
 44 |     for line_num, line in enumerate(lines):
 45 |         # Separate along tabs
 46 |         fields = line.split('\t')
 47 | 
 48 |         assert len(fields) > 2, (
 49 |             "Each line must have at least 3 tab-delimited items. " +
 50 |             "line_num: {}, fields: {}").format(line_num, fields)
 51 |         
 52 |         # Get rid of trailing whitespace
 53 |         fields[-1] = fields[-1].rstrip()
 54 |         
 55 |         # Collect entries
 56 |         entries = fields[2:]
 57 |         
 58 |         # Remove empty entries
 59 |         entries = [x for x in entries if x]
 60 | 
 61 |         assert len(set(entries)) == len(entries), (
 62 |             "There should not be duplicate entries for the same set. " +
 63 |             "line_num: {}, entries: {}").format(line_num, entries)
 64 | 
 65 |         # Store this line as a dictionary
 66 |         line_dict = {SET_IDENTIFIER_FIELD: fields[0],
 67 |                      SET_DESC_FIELD: fields[1],
 68 |                      SET_MEMBERS_FIELD: entries}
 69 |         gmt.append(line_dict)
 70 | 
 71 |     verify_gmt_integrity(gmt)
 72 | 
 73 |     return gmt
 74 | 
 75 | 
 76 | def verify_gmt_integrity(gmt):
 77 |     """ Make sure that set ids are unique.
 78 | 
 79 |     Args:
 80 |         gmt (GMT object): list of dicts
 81 | 
 82 |     Returns:
 83 |         None
 84 | 
 85 |     """
 86 | 
 87 |     # Verify that set ids are unique
 88 |     set_ids = [d[SET_IDENTIFIER_FIELD] for d in gmt]
 89 |     assert len(set(set_ids)) == len(set_ids), (
 90 |         "Set identifiers should be unique. set_ids: {}".format(set_ids))
 91 | 
 92 | 
 93 | def write(gmt, out_path):
 94 |     """ Write a GMT to a text file.
 95 | 
 96 |     Args:
 97 |         gmt (GMT object): list of dicts
 98 |         out_path (string): output path
 99 | 
100 |     Returns:
101 |         None
102 | 
103 |     """
104 |     with open(out_path, 'w') as f:
105 |         for _, each_dict in enumerate(gmt):
106 |             f.write(each_dict[SET_IDENTIFIER_FIELD] + '\t')
107 |             f.write(each_dict[SET_DESC_FIELD] + '\t')
108 |             f.write('\t'.join([str(entry) for entry in each_dict[SET_MEMBERS_FIELD]]))
109 |             f.write('\n')
110 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/grp.py:
--------------------------------------------------------------------------------
 1 | """
 2 | grp.py
 3 | 
 4 | IO methods for handling GRP files.
 5 | 
 6 | A GRP file is stored as a list. Lines beginning with # are ignored.
 7 | 
 8 | AUTHOR: David Wadden, Broad Institute, 2012
 9 | MODIFIED: Lev Litichevskiy, 2017
10 | """
11 | 
12 | import os
13 | import re
14 | 
15 | 
def read(in_path):
    """ Read a grp file at the path specified by in_path.

    Lines beginning with "#" are treated as comments and skipped. Blank or
    whitespace-only lines are skipped as well, so they no longer show up as
    empty-string entries. Every other line is stripped of surrounding
    whitespace and kept.

    Args:
        in_path (string): path to GRP file

    Returns:
        grp (list): one string per non-comment, non-blank line

    Raises:
        AssertionError: if in_path does not exist

    """
    assert os.path.exists(in_path), "The following GRP file can't be found. in_path: {}".format(in_path)

    grp = []
    with open(in_path, "r") as f:
        for line in f:
            # comment lines are recognized by a "#" in the very first column
            if line.startswith("#"):
                continue

            # emptiness must be tested after stripping: a raw blank line is
            # "\n", which is truthy
            entry = line.strip()
            if entry:
                grp.append(entry)

    return grp
34 | 
35 | 
def write(grp, out_path):
    """ Dump a GRP to a text file, one member per line.

    Args:
        grp (list): GRP object to write to new-line delimited text file;
            each member is converted with str()
        out_path (string): output path

    Returns:
        None

    """
    with open(out_path, "w") as handle:
        handle.writelines("{}\n".format(str(member)) for member in grp)


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/cmapPy/set_io/tests/__init__.py


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/functional_tests/test.gmt:
--------------------------------------------------------------------------------
1 | A	this one is A	a1	a3	a2
2 | B	this one is B	b4	b2	b3
3 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/functional_tests/test.grp:
--------------------------------------------------------------------------------
1 | #a
2 | r
3 | d
4 | e
5 | #f
6 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/functional_tests/test_bad.gmt:
--------------------------------------------------------------------------------
1 | A   this one is A   a1  a3  a2
2 | B	this one is B	b4	b2	b3
3 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/functional_tests/test_bad2.gmt:
--------------------------------------------------------------------------------
1 | A	this one is A	a1	a3	a2
2 | B	this one is B	b4	b2	b2
3 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/test_gmt.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import unittest
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.set_io.gmt as gmt
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | FUNCTIONAL_TESTS_DIR = "cmapPy/set_io/tests/functional_tests/"
 9 | 
10 | 
class TestGMT(unittest.TestCase):
	# Tests for cmapPy.set_io.gmt: reading, integrity checking and writing.
	# Fixture paths are relative (see FUNCTIONAL_TESTS_DIR).

	@classmethod
	def setUpClass(cls):
		# Expected in-memory form of functional_tests/test.gmt
		cls.example_gmt = [{"head": "A", "desc": "this one is A", "entry": ["a1", "a3", "a2"]},
		                   {"head": "B", "desc": "this one is B", "entry": ["b4", "b2", "b3"]}]

	def test_read(self):

		# Well-formed file parses into the expected list of dicts
		in_gmt = gmt.read(os.path.join(FUNCTIONAL_TESTS_DIR, "test.gmt"))

		self.assertEqual(len(self.example_gmt), len(in_gmt))
		self.assertEqual(self.example_gmt[0], in_gmt[0])
		self.assertEqual(self.example_gmt[1], in_gmt[1])

		# test_bad.gmt: first line is space-delimited rather than tab-delimited
		with self.assertRaises(AssertionError) as e:
			gmt.read(os.path.join(FUNCTIONAL_TESTS_DIR, "test_bad.gmt"))
		self.assertIn("3 tab-delimited items. line_num: 0", str(e.exception))

		# test_bad2.gmt: second line repeats the member "b2"
		with self.assertRaises(AssertionError) as e:
			gmt.read(os.path.join(FUNCTIONAL_TESTS_DIR, "test_bad2.gmt"))
		self.assertIn("same set. line_num: 1", str(e.exception))

	def test_verify_gmt_integrity(self):

		# Two sets sharing the identifier "A" must be rejected
		bad_gmt = [{"head": "A", "desc": "blah", "entry": ["a1", "a3", "a2"]},
		           {"head": "A", "desc": "blah", "entry": ["b4", "b2", "b3"]}]

		with self.assertRaises(AssertionError) as e:
			gmt.verify_gmt_integrity(bad_gmt)
		self.assertIn("Set identifiers should be unique", str(e.exception))

	def test_write(self):

		# Round trip: write the example, read it back, compare
		out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_write.gmt")
		gmt.write(self.example_gmt, out_path)
		self.assertTrue(os.path.exists(out_path))

		read_back_in = gmt.read(out_path)
		self.assertEqual(len(self.example_gmt), len(read_back_in))
		self.assertEqual(self.example_gmt[0], read_back_in[0])
		self.assertEqual(self.example_gmt[1], read_back_in[1])

		# Cleanup
		os.remove(out_path)

# Running the file directly enables verbose logging before the tests start
if __name__ == "__main__":
	setup_logger.setup(verbose=True)

	unittest.main()
61 | 


--------------------------------------------------------------------------------
/cmapPy/set_io/tests/test_grp.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import unittest
 4 | import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 5 | import cmapPy.set_io.grp as grp
 6 | 
 7 | logger = logging.getLogger(setup_logger.LOGGER_NAME)
 8 | FUNCTIONAL_TESTS_DIR = "cmapPy/set_io/tests/functional_tests/"
 9 | 
10 | 
class TestGRP(unittest.TestCase):
	# Tests for cmapPy.set_io.grp: reading and writing.
	# Fixture paths are relative (see FUNCTIONAL_TESTS_DIR).

	def test_read(self):

		# test.grp holds "#a", "r", "d", "e", "#f"; the "#" lines are comments
		in_grp = grp.read(os.path.join(FUNCTIONAL_TESTS_DIR, "test.grp"))
		self.assertEqual(in_grp, ["r", "d", "e"])

		# A path that does not exist must trip the existence assertion
		with self.assertRaises(AssertionError) as e:
			grp.read("testt.grp")
		self.assertIn("The following GRP file", str(e.exception))

	def test_write(self):

		example_grp = ["x", "z", "w"]

		# Round trip: write the example, read it back, compare
		out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_write.grp")
		grp.write(example_grp, out_path)
		self.assertTrue(os.path.exists(out_path))

		read_back_in = grp.read(out_path)
		self.assertEqual(example_grp, read_back_in)

		# Cleanup
		os.remove(out_path)

# Running the file directly enables verbose logging before the tests start
if __name__ == "__main__":
	setup_logger.setup(verbose=True)

	unittest.main()
40 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | from . import scattergram
2 | from . import stratogram
3 | from . import cohort_view


--------------------------------------------------------------------------------
/cmapPy/visualization/cohort_view.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | import logging
  4 | 
  5 | from IPython.display import display
  6 | 
  7 | import numpy as np
  8 | import pandas as pd 
  9 | 
 10 | logger = logging.getLogger()
 11 | 
 12 | 
 13 | def cohort_view_table(df,
 14 |                       category_label="category_label",
 15 |                       category_order="category_order",
 16 |                       flags=[],
 17 |                       flag_display_labels=[],
 18 |                      add_percentages=True):
 19 | 
 20 |     ''' Generate a DataFrame showing counts and percentages
 21 |     of subsets (defined by flags), stratified by categories.
 22 |     For instance, each row (category) may be a selectivity
 23 |     bucket, and each column can be the number of compounds in
 24 |     that bucket that passed a given threshold. A "Total"
 25 |     column shows the total number of compounds in each 
 26 |     bucket and a grand total sums them all up.
 27 |     @param df: DataFrame where each row is a compound and
 28 |     columns are various metrics and flags
 29 |     @kwarg category_label: name of the column that defines
 30 |     a category. The data is stratified based on this fieild.
 31 |     @kwarg category_order: order in which the categories should
 32 |     be displayed as rows of the table. There should be a one
 33 |     to one correspondence between category_label and category_order.
 34 |     @kwarg flags: list of column names defining binary flags. 
 35 |     These flags define subsets that will be counted and displayed
 36 |     as columns of the output table.
 37 |     @kwarg flag_display_labels: string labels for output columns
 38 |     corresonding to flags
 39 |     @kwarg add_percentages: whether to display percentages 
 40 |     alongside the counts.
 41 |     '''
 42 |     assert len(flags) == len(flag_display_labels), '"flags" and "flag_display_labels" should have the same length'
 43 |     
 44 |     df['Total'] = 1
 45 |     columns = ['Total'] + flags 
 46 |     df = (
 47 |         df
 48 |         .groupby([category_order, category_label])[columns]
 49 |         .sum()
 50 |         .sort_index(axis=0, level=category_order)
 51 |         .reset_index(level=[category_order])
 52 |         .drop(columns=category_order)
 53 |     )
 54 | 
 55 |     column_names = ["Total"] + flag_display_labels 
 56 |     df.columns = column_names
 57 |     df.index.names = ['Category'] 
 58 | 
 59 |     df = df.T
 60 |     num_categories = len(df.columns)
 61 |     logger.info("num_categories: {}".format(num_categories))
 62 | 
 63 |     # Test comopound fields
 64 |     cpd_fields = [c for c in df.columns if 'Test subset' in c]
 65 |     if len(cpd_fields) != 0:
 66 |         df['Test Compounds Total'] = df[cpd_fields].sum(1)
 67 |     df['Grand Total'] = df.iloc[:, :num_categories].sum(1)
 68 |     df = df.T
 69 |     df.index.name = None
 70 |     
 71 |     if add_percentages:
 72 |         df = df.transform(_add_row_percentages, axis=1)
 73 |     return df
 74 | 
 75 | 
 76 | def _fmt_total_percentages(x, total):
 77 |     '''
 78 |     Formatting function for DataFrame.Style. Formats the 
 79 |     "Total" column to show percentages. 
 80 |     '''
 81 |     s = '''<span style="width:50%;float: left;text-align:right;font-weight:bold">{:,d} </span>
 82 |     <span style="font-size:1em;color:#FF7043;width:50%;text-align:left;float: right;padding-left:1em;font-weight:bold">
 83 |     ({:.0%})</span>'''.format(int(x), float(x) / total)
 84 |     return s
 85 | 
 86 | 
 87 | def _add_row_percentages(s):
 88 |     '''Convert all columns except for "Total" to a string
 89 |     that shows the integer count as well as the percentage
 90 |     of Total within the row.'''
 91 |     s = s + 0
 92 |     index = s.index
 93 |     assert "Total" in index
 94 |     total = s['Total']
 95 |     for label, x in s.iteritems():
 96 |         if label == "Total":
 97 |             continue
 98 |         s[label] = '''<span style="width:50%;float: left;text-align:right">{:,d} </span>
 99 |         <span style="font-size:1em;color:#888888;width:50%;text-align:left;float: right;padding-left:1em">
100 |         ({:.0%})</span>'''.format(int(x), float(x) / total)
101 |     return s
102 | 
103 | 
def display_cohort_stats_table(table, barplot_column):
    '''Build a styled (pandas Styler) version of a cohort table.

    @param table: DataFrame with a "Total" column and a "Grand Total"
    row (both are looked up by label below); presumably the output of
    cohort_view_table -- confirm against callers
    @param barplot_column: label of the column that gets in-cell bars
    and a coloured left border
    @return: the Styler object (nothing is displayed here)
    '''
    font_family = "Roboto"
    idx = pd.IndexSlice
    # labels of the rows corresponding to categories; excludes every
    # row whose label contains "Total" (the summary rows)
    group_ids = [x for x in table.index if 'Total' not in x]
    
    # Upper bound of the bar scale: sum of the bar column over the
    # category rows
    barplot_max = table.loc[group_ids, barplot_column].sum()
    
    # "Total" cell of the "Grand Total" row; used as the denominator when
    # formatting the Total column as percentages
    total = table.loc['Grand Total', 'Total']
    table_stylized = (
        table
        .style
        .format(
            lambda s: _fmt_total_percentages(s, total),
            subset=pd.IndexSlice[:, 'Total']
        )
        # NOTE(review): Styler.applymap is deprecated in newer pandas in
        # favour of Styler.map -- confirm the targeted pandas version
        .applymap(lambda x : 'text-align:center;')
        .applymap(lambda x: "border-left:solid thin #d65f5f", subset=idx[:, barplot_column])
        # bars only on the category rows, scaled from 0 to barplot_max
        .bar(subset=idx[group_ids, barplot_column], color='#FFDACF', vmin=0, vmax=barplot_max)
        .applymap(lambda x: "padding:0.5em 1em 0.5em 1em")
        .applymap(lambda x: "background:#444;color:white;border:solid thin #000;font-weight:bold", subset=idx['Grand Total', :])
        .applymap(lambda x: "border-left:solid thin #ddd", subset=idx[:, 'Total'])
         .set_table_styles(
             [
                 {'selector' : 'table',
                  'props' : [('font-family', font_family), ('font-size', '30px'), ('border', 'solid thin #999')]
                 },
                 {'selector' : 'thead, tbody', 'props' : [
                     ('border', 'solid 1px #ddd'),
                 ]
                 },
                 {'selector' :
                  'thead', 'props' : [
                     ('border-bottom', 'solid 2px #ddd'),
                     ('border-top', 'solid 2px #ddd'),
                     ('background', '#fefefe'), ('text-align', 'center'),
                     ('font-family', font_family),
                      ('font-size' , '1em')
                 ]
                 },
                 {'selector' : 'th',
                  'props' : [ 
                      ('text-align', 'center'),
                      ('color' , '#444'),
                  ]
                 },
                 {'selector' : 'th.col_heading',
                  'props' : [ 
                      ('max-width', '8em')
                  ]
                 },
                 # this selector currently carries no properties (both are
                 # commented out)
                 {'selector' : 'th:not(.blank)',
                  'props' : [ 
#                       ('border-left','solid thin #ddd'), 
#                       ('border-right','solid thin #ddd'), 
                  ]
                 },
                 # NOTE(review): '1.em' looks like a typo for '1em' (compare
                 # the thead rule above) -- confirm before changing
                 {'selector' : 'tbody', 'props' : [ 
                     ('text-align', 'center'), ('background', '#fff'), ('font-size' , '1.em'),
                                                   ('font-family', font_family)]},
                 {'selector' : '.row_heading',
                  'props' : [('border-right', 'solid thin #ddd'), ('text-align', 'left')]}                 
             ]
          )
        )
    # Separate the "Test Compounds Total" row, when present, with a top border
    if 'Test Compounds Total' in table.index:
        table_stylized = table_stylized.applymap(lambda x: "border-top:solid thin #aaa", subset=idx['Test Compounds Total', :])
    
    return table_stylized
175 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/scattergram.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on Sep 30, 2019
  3 | @author: Navid Dianati
  4 | @contact: navid@broadinstitute.org
  5 | '''
  6 | 
  7 | import logging
  8 | import os
  9 | 
 10 | import matplotlib
 11 | 
 12 | import matplotlib.pyplot as plt
 13 | import numpy as np
 14 | import pandas as pd
 15 | import seaborn as sns
 16 | 
 17 | logger = logging.getLogger()
 18 | 
 19 | 
 20 | def scattergram(
 21 |     df, columns, column_names, title="",
 22 |     outfile='',
 23 |     fig_dpi=150,
 24 |     fontfamily="Roboto"
 25 |     ):
 26 |     '''
 27 |     Make a grid of scatterplots of a set of columns against each other. 
 28 |     The values should all be "normalized", i.e., between 0 and 1.
 29 |     @param df: Pandas DataFrame containing the variables to be scattered.
 30 |     @param columns: list of column names to plot.
 31 |     @param column_names: list of display names corresponding to the 
 32 |     variable columns.
 33 |     @return: g: Seaborn PairGrid object
 34 |     '''
 35 |     
 36 |     df = df.copy()[columns]
 37 |     
 38 |     # rename the columns
 39 |     df.columns = column_names
 40 | 
 41 |     df = df.dropna()
 42 |     with sns.axes_style('ticks') as c1:
 43 |         
 44 |         g = sns.PairGrid(
 45 |             data=df, vars=column_names,
 46 |             palette="Greys", despine=False,
 47 |             height=2
 48 |             )
 49 |         g.map_lower(
 50 |             plt.scatter,
 51 |             s=10,
 52 |             lw=0,
 53 |             alpha=0.5,
 54 |             color="#555555"
 55 |             )
 56 |         g.map_diag(
 57 |             _plot_hist,
 58 |             **dict(
 59 |                 normed=True,
 60 |                 alpha=0.5,
 61 |                 bins=np.linspace(-0.00001, 1.00001, 21),
 62 |                 histtype="bar",
 63 |                 edgecolor="#ffffff"
 64 |                 )
 65 |             )
 66 |         
 67 |         if title:
 68 |             g.fig.text(1, 1,
 69 |                 "{} (N = {:,})".format(title, len(df)),
 70 |                 fontsize=30,
 71 |                 fontname=fontfamily,
 72 |                 fontweight="bold",
 73 |                 horizontalalignment="right",
 74 |                 verticalalignment="top"
 75 |                 )
 76 |         
 77 |         plt.subplots_adjust(wspace=0, hspace=0.0)
 78 |         font_properties = dict(family=fontfamily, weight="bold")
 79 |         _adjust_axes(g, font_properties)
 80 |         _draw_row_labels(g, column_names)
 81 |         
 82 |         if outfile:
 83 |             plt.savefig(outfile, dpi=fig_dpi)
 84 |         return g
 85 |     
 86 |     
 87 | def _adjust_axes(g, font_properties={}):
 88 |     for i, j in zip(*np.triu_indices_from(g.axes, 1)):
 89 |         g.axes[i, j].set_visible(False)
 90 | 
 91 |     for i in range(g.axes.shape[0]):
 92 |         for j in range(g.axes.shape[1]):
 93 |             ax = g.axes[i, j]
 94 |             if i > j:
 95 |                 ax.set_zorder(100)
 96 |                 ax.set_xlim(-0.1, 1.1)
 97 |                 ax.set_ylim(-0.1, 1.1)
 98 |                 ax.set_ylabel('')
 99 |                 ax.set_xlabel('')
100 |                 frame_line_width = 2
101 |                 _set_axis_thickness(ax, frame_line_width)
102 |                 ax.xaxis.set_tick_params(width=frame_line_width)
103 |                 ax.yaxis.set_tick_params(width=frame_line_width)
104 |                 _set_ticks_fontproperties(ax, font_properties)
105 |     for i in range(g.axes.shape[0]):
106 |         ax = g.axes[i, i]
107 |         ax.set_ylim(-0.1, 1.1)
108 |         ax.set_xlim(-0.1, 1.1)
109 |         ax.set_ylabel('')
110 |         ax.set_xlabel('')
111 |         ax.set_yticks([0, 0.5, 1])
112 |         ax.set_xticks([0, 0.5, 1])
113 |         _set_axis_thickness(ax, 1)
114 |         _set_axis_style(ax, '--')
115 |         _set_ticks_fontproperties(ax, font_properties)
116 | 
117 | 
118 | def _draw_row_labels(g, column_names):
119 |     for i in range(g.axes.shape[0]):
120 |         label = column_names[i]
121 |         ax = g.axes[i, i]
122 |         ax.annotate(label, (0.5, .5),
123 |                 horizontalalignment="center",
124 |                 verticalalignment="center",
125 |                 fontweight="bold",
126 |                 fontname="Roboto",
127 |                 fontsize=18,
128 |                 zorder=100,
129 | #                     bbox=dict(boxstyle="square,pad=0.5", fc="white", ec="#dddddd", lw=0)
130 |                ) 
131 | 
132 | 
133 | def _set_axis_thickness(ax, width):
134 |     for axis in ['top', 'bottom', 'left', 'right']:
135 |         ax.spines[axis].set_linewidth(width)
136 | 
137 | 
138 | def _set_axis_style(ax, linestyle):
139 |     for axis in ['top', 'bottom', 'left', 'right']:
140 |         ax.spines[axis].set_linestyle(linestyle)
141 | 
142 | 
def _plot_hist(data, **kwargs):
    '''Draw a histogram of data on the current axes with clipping
    enabled; any other keyword is forwarded to plt.hist. Nothing
    is returned.'''
    plt.hist(
        data,
        clip_on=True,
        **kwargs
    )
145 | 
146 | 
147 | def _set_ticks_fontproperties(ax, font_properties):
148 |     ax.set_xticklabels(ax.get_xticks(), font_properties)
149 |     ax.set_yticklabels(ax.get_yticks(), font_properties)
150 | 
151 | 
def plot_selected_points_among_all(*args, **kwargs):
    '''
    Legacy function: forwards every argument to scattergram and
    returns its result unchanged.
    '''
    grid = scattergram(*args, **kwargs)
    return grid
157 | 
158 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/test_cohort_view.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Sep 30, 2019
 3 | 
 4 | @author: Navid Dianati
 5 | @contact: navid@broadinstitute.org
 6 | '''
 7 | import unittest
 8 | 
 9 | import cohort_view
10 | import matplotlib.pyplot as plt
11 | import pandas as pd
12 | 
13 | 
class Test(unittest.TestCase):
    # Smoke test for cohort_view.cohort_view_table. The fixture path is
    # relative, and the module is imported by its bare name.

    def testCohortView(self):
        filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt"
        df = pd.read_csv(filename, sep="\t")

        # Derive 0/1 flag columns by thresholding two metric columns
        # ("+ 0" turns the boolean result into integers)
        df['is_reproducible'] = (df['cc_q75'] > 0.2) + 0
        df['is_high_mag'] = (df['mag_vi'] > 0.2) + 0
        flags = ['is_reproducible', 'is_high_mag']
        column_names = ['Reproducible', 'magnitude']
        table = cohort_view.cohort_view_table(
            df,
            category_label="category_label",
            category_order="category_order",
            flags=flags,
            flag_display_labels=column_names
            
            )
        # No assertions: the table is only printed for visual inspection
        print(table)
#         plt.savefig("./test_files/cohort_view_test.html", dpi=150)


if __name__ == "__main__":
    # import sys;sys.argv = ['', 'Test.testStratogram']
    unittest.main()
39 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/test_scattergram.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Sep 30, 2019
 3 | 
 4 | @author: Navid Dianati
 5 | @contact: navid@broadinstitute.org
 6 | '''
 7 | import unittest
 8 | 
 9 | import matplotlib.pyplot as plt
10 | import pandas as pd
11 | import scattergram
12 | 
13 | 
class Test(unittest.TestCase):
    # Smoke tests for scattergram.scattergram. The fixture path is
    # relative, and the module is imported by its bare name.

    def testScattergram1(self):
        # Builds the figure and shows it on screen.
        # NOTE(review): plt.show() may block under an interactive backend --
        # confirm how this test is meant to be run.
        filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt"
        df = pd.read_csv(filename, sep="\t")
        plot_columns = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi']
        column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude']
        scattergram.scattergram(
            df,
            columns=plot_columns,
            column_names=column_names,
            title="This is a test"
            )
        plt.show()
    
    def testScattergram2(self):
        # Same plot, but saved to a file at low resolution instead of shown
        filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt"
        df = pd.read_csv(filename, sep="\t")
        plot_columns = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi']
        column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude']
        scattergram.scattergram(
            df,
            columns=plot_columns,
            column_names=column_names,
            title="This is a test",
            outfile="./test_files/deleteme.png",
            fig_dpi=50
            )


if __name__ == "__main__":
    # import sys;sys.argv = ['', 'Test.testScattergram']
    unittest.main()
47 | 


--------------------------------------------------------------------------------
/cmapPy/visualization/test_stratogram.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Created on Sep 30, 2019
 3 | 
 4 | @author: Navid Dianati
 5 | @contact: navid@broadinstitute.org
 6 | '''
 7 | import unittest
 8 | 
 9 | import matplotlib.pyplot as plt
10 | import pandas as pd
11 | import stratogram
12 | 
13 | 
14 | class Test(unittest.TestCase):
15 | 
16 |     def testStratogram(self):
17 |         filename = "./test_files/PBRANT_CYCLE1_key_metrics_expanded_sample.txt"
18 |         df = pd.read_csv(filename, sep="\t")
19 |         metrics = ['ss_ltn2', 'cc_q75', 'spec_vi', 'mag_vi']
20 |         column_names = ['Strength', 'Reproducibility', 'specificity', 'magnitude']
21 |         stratogram.stratogram(
22 |             df,
23 |             category_definition="category_label",
24 |             category_label="category_label_abridged",
25 |             category_order="category_order",
26 |             metrics=metrics,
27 |             figsize=(20, 15),
28 |             column_display_names=column_names,
29 |             xtick_orientation="horizontal",
30 |             ylabel_fontsize=15,
31 |             xlabel_fontsize=15,
32 |             xlabel_fontcolor="#555555",
33 |             ylabel_fontcolor="#555555",
34 |             fontfamily="Roboto"
35 |             )
36 |         plt.savefig("./test_files/stratogram_test.png", dpi=150)
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     # To run a single test instead, e.g.: import sys; sys.argv = ['', 'Test.testStratogram']
41 |     unittest.main()
42 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | cmapPy==2.2.0
2 | 


--------------------------------------------------------------------------------
/docs/source/available_modules.rst:
--------------------------------------------------------------------------------
 1 | .. _available_modules:
 2 | 
 3 | Available Modules
 4 | =================
 5 | 
 6 | clue_api_client
 7 | ---------------
 8 | 
 9 |   A Python client for easy interaction with the Connectivity Map (CLUE) API. 
10 | 
11 |   Maintainer: David Lahr, dlahr@broadinstitute.org
12 | 
13 | pandasGEXpress
14 | --------------
15 | 
16 |   A package (integrated with Python's pandas package) allowing users to easily read, modify, and write .gct and .gctx files.
17 | 
18 |   Also features the following command-line tools:
19 |   
20 | 	``gct2gctx``: converts .gct to .gctx file. Type ``gct2gctx -h`` for help.
21 | 	
22 | 	``gctx2gct``: converts .gctx to .gct file. Type ``gctx2gct -h`` for help.
23 | 
24 | 	``concat``: Concatenates two or more .gct/x files as specified by user. Type ``concat -h`` for help.
25 | 
26 |   Maintainer: Oana Enache, oana@broadinstitute.org
27 |   
28 | set_io
29 | ------
30 | 
31 |    set_io contains simple scripts for parsing two other common file types used by the Connectivity Map: GRP and GMT files. The GRP file is used for storing a single set of things (e.g. a single gene set), while the GMT file is used for storing multiple sets of things (e.g. several gene sets).
32 |    
33 |    Maintainer: Lev Litichevskiy, lev@broadinstitute.org 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/docs/source/build.rst:
--------------------------------------------------------------------------------
 1 | .. _install:
 2 | 
 3 | Installation
 4 | ============
 5 | 
 6 | We highly recommend using a prebuilt distribution of cmapPy along with a virtual environment (here we demonstrate how to use it with conda).
 7 | 
 8 | **Option 1 (recommended): Set up pandasGEXpress in a new conda environment**
 9 | 
10 | * (All operating systems; If you haven't already) install ``miniconda``
11 | 	* Download/follow instructions provided `here <https://conda.io/miniconda.html>`_. Unless you have personal preferences/reasons to do so, we recommend installing Miniconda over Anaconda because it's more lightweight.
12 | 	* On the command line, type ``conda info`` to verify that conda has been properly installed on your system. You should see some information about the "current conda install"; if not, your installation didn't work.
13 | 
14 | * (Linux and Mac) Create & activate your cmapPy environment:
15 | 
16 | 	Note. We currently use Python 2.7.11 for our production code (hence its specification); however, other versions of Python 2 should be stable as well. We do not currently support Python 3. 
17 | 
18 | 	**Step 1** 
19 | 
20 | 	Python 2: ``conda create --name my_cmapPy_env -c bioconda python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0 cmappy``
21 | 
22 | 	* ``-c bioconda`` tells conda that it should look for packages in the bioconda channel (that's where cmapPy lives)
23 | 
24 | 
25 | 	**Step 2**
26 | 
27 | 	``source activate my_cmapPy_env``
28 | 
29 | * (Windows) Create & activate your cmapPy environment:
30 | 
31 | 	**Step 1**
32 | 
33 | 	Python 2: ``conda create --name my_cmapPy_env python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0``
34 | 
35 | 	**Step 2**
36 | 
37 | 	``pip install cmapPy``
38 | 
39 | 	``source activate my_cmapPy_env``
40 | 
41 | ...and then cmapPy (including command line tools) should be available for use.
42 | 
43 | To update cmapPy in your conda environment (from the activated environment): ``conda update cmappy``
44 | 
45 | **Option 2: Install cmapPy from PyPI**
46 | 
47 | * ``pip install cmapPy``
48 | * Note: For use of other virtualenvs, we include a requirements.txt file in the cmapPy package that you can use to install the proper versions of dependencies.
49 | 
50 | **Option 3: Install as a development environment**
51 | 
52 | A development environment will allow you to use the cmapPy code as it is in a clone of the repository, allowing you to try out changes and modifications you may wish to make.
53 | 
54 | Follow the instructions for Option 1 or Option 2 above but change the name of the environment to e.g. ``my_cmapPy_dev_env`` and do not include ``cmappy`` in the list of packages to install (or do not install it with pip), then activate this environment, i.e.:
55 | 	
56 | 	Python 2.7: ``conda create --name my_cmapPy_dev_env python=2.7.11 numpy=1.11.2 pandas=0.20.3 h5py=2.7.0 requests==2.13.0``
57 | 	
58 | 	``source activate my_cmapPy_dev_env``
59 | 
60 | Clone the cmapPy github repository, cd into the repo's top-level directory, and run:
61 | 
62 | 	``$ python setup.py develop``
63 | 
64 | To test your setup, change into a directory outside the repo, run the python interpreter and try:
65 | 	``cd <ELSEWHERE>``
66 | 
67 | 	``$ python``
68 | 
69 | 	``>> import cmapPy.pandasGEXpress.parse_gct as pg``
70 | 


--------------------------------------------------------------------------------
/docs/source/citing.rst:
--------------------------------------------------------------------------------
1 | .. _citing:
2 | 
3 | Citation Information
4 | ====================
5 | 
6 | If you use GCTx and/or cmapPy, please cite `Enache et al.`_
7 | 
8 | .. _Enache et al.: https://www.biorxiv.org/content/early/2017/11/30/227041
9 | 


--------------------------------------------------------------------------------
/docs/source/clue_api_client.rst:
--------------------------------------------------------------------------------
 1 | .. _clueclient:
 2 | 
 3 | API (clue_api_client)
 4 | =====================
 5 | 
 6 | To use the CLUE API client, put a copy of the file ``example_cmapPy_config_file.cfg`` in your home directory and name the copy ``.cmapPy.cfg``.  Replace the clue_api_user_key entries in that file with your CLUE API user key that you obtained from the CLUE website.
 7 | 
 8 | .. automodule:: cmapPy.clue_api_client.clue_api_client
 9 |    :members:
10 | 
11 | .. automodule:: cmapPy.clue_api_client.cell_queries
12 |    :members:
13 | 
14 | .. automodule:: cmapPy.clue_api_client.gene_queries
15 |    :members:
16 | 
17 | .. automodule:: cmapPy.clue_api_client.macchiato_queries
18 |    :members:
19 | 
20 | .. automodule:: cmapPy.clue_api_client.pert_queries
21 |    :members:
22 | 


--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
 1 | .. _contributing:
 2 | 
 3 | Contribution guidelines
 4 | =======================
 5 | 
 6 | We welcome contributors! For your pull requests, please include the following:
 7 | 
 8 | * Sample code/file that reproducibly causes the bug/issue
 9 | * Documented code (include a docstring for new functions!) providing fix
10 | * Unit tests evaluating added/modified methods.


--------------------------------------------------------------------------------
/docs/source/faq.rst:
--------------------------------------------------------------------------------
1 | .. _faq:
2 | 
3 | FAQ
4 | ===
5 | 
6 | We will be adding FAQs as they come up. 
7 | 
8 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | cmapPy: Python utilities for Connectivity Map Resources
 2 | *******************************************************
 3 | 
 4 | Provided by the Connectivity Map, Broad Institute of MIT and Harvard. More information 
 5 | `on our website <https://clue.io/code>`_
 6 | 
 7 | Where to Start
 8 | --------------
 9 | 
10 | * :ref:`Installation <install>`
11 | * :ref:`Summary of Available Modules <available_modules>`
12 | 
13 | 
14 | High-level API reference
15 | ------------------------
16 | 
17 | .. toctree::
18 |     :maxdepth: 1
19 | 
20 |     clue_api_client
21 |     pandasGEXpress
22 |     set_io
23 | 
24 | 
25 | Other resources
26 | ---------------
27 | 
28 | * `GitHub project <https://github.com/cmap/cmapPy>`_
29 | * `Tutorials and additional reference <https://clue.io/code>`_
30 | 
31 | 
32 | Meta-info about cmapPy
33 | ----------------------
34 | 
35 | .. toctree::
36 |     :maxdepth: 1
37 | 
38 |     contributing
39 |     faq
40 |     licenses
41 |     citing
42 | 
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/docs/source/licenses.rst:
--------------------------------------------------------------------------------
 1 | .. _licenses:
 2 | 
 3 | BSD 3-Clause License
 4 | ====================
 5 | 
 6 | Copyright (c) 2017, Connectivity Map (CMap) at the Broad Institute, Inc. 
 7 | All rights reserved.
 8 | 
 9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 | 
12 | * Redistributions of source code must retain the above copyright notice, this
13 |   list of conditions and the following disclaimer.
14 | 
15 | * Redistributions in binary form must reproduce the above copyright notice,
16 |   this list of conditions and the following disclaimer in the documentation
17 |   and/or other materials provided with the distribution.
18 | 
19 | * Neither the name of the copyright holder nor the names of its
20 |   contributors may be used to endorse or promote products derived from
21 |   this software without specific prior written permission.
22 | 
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/docs/source/pandasGEXpress.rst:
--------------------------------------------------------------------------------
 1 | .. _pandasGEXpress:
 2 | 
 3 | GCT, GCTx (pandasGEXpress)
 4 | ==========================
 5 | 
 6 | The pandasGEXpress package (integrated with Python's `pandas <http://pandas.pydata.org/>`_ package) allows users to easily read, modify, and write .gct and .gctx files. Note that .gctx files are more performant than .gct, and we recommend their use.
 7 | 
 8 | 
 9 | GCToo Class
10 | -----------
11 | .. autoclass:: cmapPy.pandasGEXpress.GCToo.GCToo
12 | 
13 | Parsing
14 | -------
15 | 
16 | .. autofunction:: cmapPy.pandasGEXpress.parse.parse
17 | 
18 | Writing
19 | -------
20 | 
21 | .. autofunction:: cmapPy.pandasGEXpress.write_gctx.write
22 | 
23 | .. autofunction:: cmapPy.pandasGEXpress.write_gct.write
24 | 
25 | Concatenating
26 | -------------
27 | 
28 | .. automodule:: cmapPy.pandasGEXpress.concat
29 |    :members:
30 | 
31 | Converting .gct <-> .gctx
32 | -------------------------
33 | 
34 | .. automodule:: cmapPy.pandasGEXpress.gct2gctx
35 |    :members:
36 | 
37 | .. automodule:: cmapPy.pandasGEXpress.gctx2gct
38 |    :members:
39 | 
40 | Extracting from .grp files
41 | --------------------------
42 | 
43 | .. automodule:: cmapPy.pandasGEXpress.plategrp
44 |    :members:
45 | 
46 | Subsetting
47 | ----------
48 | 
49 | .. automodule:: cmapPy.pandasGEXpress.random_slice
50 |    :members:
51 | 
52 | .. automodule:: cmapPy.pandasGEXpress.subset
53 |    :members:
54 | 
55 | 
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/docs/source/pandasgexpress_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/docs/source/pandasgexpress_fig.png


--------------------------------------------------------------------------------
/docs/source/set_io.rst:
--------------------------------------------------------------------------------
 1 | .. _set_io:
 2 | 
 3 | GRP, GMT (set_io) 
 4 | =================
 5 | 
 6 | set_io contains simple scripts for parsing two other common file types used by the Connectivity Map: GRP and GMT files. 
 7 | The GRP file is used for storing a single set of things (e.g. a single gene set), while the GMT file is used for storing multiple sets of things (e.g. several gene sets).
 8 | 
 9 | Further details on GRP and GMT files can be found `here
10 | <https://clue.io/connectopedia/grp_gmt_gmx_format>`_.
11 | 
12 | Reading GRP files
13 | -----------------
14 | 
15 | .. autofunction:: cmapPy.set_io.grp.read
16 | 
17 | Writing GRP files
18 | -----------------
19 | 
20 | .. autofunction:: cmapPy.set_io.grp.write
21 | 
22 | Reading GMT files
23 | -----------------
24 | 
25 | .. autofunction:: cmapPy.set_io.gmt.read
26 | 
27 | Verifying GMT integrity
28 | -----------------------
29 | 
30 | .. autofunction:: cmapPy.set_io.gmt.verify_gmt_integrity
31 | 
32 | Writing GMT files
33 | -----------------
34 | 
35 | .. autofunction:: cmapPy.set_io.gmt.write
36 | 


--------------------------------------------------------------------------------
/nginx.conf:
--------------------------------------------------------------------------------
 1 | # On Alpine, copy this file to /etc/nginx/nginx.conf
 2 | user root;
 3 | worker_processes auto;
 4 | 
 5 | error_log /var/log/nginx/error.log warn;
 6 | 
 7 | events {
 8 |     worker_connections 1024;
 9 | }
10 | 
11 | http {
12 |     include /etc/nginx/mime.types;
13 |     default_type application/octet-stream;
14 |     sendfile off;
15 |     access_log off;
16 |     keepalive_timeout 3000;
17 |     server {
18 |         listen 9081;
19 |         index index.html;
20 |         server_name cmapPy;
21 |         client_max_body_size 16m;
22 |         port_in_redirect off;
23 |         location ~ ^/cmapPy$ {  # exactly "/cmapPy", without a trailing slash
24 |             try_files $uri @rewrite;
25 |         }
26 |         location @rewrite {  # 302 to the same URI with "/" appended
27 |             return 302 $scheme://$http_host$uri/;
28 |         }
29 |         location ~ ^/cmapPy(?:/(.*))?$ {  # "/cmapPy" and anything below it
30 |             root /usr/share/nginx/html;
31 |             access_log off;
32 |         }
33 |     }
34 | }


--------------------------------------------------------------------------------
/performance_testing/python_parse_timing.py:
--------------------------------------------------------------------------------
 1 | # '/path/with/gctx/files/to/test/*gct*' refers to a directory of GCT and/or GCTX files to time parsing operations on.
 2 | # Cache was cleared in between consecutive operations. 
 3 | 
 4 | import time
 5 | import pandas as pd 
 6 | import glob
 7 | import cmapPy.pandasGEXpress.parse as parse
 8 | 
 9 | # for storing timing results
10 | parse_times = {}
11 | 
12 | # input directory of files (gct or gctx) to test
13 | input_files = glob.glob("/path/with/gctx/files/to/test/*gct*")
14 | 
15 | for f in input_files:
16 | 	start = time.clock()
17 | 	in_gctoo = parse.parse(f)
18 | 	end = time.clock()
19 | 	elapsed_time = end - start
20 | 	parse_times[f] = elapsed_time
21 | 
22 | # write results to file
23 | parse_time_series = pd.Series(parse_times)
24 | parse_time_series.to_csv("python_parsing_results.txt", sep="\t")
25 | 
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/performance_testing/python_write_timing.py:
--------------------------------------------------------------------------------
 1 | # '/path/to/large/gctx/file' refers to a large GCTX file (any size above 10174x100000 should work) from which file subsets are made.
 2 | # In testing, the large GCTX file used lacked metadata; including metadata would cause slight variation in results.
 3 | # Cache was cleared in between consecutive operations.
 4 | 
 5 | import os
 6 | import time
 7 | import pandas as pd
 8 | import cmapPy.pandasGEXpress.write_gctx as write_gctx
 9 | import cmapPy.pandasGEXpress.write_gct as write_gct
10 | import cmapPy.pandasGEXpress.parse as parse
11 | import cmapPy.pandasGEXpress.subset_gctoo as sg
12 | 
13 | # for storing timing results
14 | gct_times = {}
15 | gctx_times = {}
16 | 
17 | # large input gctx; see notes above for more info about this
18 | big_gctoo = parse.parse("/path/to/large/gctx/file")
19 | 
20 | # column and row spaces to test writing on
21 | col_spaces = [96, 384, 1536, 3000, 6000, 12000, 24000, 48000, 100000]
22 | row_spaces = [978, 10174]
23 | 
24 | for c in col_spaces:
25 | 	for r in row_spaces:
26 | 		curr_gctoo = sg.subset_gctoo(big_gctoo, ridx = range(0, r), cidx=range(0,c))
27 | 		# gct writing 
28 | 		out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gct"
29 | 		start = time.clock()
30 | 		write_gct.write(curr_gctoo, out_fname)
31 | 		end = time.clock()
32 | 		elapsed_time = end - start
33 | 		gct_times[out_fname] = elapsed_time
34 | 		os.remove(out_fname)
35 | 		# gctx writing 
36 | 		out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gctx"
37 | 		start = time.clock()
38 | 		write_gctx.write(curr_gctoo, out_fname)
39 | 		end = time.clock()
40 | 		elapsed_time = end - start
41 | 		gctx_times[out_fname] = elapsed_time
42 | 		os.remove(out_fname)
43 | 
44 | # write results to file
45 | gct_df = pd.DataFrame(pd.Series(gct_times))
46 | gctx_df = pd.DataFrame(pd.Series(gctx_times))
47 | write_times_df = pd.concat([gct_df, gctx_df])
48 | write_times_df.columns = ["write_time"]
49 | write_times_df.to_csv("python_writing_results.txt", sep="\t")
50 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.11.2
2 | pandas==0.20.3 
3 | h5py==2.7.0
4 | requests==2.20.0
5 | 
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | # Only Python 2.7 supported 
3 | universal=0
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Always prefer setuptools over distutils
 2 | from setuptools import setup, find_packages
 3 | # To use a consistent encoding
 4 | from codecs import open
 5 | from os import path
 6 | 
 7 | here = path.abspath(path.dirname(__file__))
 8 | 
 9 | setup(
10 |     name='cmapPy',
11 | 
12 |     # Versions should comply with PEP440.  For a discussion on single-sourcing
13 |     # the version across setup.py and the project code, see
14 |     # https://packaging.python.org/en/latest/single_source_version.html
15 |     version='3.3.3',
16 | 
17 |     description='Assorted tools for interacting with .gct, .gctx files and other Connectivity Map (Broad Institute) data/tools',
18 |     long_description="cmapPy: Tools for interacting with .gctx and .gct files, and other Connectivity Map resources. See our documentation at http://cmappy.readthedocs.io/en/latest/, and for more information on the file formats and available resources, please see clue.io/gctx.",
19 | 
20 |     # The project's main homepage.
21 |     url='https://github.com/cmap/cmapPy',
22 | 
23 |     # Author details
24 |     maintainer='Oana Enache',
25 |     maintainer_email='oana@broadinstitute.org',
26 | 
27 |     # Choose your license
28 |     license='BSD 3-clause',
29 | 
30 |     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
31 |     classifiers=[
32 |         # How mature is this project? Common values are
33 |         #   3 - Alpha
34 |         #   4 - Beta
35 |         #   5 - Production/Stable
36 |         'Development Status :: 5 - Production/Stable',
37 | 
38 |         # Indicate who your project is intended for
39 |         'Intended Audience :: Science/Research',
40 |         'Topic :: Scientific/Engineering :: Bio-Informatics',
41 | 
42 |         # Pick your license as you wish (should match "license" above)
43 |         'License :: OSI Approved :: BSD License',
44 | 
45 |         # Specify the Python versions you support here. In particular, ensure
46 |         # that you indicate whether you support Python 2, Python 3 or both.
47 |         'Programming Language :: Python :: 2',
48 |         'Programming Language :: Python :: 2.7'
49 |     ],
50 | 
51 |     # What does your project relate to?
52 |     keywords='gct gctx file-manipulation Connectivity Map CMap Broad Institute',
53 | 
54 |     # You can just specify the packages manually here if your project is
55 |     # simple. Or you can use find_packages().
56 |     packages=find_packages(exclude=['contrib','docs','tutorials', 'tests', 'performance_testing']),
57 | 
58 |     # List run-time dependencies here.  These will be installed by pip when
59 |     # your project is installed. For an analysis of "install_requires" vs pip's
60 |     # requirements files see:
61 |     # https://packaging.python.org/en/latest/requirements.html
62 |     install_requires=['numpy>=1.11.2', 'pandas>=0.18', 'h5py>=2.6.0', 'requests>=2.13.0', 'six'],
63 | 
64 |     # List additional groups of dependencies here (e.g. development
65 |     # dependencies). You can install these using the following syntax,
66 |     # for example:
67 |     # $ pip install -e .[dev,test]
68 |     extras_require={},
69 | 
70 |     # If there are data files included in your packages that need to be
71 |     # installed, specify them here.  If using Python 2.6 or less, then these
72 |     # have to be included in MANIFEST.in as well.
73 |     #package_data={},
74 |     include_package_data=True, # reads these from MANIFEST.in
75 | 
76 |     # Although 'package_data' is the preferred approach, in some case you may
77 |     # need to place data files outside of your packages. See:
78 |     # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
79 |     # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
80 |     data_files=[],
81 | 
82 |     # To provide executable scripts, use entry points in preference to the
83 |     # "scripts" keyword. Entry points provide cross-platform support and allow
84 |     # pip to create the appropriate form of executable for the target platform.
85 |     entry_points={'console_scripts': ['gctx2gct=cmapPy.pandasGEXpress.gctx2gct:main', 'gct2gctx=cmapPy.pandasGEXpress.gct2gctx:main', 
86 |         'concat=cmapPy.pandasGEXpress.concat:main', 'subset=cmapPy.pandasGEXpress.subset:main']},
87 | 
88 |     tests_require=['unittest']
89 | )
90 | 


--------------------------------------------------------------------------------
/tutorials/GCTX_mockup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/tutorials/GCTX_mockup.png


--------------------------------------------------------------------------------
/tutorials/GCT_mockup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmap/cmapPy/d1652c3223e49e68e3a71634909342b4a6dbf361/tutorials/GCT_mockup.png


--------------------------------------------------------------------------------