├── .gitignore ├── LICENSE.txt ├── README.md ├── data └── test_filter_1.txt ├── notebooks ├── BCGSC microRNA expression.ipynb ├── BRAF-V600 study using CCLE data.ipynb ├── Copy Number segments.ipynb ├── Creating TCGA cohorts -- part 1.ipynb ├── Creating TCGA cohorts -- part 2.ipynb ├── Creating TCGA cohorts -- part 3.ipynb ├── DNA Methylation.ipynb ├── ISB_CGC_Query_of_the_Month_November_2018.ipynb ├── ISB_cgc_bam_slicing_with_pysam.ipynb ├── Protein expression.ipynb ├── README.md ├── RegulomeExplorer_1_Gexpr_CNV.ipynb ├── Somatic Mutations.ipynb ├── TCGA Annotations.ipynb ├── The ISB-CGC open-access TCGA tables in BigQuery.ipynb ├── UNC HiSeq mRNAseq gene expression.ipynb └── isb_cgc_bam_slicing_with_pysam.ipynb └── python ├── README.md ├── createSchema.py ├── createSchemaP3.py ├── gdcCase2Cohort.py ├── isb_auth.py ├── isb_cgc_api_v3_cases.py ├── isb_cgc_api_v3_cohorts.py ├── isb_cgc_api_v3_samples.py ├── isb_cgc_api_v3_users.py ├── isb_curl.py ├── melt_matrix.py ├── pairwise ├── README ├── archive │ ├── bq_filter_file.py │ ├── bq_filter_file_v2.py │ ├── filter_file_test_1.txt │ └── ref_query.txt ├── bqpairwise.py ├── filter_and_annot.py ├── pairwise_fun.py └── tests │ ├── test1 │ ├── filter_file_1.txt │ ├── filter_file_2.txt │ └── filter_file_test_query.sql │ ├── test2 │ ├── filter_file_1.txt │ ├── filter_file_2.txt │ └── filter_file_test_query.sql │ ├── test3 │ ├── filter_file_1.txt │ └── filter_file_2.txt │ ├── test4 │ ├── filter_file_1.txt │ ├── filter_file_2.txt │ └── filter_file_test_query.sql │ └── test5 │ ├── filter_file_1.txt │ └── filter_file_2.txt └── tsv2json.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # tilde files 62 | *.*~ 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015, Institute for Systems Biology 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # examples-Python 2 | This repository contains a series of Python examples to get you started using the data and tools provided by the ISB-CGC on the Google Cloud Platform. 3 | 4 | * **[notebooks](https://github.com/isb-cgc/examples-Python/tree/master/notebooks)**: this directory contains tutorials in the form of IPython notebooks; 5 | * **[python](https://github.com/isb-cgc/examples-Python/tree/master/python)**: this directory contain tutorials in the form of simple python scripts. 6 | 7 | ### Important Note 8 | If you launch your own Cloud Datalab instance, please be sure to shut it down when you are no longer using it. A typical Google Cloud Datalab VM will cost your project approximately **$1/day**. Shut down instructions and other tips are [here](https://cloud.google.com/datalab/getting-started) -- look for the section called **Stopping/Starting VM Instances**. 9 | 10 | 11 | #### Requirements 12 | * google-api-python-client 13 | * oauth2client 14 | * httplib2 15 | * requests 16 | -------------------------------------------------------------------------------- /data/test_filter_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM 2 | idvar:ParticipantBarcode 3 | valvar:normalized_count 4 | pivot:HGNC_gene_symbol 5 | filter:SampleTypeLetterCode='TP' 6 | filter:Study='BRCA' 7 | filter:HGNC_gene_symbol IN ('ACSM5','NAP1L4','SULF2') 8 | limit:100 9 | -------------------------------------------------------------------------------- /notebooks/BCGSC microRNA expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# microRNA expression (BCGSC RPKM)\n", 8 | "\n", 9 | "The goal of this notebook is to introduce you to the microRNA expression BigQuery table.\n", 10 | "\n", 11 | "This table contains all available TCGA Level-3 microRNA expression data produced by BCGSC's microRNA pipeline using the Illumina HiSeq platform, as of July 2016. The most recent archive (*eg* ``bcgsc.ca_THCA.IlluminaHiSeq_miRNASeq.Level_3.1.9.0``) for each of the 32 tumor types was downloaded from the DCC, and data extracted from all files matching the pattern ``%.isoform.quantification.txt``. The isoform-quantification values were then processed through a Perl script provided by BCGSC which produces normalized expression levels for *mature* microRNAs. 
Each of these mature microRNAs is identified by name (*eg* hsa-mir-21) and by MIMAT accession number (*eg* MIMAT0000076).\n", 12 | "\n", 13 | "In order to work with BigQuery, you need to import the python bigquery module (`gcp.bigquery`) and you need to know the name(s) of the table(s) you are going to be working with:" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 5, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import gcp.bigquery as bq\n", 25 | "miRNA_BQtable = bq.Table('isb-cgc:tcga_201607_beta.miRNA_Expression')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "From now on, we will refer to this table using this variable ($miRNA_BQtable), but we could just as well explicitly give the table name each time.\n", 33 | "\n", 34 | "Let's start by taking a look at the table schema:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 6, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "\n", 48 | "
\n", 49 | " \n", 56 | " " 57 | ], 58 | "text/plain": [ 59 | "" 60 | ] 61 | }, 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "%bigquery schema --table $miRNA_BQtable" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Now let's count up the number of unique patients, samples and aliquots mentioned in this table. We will do this by defining a very simple parameterized query. (Note that when using a variable for the table name in the FROM clause, you should not also use the square brackets that you usually would if you were specifying the table name as a string.)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "%%sql --module count_unique\n", 87 | "\n", 88 | "DEFINE QUERY q1\n", 89 | "SELECT COUNT (DISTINCT $f, 25000) AS n\n", 90 | "FROM $t" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 8, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | " There are 10245 unique values in the field ParticipantBarcode. \n", 105 | " There are 11015 unique values in the field SampleBarcode. \n", 106 | " There are 11077 unique values in the field AliquotBarcode. \n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "fieldList = ['ParticipantBarcode', 'SampleBarcode', 'AliquotBarcode']\n", 112 | "for aField in fieldList:\n", 113 | " field = miRNA_BQtable.schema[aField]\n", 114 | " rdf = bq.Query(count_unique.q1,t=miRNA_BQtable,f=field).results().to_dataframe()\n", 115 | " print \" There are %6d unique values in the field %s. \" % ( rdf.iloc[0]['n'], aField)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "raw", 120 | "metadata": {}, 121 | "source": [ 122 | "We can do the same thing to look at how many unique microRNAs exist in the table:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 9, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | " There are 965 unique values in the field mirna_id. \n", 137 | " There are 1222 unique values in the field mirna_accession. \n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "fieldList = ['mirna_id', 'mirna_accession']\n", 143 | "for aField in fieldList:\n", 144 | " field = miRNA_BQtable.schema[aField]\n", 145 | " rdf = bq.Query(count_unique.q1,t=miRNA_BQtable,f=field).results().to_dataframe()\n", 146 | " print \" There are %6d unique values in the field %s. \" % ( rdf.iloc[0]['n'], aField)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "These counts show that the mirna_id field is not a unique identifier and should be used in combination with the MIMAT accession number." 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Another thing to note about this table is that these expression values are obtained from two different platforms -- approximately 15% of the data is from the Illumina GA platform, and 85% from the Illumina HiSeq:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 10, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/html": [ 173 | "\n", 174 | "
Platform       | n
IlluminaHiSeq  | 23226554
IlluminaGA     | 3536468
\n", 175 | "
(rows: 2, time: 1.7s, 390MB processed, job: job_mz2XhfqdyCl9bBTe_6-N-l-aD7s)
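The platform breakdown above comes from a simple GROUP BY over the miRNA_Expression table, and the earlier cells counted distinct barcodes with a parameterized legacy-SQL query. As a point of comparison only, a minimal standard-SQL sketch using the google-cloud-bigquery client (an assumption here -- this notebook itself uses the Datalab `gcp.bigquery` module and legacy SQL) might look like:

```python
# Standard-SQL sketch of the per-platform counts shown above, plus a distinct
# participant count in the spirit of the count_unique query used earlier.
# Assumes application-default credentials and billing against your own project.
from google.cloud import bigquery

client = bigquery.Client()

query = """
SELECT
  Platform,
  COUNT(*) AS n,
  COUNT(DISTINCT ParticipantBarcode) AS n_participants
FROM `isb-cgc.tcga_201607_beta.miRNA_Expression`
GROUP BY Platform
ORDER BY n DESC
"""

for row in client.query(query):
    print(row.Platform, row.n, row.n_participants)
```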
\n", 176 | " \n", 190 | " " 191 | ], 192 | "text/plain": [ 193 | "QueryResultsTable job_mz2XhfqdyCl9bBTe_6-N-l-aD7s" 194 | ] 195 | }, 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "%%sql\n", 203 | "\n", 204 | "SELECT\n", 205 | " Platform,\n", 206 | " COUNT(*) AS n\n", 207 | "FROM\n", 208 | " $miRNA_BQtable\n", 209 | "GROUP BY\n", 210 | " Platform\n", 211 | "ORDER BY\n", 212 | " n DESC" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 2", 219 | "language": "python", 220 | "name": "python2" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 2 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython2", 232 | "version": "2.7.9" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 0 237 | } 238 | -------------------------------------------------------------------------------- /notebooks/BRAF-V600 study using CCLE data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Working with BigQuery tables and the Genomics API" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | "## Case Study: BRAF V600 mutations in CCLE cell-lines\n", 16 | "\n", 17 | "In this notebook we'll show you how you might combine information available in BigQuery tables with sequence-reads that have been imported into Google Genomics. We'll be using the open-access CCLE data for this example.\n", 18 | "\n", 19 | "You'll need to make sure that your project has the necessary APIs enabled, so take a look at the [Getting started with Google Genomics](https://cloud.google.com/genomics/install-genomics-tools) page, and be sure to also have a look at this [Getting started with the Genomics API](https://github.com/googlegenomics/datalab-examples/blob/master/datalab/genomics/Getting%20started%20with%20the%20Genomics%20API.ipynb) tutorial notebook available on github." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "We'll be using the [Google Python API client](https://github.com/google/google-api-python-client) so we'll need to install that first using the ``pip`` package manager.\n", 27 | "\n", 28 | "**NOTE** that Datalab is currently using an older version of the oauth2client (1.4.12) and as a result we need to install an older version of the google-api-python-client that supports it." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 24, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Requirement already up-to-date: google-api-python-client==1.4.2 in /usr/local/lib/python2.7/dist-packages\r\n", 43 | "Cleaning up...\r\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "!pip install --upgrade google-api-python-client==1.4.2" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Next we're going to need to authenticate using the service account on the Datalab host. 
" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 25, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "" 69 | ] 70 | }, 71 | "execution_count": 25, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "from httplib2 import Http\n", 78 | "from oauth2client.client import GoogleCredentials\n", 79 | "credentials = GoogleCredentials.get_application_default()\n", 80 | "http = Http()\n", 81 | "credentials.authorize(http)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Now we can create a client for the Genomics API. **NOTE** that in order to use the Genomics API, you need to have enabled it for your GCP project." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 26, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "from apiclient import discovery\n", 100 | "ggSvc = discovery.build ( 'genomics', 'v1', http=http )" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "We're also going to want to work with BigQuery, so we'll need the biguery module. We will also be using the pandas and time modules." 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 27, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "import gcp.bigquery as bq\n", 119 | "import pandas as pd\n", 120 | "import time" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "The ISB-CGC group has assembled metadata as well as molecular data from the CCLE project into an open-access BigQuery dataset called ``isb-cgc:ccle_201602_alpha``. In this notebook we will make use of two tables in this dataset: ``Mutation_calls`` and ``DataFile_info``. You can explore the entire dataset using the BigQuery [web UI](https://bigquery.cloud.google.com)." 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Let's say that we're interested in cell-lines with BRAF V600 mutations, and in particular we want to see if there is evidence in both the DNA-seq and the RNA-seq data for these mutations. Let's start by making sure that there are some cell-lines with these mutations in our dataset:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 28, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/html": [ 147 | "\n", 148 | "
CCLE_name      | Hugo_Symbol | Protein_Change | Genome_Change
8305C_THYROID  | BRAF        | p.V600E        | g.chr7:140453136A>T
8505C_THYROID  | BRAF        | p.V600E        | g.chr7:140453136A>T
A375_SKIN      | BRAF        | p.V600E        | g.chr7:140453136A>T
A673_BONE      | BRAF        | p.V600E        | g.chr7:140453136A>T
A101D_SKIN     | BRAF        | p.V600E        | g.chr7:140453136A>T
\n", 149 | "
(rows: 5, time: 0.5s, cached, job: job_bEFKS7D_j7mYWVeGJBmzD60iFZw)
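The five rows above were returned by the legacy-SQL query in the next cell, which uses CONTAINS for substring matching. A hedged standard-SQL sketch of the same lookup (the google-cloud-bigquery client is assumed here; the notebook uses the Datalab `gcp.bigquery` module) replaces CONTAINS with LIKE and orders by CCLE_name instead of Cell_line_primary_name:

```python
# Standard-SQL sketch of the BRAF V600 lookup shown above.
from google.cloud import bigquery

client = bigquery.Client()
sql = """
SELECT CCLE_name, Hugo_Symbol, Protein_Change, Genome_Change
FROM `isb-cgc.ccle_201602_alpha.Mutation_calls`
WHERE Hugo_Symbol = 'BRAF'
  AND Protein_Change LIKE '%p.V600%'
ORDER BY CCLE_name
LIMIT 5
"""
df = client.query(sql).to_dataframe()  # requires pandas installed alongside the client
print(df)
```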
\n", 150 | " \n", 164 | " " 165 | ], 166 | "text/plain": [ 167 | "QueryResultsTable job_bEFKS7D_j7mYWVeGJBmzD60iFZw" 168 | ] 169 | }, 170 | "execution_count": 28, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "%%sql\n", 177 | "\n", 178 | "SELECT CCLE_name, Hugo_Symbol, Protein_Change, Genome_Change \n", 179 | "FROM [isb-cgc:ccle_201602_alpha.Mutation_calls] \n", 180 | "WHERE ( Hugo_Symbol=\"BRAF\" AND Protein_Change CONTAINS \"p.V600\" )\n", 181 | "ORDER BY Cell_line_primary_name\n", 182 | "LIMIT 5" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "OK, so let's get the complete list of cell-lines with this particular mutation:" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 29, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "%%sql --module get_mutated_samples\n", 201 | "\n", 202 | "SELECT CCLE_name \n", 203 | "FROM [isb-cgc:ccle_201602_alpha.Mutation_calls] \n", 204 | "WHERE ( Hugo_Symbol=\"BRAF\" AND Protein_Change CONTAINS \"p.V600\" )\n", 205 | "ORDER BY Cell_line_primary_name" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 30, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | " Found 54 samples with a BRAF V600 mutation. \n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "r = bq.Query(get_mutated_samples).results()\n", 225 | "list1 = r.to_dataframe()\n", 226 | "print \" Found %d samples with a BRAF V600 mutation. \" % len(list1)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Now we want to know, from the ``DataFile_info`` table, which cell lines have both DNA-seq and RNA-seq data imported into Google Genomics. (To find these samples, we will look for samples that have non-null readgroupset IDs from \"DNA\" and \"RNA\" pipelines.) " 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 31, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "%%sql --module get_samples_with_data\n", 245 | "\n", 246 | "SELECT\n", 247 | " a.CCLE_name AS CCLE_name\n", 248 | "FROM (\n", 249 | " SELECT\n", 250 | " CCLE_name\n", 251 | " FROM\n", 252 | " [isb-cgc:ccle_201602_alpha.DataFile_info]\n", 253 | " WHERE\n", 254 | " ( Pipeline CONTAINS \"DNA\"\n", 255 | " AND GG_readgroupset_id<>\"NULL\" ) ) a\n", 256 | "JOIN (\n", 257 | " SELECT\n", 258 | " CCLE_name\n", 259 | " FROM\n", 260 | " [isb-cgc:ccle_201602_alpha.DataFile_info]\n", 261 | " WHERE\n", 262 | " ( Pipeline CONTAINS \"RNA\"\n", 263 | " AND GG_readgroupset_id<>\"NULL\" ) ) b\n", 264 | "ON\n", 265 | " a.CCLE_name = b.CCLE_name" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 32, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | " Found 265 samples with both DNA-seq and RNA-seq reads. \n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "r = bq.Query(get_samples_with_data).results()\n", 285 | "list2 = r.to_dataframe()\n", 286 | "print \" Found %d samples with both DNA-seq and RNA-seq reads. 
\" % len(list2)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Now let's find out which samples are in both of these lists:" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 33, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | " Found 7 mutated samples with DNA-seq and RNA-seq data. \n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "list3 = pd.merge ( list1, list2, how='inner', on=['CCLE_name'] )\n", 313 | "print \" Found %d mutated samples with DNA-seq and RNA-seq data. \" % len(list3)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "No we're going to take a closer look at the reads from each of these samples. First, we'll need to be able to get the readgroupset IDs for each sample from the BigQuery table. To do this, we'll define a parameterized function:" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 34, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "%%sql --module get_readgroupsetid\n", 332 | "\n", 333 | "SELECT Pipeline, GG_readgroupset_id \n", 334 | "FROM [isb-cgc:ccle_201602_alpha.DataFile_info]\n", 335 | "WHERE CCLE_name=$c AND GG_readgroupset_id<>\"NULL\"" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Let's take a look at how this will work:" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 35, 348 | "metadata": { 349 | "collapsed": false 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "COLO783_SKIN\n", 357 | " Pipeline GG_readgroupset_id\n", 358 | "0 broad.mit.edu__DNASeq CJKPhaq1GhC4rZSVj4TMoIIB\n", 359 | "1 broad.mit.edu__RNASeq CJKPhaq1GhDN4avdoaTXsKcB\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "aName = list3['CCLE_name'][0]\n", 365 | "print aName\n", 366 | "ids = bq.Query(get_readgroupsetid,c=aName).to_dataframe()\n", 367 | "print ids" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "Ok, so we see that for this sample, we have two readgroupset IDs, one based on DNA-seq and one based on RNA-seq. This is what we expect, based on how we chose this list of samples." 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "Now we'll define a function we can re-use that calls the GA4GH API reads.search method to find all reads that overlap the V600 mutation position. Note that we will query all of the readgroupsets that we get for each sample at the same time by passing in a list of readGroupSetIds. Once we have the reads, we'll organized them into a dictionary based on the local context centered on the mutation hotspot." 
382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 36, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "chr = \"7\"\n", 393 | "pos = 140453135\n", 394 | "width = 11\n", 395 | "rgsList = ids['GG_readgroupset_id'].tolist()\n", 396 | "\n", 397 | "def getReads ( rgsList, pos, width):\n", 398 | " \n", 399 | " payload = { \"readGroupSetIds\": rgsList,\n", 400 | " \"referenceName\": chr,\n", 401 | " \"start\": pos-(width/2),\n", 402 | " \"end\": pos+(width/2),\n", 403 | " \"pageSize\": 2048 \n", 404 | " }\n", 405 | " r = ggSvc.reads().search(body=payload).execute()\n", 406 | " \n", 407 | " context = {}\n", 408 | " for a in r['alignments']:\n", 409 | " rgsid = a['readGroupSetId']\n", 410 | " seq = a['alignedSequence']\n", 411 | " seqStartPos = int ( a['alignment']['position']['position'] )\n", 412 | " relPos = pos - (width/2) - seqStartPos\n", 413 | " if ( relPos >=0 and relPos+width3 or len(context[c])>1 ):\n", 431 | " print \" --> \", c, context[c]" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "Here we define the position (0-based) of the BRAF V600 mutation:" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 37, 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "chr = \"7\"\n", 450 | "pos = 140453135\n", 451 | "width = 11" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "OK, now we can loop over all of the samples we found earlier:" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 38, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [ 468 | { 469 | "name": "stdout", 470 | "output_type": "stream", 471 | "text": [ 472 | " \n", 473 | " \n", 474 | "COLO783_SKIN\n", 475 | " broad.mit.edu__DNASeq CJKPhaq1GhC4rZSVj4TMoIIB\n", 476 | " broad.mit.edu__RNASeq CJKPhaq1GhDN4avdoaTXsKcB\n", 477 | " --> ATTTCACTGTA {u'CJKPhaq1GhC4rZSVj4TMoIIB': 47, u'CJKPhaq1GhDN4avdoaTXsKcB': 10}\n", 478 | " --> ATTTCTCTGTA {u'CJKPhaq1GhC4rZSVj4TMoIIB': 100, u'CJKPhaq1GhDN4avdoaTXsKcB': 30}\n", 479 | " \n", 480 | " \n", 481 | "K029AX_SKIN\n", 482 | " broad.mit.edu__DNASeq CJKPhaq1GhCj-6WMobnvxCA\n", 483 | " broad.mit.edu__RNASeq CJKPhaq1GhDHzLyn-_j7w18\n", 484 | " --> ATTTCACTGTA {u'CJKPhaq1GhDHzLyn-_j7w18': 8, u'CJKPhaq1GhCj-6WMobnvxCA': 63}\n", 485 | " --> ATTTCTCTGTA {u'CJKPhaq1GhDHzLyn-_j7w18': 13, u'CJKPhaq1GhCj-6WMobnvxCA': 119}\n", 486 | " \n", 487 | " \n", 488 | "MALME3M_SKIN\n", 489 | " broad.mit.edu__RNASeq CJKPhaq1GhCMvqyM-MWor2o\n", 490 | " broad.mit.edu__DNASeq CJKPhaq1GhCj8MDayraUjOUB\n", 491 | " --> ATTTCACTGTA {u'CJKPhaq1GhCj8MDayraUjOUB': 48, u'CJKPhaq1GhCMvqyM-MWor2o': 9}\n", 492 | " --> ATTTCTCTGTA {u'CJKPhaq1GhCj8MDayraUjOUB': 126, u'CJKPhaq1GhCMvqyM-MWor2o': 37}\n", 493 | " \n", 494 | " \n", 495 | "SIGM5_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE\n", 496 | " broad.mit.edu__DNASeq CJKPhaq1GhDH0sLg9qTgt5QB\n", 497 | " broad.mit.edu__RNASeq CJKPhaq1GhD_2PTBu4jiq5wB\n", 498 | " --> ATTTCACTGTA {u'CJKPhaq1GhD_2PTBu4jiq5wB': 3, u'CJKPhaq1GhDH0sLg9qTgt5QB': 28}\n", 499 | " --> ATTTCTCTGTA {u'CJKPhaq1GhD_2PTBu4jiq5wB': 6, u'CJKPhaq1GhDH0sLg9qTgt5QB': 44}\n", 500 | " \n", 501 | " \n", 502 | "WM793_SKIN\n", 503 | " broad.mit.edu__DNASeq CJKPhaq1GhDauaiO54TV-lU\n", 504 | " broad.mit.edu__RNASeq CJKPhaq1GhDo3-DQ8s2qiAU\n", 505 | " --> ATTTCACTGTA {u'CJKPhaq1GhDo3-DQ8s2qiAU': 11, u'CJKPhaq1GhDauaiO54TV-lU': 50}\n", 506 
| " --> ATTTCTCTGTA {u'CJKPhaq1GhDo3-DQ8s2qiAU': 13, u'CJKPhaq1GhDauaiO54TV-lU': 102}\n", 507 | " \n", 508 | " \n", 509 | "WM88_SKIN\n", 510 | " broad.mit.edu__RNASeq CJKPhaq1GhDvot6FqNrO8bUB\n", 511 | " broad.mit.edu__DNASeq CJKPhaq1GhD85Zrppre74-0B\n", 512 | " --> ATTTCACTGTA {u'CJKPhaq1GhDvot6FqNrO8bUB': 13, u'CJKPhaq1GhD85Zrppre74-0B': 47}\n", 513 | " --> ATTTCTCTGTA {u'CJKPhaq1GhDvot6FqNrO8bUB': 17, u'CJKPhaq1GhD85Zrppre74-0B': 76}\n", 514 | " \n", 515 | " \n", 516 | "WM1799_SKIN\n", 517 | " broad.mit.edu__RNASeq CJKPhaq1GhDrrMCJ0_zBuKkB\n", 518 | " broad.mit.edu__DNASeq CJKPhaq1GhDQ5u66urbHjGI\n", 519 | " --> ATTTCTCTGTA {u'CJKPhaq1GhDrrMCJ0_zBuKkB': 42, u'CJKPhaq1GhDQ5u66urbHjGI': 143}\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "for aName in list3['CCLE_name']: \n", 525 | " print \" \"\n", 526 | " print \" \"\n", 527 | " print aName\n", 528 | " r = bq.Query(get_readgroupsetid,c=aName).to_dataframe()\n", 529 | " for i in range(r.shape[0]):\n", 530 | " print \" \", r['Pipeline'][i], r['GG_readgroupset_id'][i]\n", 531 | " rgsList = r['GG_readgroupset_id'].tolist()\n", 532 | " getReads ( rgsList, pos, width)\n", 533 | " " 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "Notice that we consistently see greater read-depth in the DNA-seq data. Also all but the last sample are heterozygous for the V600 mutation, while ``WM1799_SKIN`` is homozygous. (Of course a proper analysis would also take into consideration the cigar information that is available with each read as well.)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [] 551 | } 552 | ], 553 | "metadata": { 554 | "kernelspec": { 555 | "display_name": "Python 2", 556 | "language": "python", 557 | "name": "python2" 558 | }, 559 | "language_info": { 560 | "codemirror_mode": { 561 | "name": "ipython", 562 | "version": 2 563 | }, 564 | "file_extension": ".py", 565 | "mimetype": "text/x-python", 566 | "name": "python", 567 | "nbconvert_exporter": "python", 568 | "pygments_lexer": "ipython2", 569 | "version": "2.7.9" 570 | } 571 | }, 572 | "nbformat": 4, 573 | "nbformat_minor": 0 574 | } 575 | -------------------------------------------------------------------------------- /notebooks/Protein expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Protein expression (MDAnderson RPPA)\n", 8 | "\n", 9 | "The goal of this notebook is to introduce you to the Protein expression BigQuery table.\n", 10 | "\n", 11 | "This table contains all available TCGA Level-3 protein expression data produced by MD Anderson's RPPA pipeline, as of July 2016. The most recent archives (*eg* ``mdanderson.org_COAD.MDA_RPPA_Core.Level_3.2.0.0``) for each of the 32 tumor types was downloaded from the DCC, and data extracted from all files matching the pattern ``%_RPPA_Core.protein_expression%.txt``. Each of these “protein expression” files has two columns: the ``Composite Element REF`` and the ``Protein Expression``. In addition, each mage-tab archive contains an ``antibody_annotation`` file which is parsed in order to obtain the correct mapping between antibody name, protein name, and gene symbol. 
During the ETL process, portions of the protein name and the antibody name were extracted into additional columns in the table, including ``Phospho``, ``antibodySource`` and ``validationStatus``. \n", 12 | "\n", 13 | "In order to work with BigQuery, you need to import the python bigquery module (`gcp.bigquery`) and you need to know the name(s) of the table(s) you are going to be working with:" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import gcp.bigquery as bq\n", 25 | "rppa_BQtable = bq.Table('isb-cgc:tcga_201607_beta.Protein_RPPA_data')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "From now on, we will refer to this table using this variable ($rppa_BQtable), but we could just as well explicitly give the table name each time.\n", 33 | "\n", 34 | "Let's start by taking a look at the table schema:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "\n", 48 | "
\n", 49 | " \n", 56 | " " 57 | ], 58 | "text/plain": [ 59 | "" 60 | ] 61 | }, 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "%bigquery schema --table $rppa_BQtable" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Let's count up the number of unique patients, samples and aliquots mentioned in this table. We will do this by defining a very simple parameterized query. (Note that when using a variable for the table name in the FROM clause, you should not also use the square brackets that you usually would if you were specifying the table name as a string.)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "%%sql --module count_unique\n", 87 | "\n", 88 | "DEFINE QUERY q1\n", 89 | "SELECT COUNT (DISTINCT $f, 25000) AS n\n", 90 | "FROM $t" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | " There are 7845 unique values in the field ParticipantBarcode. \n", 105 | " There are 7933 unique values in the field SampleBarcode. \n", 106 | " There are 7943 unique values in the field AliquotBarcode. \n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "fieldList = ['ParticipantBarcode', 'SampleBarcode', 'AliquotBarcode']\n", 112 | "for aField in fieldList:\n", 113 | " field = rppa_BQtable.schema[aField]\n", 114 | " rdf = bq.Query(count_unique.q1,t=rppa_BQtable,f=field).results().to_dataframe()\n", 115 | " print \" There are %6d unique values in the field %s. \" % ( rdf.iloc[0]['n'], aField)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "raw", 120 | "metadata": {}, 121 | "source": [ 122 | "We can do the same thing to look at how many unique gene symbols and proteins exist in the table:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | " There are 217 unique values in the field Gene_Name. \n", 137 | " There are 259 unique values in the field Protein_Name. \n", 138 | " There are 219 unique values in the field Protein_Basename. \n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "fieldList = ['Gene_Name', 'Protein_Name', 'Protein_Basename']\n", 144 | "for aField in fieldList:\n", 145 | " field = rppa_BQtable.schema[aField]\n", 146 | " rdf = bq.Query(count_unique.q1,t=rppa_BQtable,f=field).results().to_dataframe()\n", 147 | " print \" There are %6d unique values in the field %s. \" % ( rdf.iloc[0]['n'], aField)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Based on the counts, we can see that there are several genes for which multiple proteins are assayed, and that overall this dataset is quite small compared to most of the other datasets. Let's look at which genes have multiple proteins assayed:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "\n", 168 | "
Gene_Name | n
EIF4EBP1  | 4
PARP1     | 3
SRC       | 3
EGFR      | 3
AKT2      | 3
AKT3      | 3
AKT1      | 3
GSK3B     | 3
CDKN1B    | 3
RPS6      | 3
GSK3A     | 3
CHEK1     | 3
MYH9      | 2
RAF1      | 2
PYGB      | 2
FOXO3     | 2
PRKAA1    | 2
ERBB2     | 2
RB1       | 2
ESR1      | 2
ARAF      | 2
YBX1      | 2
TSC2      | 2
ACACA     | 2
RICTOR    | 2
\n", 169 | "
(rows: 42, time: 1.1s, 30MB processed, job: job_tbt23N58wYdiUQoFPqOqv4QpiWw)
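The gene counts above come from the nested legacy-SQL query in the next cell. An equivalent, somewhat more compact standard-SQL sketch (google-cloud-bigquery client assumed) counts distinct protein names per gene directly:

```python
# Standard-SQL sketch of the "genes with more than one assayed protein" count.
# COUNT(DISTINCT Protein_Name) collapses the nested GROUP BY of the original.
from google.cloud import bigquery

client = bigquery.Client()
sql = """
SELECT Gene_Name, COUNT(DISTINCT Protein_Name) AS n
FROM `isb-cgc.tcga_201607_beta.Protein_RPPA_data`
GROUP BY Gene_Name
HAVING n > 1
ORDER BY n DESC
"""
for row in client.query(sql):
    print(row.Gene_Name, row.n)
```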
\n", 170 | " \n", 184 | " " 185 | ], 186 | "text/plain": [ 187 | "QueryResultsTable job_tbt23N58wYdiUQoFPqOqv4QpiWw" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "%%sql\n", 197 | "\n", 198 | "SELECT\n", 199 | " Gene_Name,\n", 200 | " COUNT(*) AS n\n", 201 | "FROM (\n", 202 | " SELECT\n", 203 | " Gene_Name,\n", 204 | " Protein_Name,\n", 205 | " FROM\n", 206 | " $rppa_BQtable\n", 207 | " GROUP BY\n", 208 | " Gene_Name,\n", 209 | " Protein_Name )\n", 210 | "GROUP BY\n", 211 | " Gene_Name\n", 212 | "HAVING\n", 213 | " ( n > 1 )\n", 214 | "ORDER BY\n", 215 | " n DESC" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "Let's look further in the the EIF4EBP1 gene which has the most different proteins being measured:" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/html": [ 235 | "\n", 236 | "
Gene_Name | Protein_Name    | Phospho  | antibodySource | validationStatus
EIF4EBP1  | 4E-BP1          |          | R              | V
EIF4EBP1  | 4E-BP1_pS65     | pS65     | R              | V
EIF4EBP1  | 4E-BP1_pT37_T46 | pT37_T46 | R              | V
EIF4EBP1  | 4E-BP1_pT70     | pT70     | R              | C
EIF4EBP1  | 4E-BP1_pT70     | pT70     | R              | V
\n", 237 | "
(rows: 5, time: 1.1s, 45MB processed, job: job_SgenmfoKWUaoks4yU5uW3SxRf9s)
\n", 238 | " \n", 252 | " " 253 | ], 254 | "text/plain": [ 255 | "QueryResultsTable job_SgenmfoKWUaoks4yU5uW3SxRf9s" 256 | ] 257 | }, 258 | "execution_count": 7, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "%%sql\n", 265 | "\n", 266 | "SELECT\n", 267 | " Gene_Name,\n", 268 | " Protein_Name,\n", 269 | " Phospho,\n", 270 | " antibodySource,\n", 271 | " validationStatus\n", 272 | "FROM\n", 273 | " $rppa_BQtable\n", 274 | "WHERE\n", 275 | " ( Gene_Name=\"EIF4EBP1\" )\n", 276 | "GROUP BY\n", 277 | " Gene_Name,\n", 278 | " Protein_Name,\n", 279 | " Phospho,\n", 280 | " antibodySource,\n", 281 | " validationStatus\n", 282 | "ORDER BY\n", 283 | " Gene_Name,\n", 284 | " Protein_Name,\n", 285 | " Phospho,\n", 286 | " antibodySource,\n", 287 | " validationStatus" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Some antibodies are non-specific and bind to protein products from multiple genes in a gene family. One example of this is the AKT1, AKT2, AKT3 gene family. This non-specificity is indicated in the antibody-annotation file by a list of gene symbols, but in this table, we duplicate the entries (as well as the data values) on multiple rows:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 8, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/html": [ 307 | "\n", 308 | "
Gene_Name | Protein_Name | Phospho | antibodySource | validationStatus
AKT1      | Akt          |         | R              | V
AKT1      | Akt_pS473    | pS473   | R              | V
AKT1      | Akt_pT308    | pT308   | R              | V
AKT1S1    | PRAS40_pT246 | pT246   | R              | V
AKT2      | Akt          |         | R              | V
AKT2      | Akt_pS473    | pS473   | R              | V
AKT2      | Akt_pT308    | pT308   | R              | V
AKT3      | Akt          |         | R              | V
AKT3      | Akt_pS473    | pS473   | R              | V
AKT3      | Akt_pT308    | pT308   | R              | V
\n", 309 | "
(rows: 10, time: 1.5s, 45MB processed, job: job_U2oT67OWVVo4W9ta4ssOKLiXeCI)
\n", 310 | " \n", 324 | " " 325 | ], 326 | "text/plain": [ 327 | "QueryResultsTable job_U2oT67OWVVo4W9ta4ssOKLiXeCI" 328 | ] 329 | }, 330 | "execution_count": 8, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "%%sql\n", 337 | "\n", 338 | "SELECT\n", 339 | " Gene_Name,\n", 340 | " Protein_Name,\n", 341 | " Phospho,\n", 342 | " antibodySource,\n", 343 | " validationStatus\n", 344 | "FROM\n", 345 | " $rppa_BQtable\n", 346 | "WHERE\n", 347 | " ( Gene_Name CONTAINS \"AKT\" )\n", 348 | "GROUP BY\n", 349 | " Gene_Name,\n", 350 | " Protein_Name,\n", 351 | " Phospho,\n", 352 | " antibodySource,\n", 353 | " validationStatus\n", 354 | "ORDER BY\n", 355 | " Gene_Name,\n", 356 | " Protein_Name,\n", 357 | " Phospho,\n", 358 | " antibodySource,\n", 359 | " validationStatus" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 9, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/html": [ 372 | "\n", 373 | "
SampleBarcode    | Study | Gene_Name | Protein_Name | Protein_Expression
TCGA-02-0003-01A | GBM   | AKT1      | Akt          | -0.0372666185
TCGA-02-0003-01A | GBM   | AKT2      | Akt          | -0.0372666185
TCGA-02-0003-01A | GBM   | AKT3      | Akt          | -0.0372666185
TCGA-02-0004-01A | GBM   | AKT1      | Akt          | -1.14494074
TCGA-02-0004-01A | GBM   | AKT2      | Akt          | -1.14494074
TCGA-02-0004-01A | GBM   | AKT3      | Akt          | -1.14494074
TCGA-02-0011-01B | GBM   | AKT1      | Akt          | -0.319130557
TCGA-02-0011-01B | GBM   | AKT2      | Akt          | -0.319130557
TCGA-02-0011-01B | GBM   | AKT3      | Akt          | -0.319130557
\n", 374 | "
(rows: 9, time: 1.1s, 89MB processed, job: job_UqRCkgBuVxiDDh0kWKoFgfHvcag)
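Because the same antibody measurement is repeated on one row per associated gene symbol, many analyses will want to collapse the table back to one value per sample and antibody before doing any clustering or correlation work. A small pandas sketch, assuming the query results above have been pulled into a DataFrame with the columns shown (for example via a to_dataframe() call):

```python
# Collapse duplicated multi-gene antibody rows (e.g. the AKT1/2/3 "Akt" rows
# above, which carry identical values) to one value per sample and antibody,
# then pivot into a sample x antibody matrix.
import pandas as pd

def collapse_antibody_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Keep a single expression value per (SampleBarcode, Protein_Name)."""
    deduped = df.drop_duplicates(subset=['SampleBarcode', 'Protein_Name'])
    return deduped.pivot(index='SampleBarcode', columns='Protein_Name',
                         values='Protein_Expression')
```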
\n", 375 | " \n", 389 | " " 390 | ], 391 | "text/plain": [ 392 | "QueryResultsTable job_UqRCkgBuVxiDDh0kWKoFgfHvcag" 393 | ] 394 | }, 395 | "execution_count": 9, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "%%sql\n", 402 | "\n", 403 | "SELECT\n", 404 | " SampleBarcode,\n", 405 | " Study,\n", 406 | " Gene_Name,\n", 407 | " Protein_Name,\n", 408 | " Protein_Expression\n", 409 | "FROM\n", 410 | " $rppa_BQtable\n", 411 | "WHERE\n", 412 | " ( Protein_Name=\"Akt\" )\n", 413 | "ORDER BY\n", 414 | " SampleBarcode,\n", 415 | " Gene_Name\n", 416 | "LIMIT\n", 417 | " 9" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": true 425 | }, 426 | "outputs": [], 427 | "source": [] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "Python 2", 433 | "language": "python", 434 | "name": "python2" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 2 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython2", 446 | "version": "2.7.9" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 0 451 | } 452 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # examples-Python/notebooks 2 | The **notebooks** subdirectory of this repository contains a series of IPython notebooks that are intended to help you get started working with the ISB-CGC hosted [TCGA](http://cancergenome.nih.gov/) [data](https://tcga-data.nci.nih.gov/tcga/tcgaHome2.jsp) in BigQuery. 3 | 4 | ### Where to start? 5 | You can find an overview of the BigQuery tables in this [notebook](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/The%20ISB-CGC%20open-access%20TCGA%20tables%20in%20BigQuery.ipynb) and from there, we suggest that you look at the two "Creating TCGA cohorts" notebooks ([part 1](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%201.ipynb) and [part 2](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%202.ipynb)) which describe and make use of the Clinical and Biospecimen tables. From there you can delve into the various molecular data tables as well as the Annotations table. For now these sample notebooks are intentionally relatively simple and do not do any analysis that integrates data from multiple tables but once you have a grasp of how to use the data, developing your own more complex analyses should not be difficult. You could even contribute an example back to our github repository! You are also welcome to submit bug reports, comments, and feature-requests as [github issues](https://github.com/isb-cgc/examples-Python/issues). 6 | 7 | ### How to run the notebooks 8 | 9 | 1. Launch your own Cloud Datalab instance [in the cloud](https://cloud.google.com/datalab/getting-started) or [run it locally](https://github.com/GoogleCloudPlatform/datalab#using-datalab-and-getting-started). 10 | 2. Work through the introductory notebooks that are pre-installed on Cloud Datalab. 11 | 3. Run `git clone https://github.com/isb-cgc/examples-Python.git` on your local file system to download the notebooks. 12 | 4. 
Import the ISB-CGC notebooks into your Cloud Datalab instance by navigating to the notebook list page and uploading them. 13 | 14 | ### Important Note 15 | If you are running in the cloud, a typical Google Cloud Datalab VM will cost your project approximately **$1/day**, so be sure to **shut down** Cloud Datalab when you are no longer using it. (Shutting down a "session" within IPython sessions does *not* shut down the VM.) Shut down instructions and other tips are [here](https://cloud.google.com/datalab/getting-started) -- look for the section called **Stopping/Starting VM Instances**. 16 | -------------------------------------------------------------------------------- /notebooks/The ISB-CGC open-access TCGA tables in BigQuery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The ISB-CGC open-access TCGA tables in Big-Query\n", 8 | "\n", 9 | "The goal of this notebook is to introduce you to a new publicly-available, open-access dataset in BigQuery. This set of BigQuery tables was produced by the [ISB-CGC](http://www.isb-cgc.org) project, based on the open-access [TCGA](http://cancergenome.nih.gov/) data available at the TCGA [Data Portal](https://tcga-data.nci.nih.gov/tcga/). You will need to have access to a Google Cloud Platform (GCP) project in order to use BigQuery. If you don't already have one, you can sign up for a [free-trial](https://cloud.google.com/free-trial/) or contact [us](mailto://info@isb-cgc.org) and become part of the community evaluation phase of our Cancer Genomics Cloud pilot. (You can find more information about this NCI-funded program [here](https://cbiit.nci.nih.gov/ncip/nci-cancer-genomics-cloud-pilots).)\n", 10 | "\n", 11 | "We are not attempting to provide a thorough BigQuery or IPython tutorial here, as a wealth of such information already exists. Here are links to some resources that you might find useful: \n", 12 | "* [BigQuery](https://cloud.google.com/bigquery/what-is-bigquery), \n", 13 | "* the BigQuery [web UI](https://bigquery.cloud.google.com/) where you can run queries interactively, \n", 14 | "* [IPython](http://ipython.org/) (now known as [Jupyter](http://jupyter.org/)), and \n", 15 | "* [Cloud Datalab](https://cloud.google.com/datalab/) the recently announced interactive cloud-based platform that this notebook is being developed on. \n", 16 | "\n", 17 | "There are also many tutorials and samples available on github (see, in particular, the [datalab](https://github.com/GoogleCloudPlatform/datalab) repo and the [Google Genomics]( https://github.com/googlegenomics) project).\n", 18 | "\n", 19 | "In order to work with BigQuery, the first thing you need to do is import the [gcp.bigquery](http://googlecloudplatform.github.io/datalab/gcp.bigquery.html) package:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 6, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import gcp.bigquery as bq" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The next thing you need to know is how to access the specific tables you are interested in. BigQuery tables are organized into datasets, and datasets are owned by a specific GCP project. The tables we are introducing in this notebook are in a dataset called **`tcga_201607_beta`**, owned by the **`isb-cgc`** project. A full table identifier is of the form `:.`. 
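For readers working outside Cloud Datalab, a hedged sketch of the same dataset and table handles with the google-cloud-bigquery client (an assumption; the cells in this notebook use the Datalab `gcp.bigquery` module) looks like this:

```python
# Sketch of listing the tables in the isb-cgc:tcga_201607_beta dataset with
# the google-cloud-bigquery client instead of the Datalab module used below.
from google.cloud import bigquery

client = bigquery.Client()
dataset_ref = bigquery.DatasetReference('isb-cgc', 'tcga_201607_beta')
for table_item in client.list_tables(dataset_ref):
    table = client.get_table(table_item.reference)
    print('%10d rows %14d bytes  %s' % (table.num_rows, table.num_bytes,
                                        table.table_id))
```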
Let's start by getting some basic information about the tables in this dataset:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 7, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | " 6322 rows 1729204 bytes Annotations\n", 52 | " 23797 rows 6382147 bytes Biospecimen_data\n", 53 | " 11160 rows 4201379 bytes Clinical_data\n", 54 | " 2646095 rows 333774244 bytes Copy_Number_segments\n", 55 | "3944304319 rows 445303830985 bytes DNA_Methylation_betas\n", 56 | " 382335670 rows 43164264006 bytes DNA_Methylation_chr1\n", 57 | " 197519895 rows 22301345198 bytes DNA_Methylation_chr10\n", 58 | " 235823572 rows 26623975945 bytes DNA_Methylation_chr11\n", 59 | " 198050739 rows 22359642619 bytes DNA_Methylation_chr12\n", 60 | " 97301675 rows 10986815862 bytes DNA_Methylation_chr13\n", 61 | " 123239379 rows 13913712352 bytes DNA_Methylation_chr14\n", 62 | " 124566185 rows 14064712239 bytes DNA_Methylation_chr15\n", 63 | " 179772812 rows 20296128173 bytes DNA_Methylation_chr16\n", 64 | " 234003341 rows 26417830751 bytes DNA_Methylation_chr17\n", 65 | " 50216619 rows 5669139362 bytes DNA_Methylation_chr18\n", 66 | " 211386795 rows 23862583107 bytes DNA_Methylation_chr19\n", 67 | " 279668485 rows 31577200462 bytes DNA_Methylation_chr2\n", 68 | " 86858120 rows 9805923353 bytes DNA_Methylation_chr20\n", 69 | " 35410447 rows 3997986812 bytes DNA_Methylation_chr21\n", 70 | " 70676468 rows 7978947938 bytes DNA_Methylation_chr22\n", 71 | " 201119616 rows 22705358910 bytes DNA_Methylation_chr3\n", 72 | " 159148744 rows 17968482285 bytes DNA_Methylation_chr4\n", 73 | " 195864180 rows 22113162401 bytes DNA_Methylation_chr5\n", 74 | " 290275524 rows 32772371379 bytes DNA_Methylation_chr6\n", 75 | " 240010275 rows 27097948808 bytes DNA_Methylation_chr7\n", 76 | " 164810092 rows 18607886221 bytes DNA_Methylation_chr8\n", 77 | " 81260723 rows 9173717922 bytes DNA_Methylation_chr9\n", 78 | " 98082681 rows 11072059468 bytes DNA_Methylation_chrX\n", 79 | " 2330426 rows 263109775 bytes DNA_Methylation_chrY\n", 80 | " 1867233 rows 207365611 bytes Protein_RPPA_data\n", 81 | " 5356089 rows 5715538107 bytes Somatic_Mutation_calls\n", 82 | " 5738048 rows 657855993 bytes mRNA_BCGSC_GA_RPKM\n", 83 | " 38299138 rows 4459086535 bytes mRNA_BCGSC_HiSeq_RPKM\n", 84 | " 44037186 rows 5116942528 bytes mRNA_BCGSC_RPKM\n", 85 | " 16794358 rows 1934755686 bytes mRNA_UNC_GA_RSEM\n", 86 | " 211284521 rows 24942992190 bytes mRNA_UNC_HiSeq_RSEM\n", 87 | " 228078879 rows 26877747876 bytes mRNA_UNC_RSEM\n", 88 | " 11997545 rows 2000881026 bytes miRNA_BCGSC_GA_isoform\n", 89 | " 4503046 rows 527101917 bytes miRNA_BCGSC_GA_mirna\n", 90 | " 90237323 rows 15289326462 bytes miRNA_BCGSC_HiSeq_isoform\n", 91 | " 28207741 rows 3381212265 bytes miRNA_BCGSC_HiSeq_mirna\n", 92 | " 102234868 rows 17290207488 bytes miRNA_BCGSC_isoform\n", 93 | " 32710787 rows 3908314182 bytes miRNA_BCGSC_mirna\n", 94 | " 26763022 rows 3265303352 bytes miRNA_Expression\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "d = bq.DataSet('isb-cgc:tcga_201607_beta')\n", 100 | "for t in d.tables():\n", 101 | " print '%10d rows %12d bytes %s' \\\n", 102 | " % (t.metadata.rows, t.metadata.size, t.name.table_id)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "These tables are based on the open-access TCGA data as of July 2016. 
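(As a quick sanity check, a minimal query sketch against one of the smaller tables just listed -- using Datalab's `%%sql` cell magic and legacy-SQL table syntax, and assuming the `Clinical_data` table carries the `Study` field used elsewhere in these notebooks -- might be `SELECT Study, COUNT(*) AS n FROM [isb-cgc:tcga_201607_beta.Clinical_data] GROUP BY Study ORDER BY n DESC LIMIT 5`.) 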
The molecular data is all \"Level 3\" data, and is divided according to platform/pipeline. See [here](https://tcga-data.nci.nih.gov/tcga/tcgaDataType.jsp) for additional details regarding the TCGA data levels and data types.\n", 110 | "\n", 111 | "Additional notebooks go into each of these tables in more detail, but here is an overview, in the same alphabetical order that they are listed in above and in the BigQuery web UI:\n", 112 | "\n", 113 | "\n", 114 | "- **Annotations**: This table contains the annotations that are also available from the interactive [TCGA Annotations Manager](https://tcga-data.nci.nih.gov/annotations/). Annotations can be associated with any type of \"item\" (*eg* Patient, Sample, Aliquot, etc), and a single item may have more than one annotation. Common annotations include \"Item flagged DNU\", \"Item is noncanonical\", and \"Prior malignancy.\" More information about this table can be found in the [TCGA Annotations](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/TCGA%20Annotations.ipynb) notebook.\n", 115 | "\n", 116 | "\n", 117 | "- **Biospecimen_data**: This table contains information obtained from the \"biospecimen\" and \"auxiliary\" XML files in the TCGA Level-1 \"bio\" archives. Each row in this table represents a single \"biospecimen\" or \"sample\". Most participants in the TCGA project provided two samples: a \"primary tumor\" sample and a \"blood normal\" sample, but others provided normal-tissue, metastatic, or other types of samples. This table contains metadata about all of the samples, and more information about exploring this table and using this information to create your own custom analysis cohort can be found in the [Creating TCGA cohorts (part 1)](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%201.ipynb) and [(part 2)](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%202.ipynb) notebooks.\n", 118 | "\n", 119 | "\n", 120 | "- **Clinical_data**: This table contains information obtained from the \"clinical\" XML files in the TCGA Level-1 \"bio\" archives. Not all fields in the XML files are represented in this table, but any field which was found to be significantly filled-in for at least one tumor-type has been retained. More information about exploring this table and using this information to create your own custom analysis cohort can be found in the [Creating TCGA cohorts (part 1)](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%201.ipynb) and [(part 2)](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%202.ipynb) notebooks.\n", 121 | "\n", 122 | "\n", 123 | "- **Copy_Number_segments**: This table contains Level-3 copy-number segmentation results generated by The Broad Institute, from Genome Wide SNP 6 data using the CBS (Circular Binary Segmentation) algorithm. The values are base2 log(copynumber/2), centered on 0. More information about this data table can be found in the [Copy Number segments](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Copy%20Number%20segments.ipynb) notebook.\n", 124 | "\n", 125 | "\n", 126 | "- **DNA_Methylation_betas**: This table contains Level-3 summary measures of DNA methylation for each interrogated locus (beta values: M/(M+U)). This table contains data from two different platforms: the Illumina Infinium HumanMethylation 27k and 450k arrays. 
More information about this data table can be found in the [DNA Methylation](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/DNA%20Methylation.ipynb) notebook. Note that individual chromosome-specific DNA Methylation tables are also available to cut down on the amount of data that you may need to query (depending on yoru use case). \n", 127 | "\n", 128 | "\n", 129 | "- **Protein_RPPA_data**: This table contains the normalized Level-3 protein expression levels based on each antibody used to probe the sample. More information about how this data was generated by the RPPA Core Facility at MD Anderson can be found [here](https://wiki.nci.nih.gov/display/TCGA/Protein+Array+Data+Format+Specification#ProteinArrayDataFormatSpecification-Expression-Protein), and more information about this data table can be found in the [Protein expression](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Protein%20expression.ipynb) notebook.\n", 130 | "\n", 131 | "\n", 132 | "- **Somatic_Mutation_calls**: This table contains annotated somatic mutation calls. All current MAF (Mutation Annotation Format) files were annotated using [Oncotator](http://onlinelibrary.wiley.com/doi/10.1002/humu.22771/abstract;jsessionid=15E7960BA5FEC21EE608E6D262390C52.f01t04) v1.5.1.0, and merged into a single table. More information about this data table can be found in the [Somatic Mutations](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Somatic%20Mutations.ipynb) notebook, including an example of how to use the [Tute Genomics annotations database in BigQuery](http://googlegenomics.readthedocs.org/en/latest/use_cases/annotate_variants/tute_annotation.html).\n", 133 | "\n", 134 | "\n", 135 | "- **mRNA_BCGSC_HiSeq_RPKM**: This table contains mRNAseq-based gene expression data produced by the [BC Cancer Agency](http://www.bcgsc.ca/). (For details about a very similar table, take a look at a [notebook](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/UNC%20HiSeq%20mRNAseq%20gene%20expression.ipynb) describing the other mRNAseq gene expression table.)\n", 136 | "\n", 137 | "\n", 138 | "- **mRNA_UNC_HiSeq_RSEM**: This table contains mRNAseq-based gene expression data produced by [UNC Lineberger](https://unclineberger.org/). More information about this data table can be found in the [UNC HiSeq mRNAseq gene expression](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/UNC%20HiSeq%20mRNAseq%20gene%20expression.ipynb) notebook.\n", 139 | "\n", 140 | "\n", 141 | "- **miRNA_expression**: This table contains miRNAseq-based expression data for mature microRNAs produced by the [BC Cancer Agency](http://www.bcgsc.ca/). More information about this data table can be found in the [microRNA expression](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/BCGSC%20microRNA%20expression.ipynb) notebook." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Where to start?\n", 149 | "We suggest that you start with the two \"Creating TCGA cohorts\" notebooks ([part 1](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%201.ipynb) and [part 2](https://github.com/isb-cgc/examples-Python/blob/master/notebooks/Creating%20TCGA%20cohorts%20--%20part%202.ipynb)) which describe and make use of the Clinical and Biospecimen tables. From there you can delve into the various molecular data tables as well as the Annotations table. 
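(A first, purely illustrative peek at any of these tables can be as simple as `SELECT * FROM [isb-cgc:tcga_201607_beta.Annotations] LIMIT 10`, just to see which fields are available before turning to the notebook that covers that table in depth.) 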
For now these sample notebooks are intentionally relatively simple and do not do any analysis that integrates data from multiple tables but once you have a grasp of how to use the data, developing your own more complex analyses should not be difficult. You could even contribute an example back to our github repository! You are also welcome to submit bug reports, comments, and feature-requests as [github issues](https://github.com/isb-cgc/examples-Python/issues)." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### A note about BigQuery tables and \"tidy data\"\n", 157 | "You may be used to thinking about a molecular data table such as a gene-expression table as a matrix where the rows are genes and the columns are samples (or *vice versa*). These BigQuery tables instead use the [tidy data](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) approach, with each \"cell\" from the traditional data-matrix becoming a single row in the BigQuery table. A 10,000 gene x 500 sample matrix would therefore become a 5,000,000 row BigQuery table." 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 2", 164 | "language": "python", 165 | "name": "python2" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 2 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython2", 177 | "version": "2.7.9" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 0 182 | } 183 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # examples-Python/python 2 | The **python** subdirectory of this repository contains examples to help you get started working with the data and tools hosted by the ISB-CGC on the Google Cloud Platform. There are three endpoints based on the three programs in the Genomics Data Commons: 3 | * **isb_cgc_ccle_api** 4 | * **isb_cgc_target_api** 5 | * **isb_cgc_tcga_api** 6 | and one endpoint for the cohort apis that can be cross-program: 7 | * **isb_cgc_api** 8 | 9 | ### Programmatic API Examples 10 | The ISB-CGC programmatic API is implemented using Google Cloud Endpoints. These services run on App Engine and can be accessed in several different ways. You can use the [APIs Explorer](https://apis-explorer.appspot.com/apis-explorer/?base=https://api-dot-isb-cgc.appspot.com/_ah/api#p/) to try them out directly in your web browser. 11 | * **isb_auth.py** is a help script that takes care of the auth required for the cohort endpoint APIs 12 | * **isb_cgc_api_v3_cases** shows you how to build a service object and run the isb_cgc_tcga_api cases get API. This cases get API is also part of the isb_cgc_ccle_api and isb_cgc_target_api endpoints. 13 | * **isb_cgc_api_v3_cohorts** shows you how to build a service object and run the isb_cgc_tcga_api cohorts create and preview APIs. These cohorts APIs are also part of the isb_cgc_ccle_api and isb_cgc_target_api endpoints. It also shows how to build a service object and run the isb_cgc_api cohorts get, list, cloud_storage_file_paths, and delete APIs 14 | * **isb_cgc_api_v3_samples** shows you how to build a service object and run the isb_cgc_tcga_api samples get and cloud_storage_file_paths APIs. 
These samples APIs are also part of the isb_cgc_ccle_api and isb_cgc_target_api endpoints. 15 | * **isb_cgc_api_v3_users** shows you how to build a service object and run the isb_cgc_tcga_api users get API. This users get API is also part of the isb_cgc_ccle_api and isb_cgc_target_api endpoints. 16 | These are some helper scripts for other aspects of the Google Cloud 17 | * **query_ccle_reads.py** script illustrates the usage of the GA4GH API for open-access CCLE reads 18 | * **createSchema.py** script generates a JSON schema for an input data file. This is useful when the data file has a large number of columns, so you can avoid manual creation of its schema. This can be used with the 'bq load' command line tool to load data to BigQuery (https://cloud.google.com/bigquery/quickstart-command-line). 19 | -------------------------------------------------------------------------------- /python/createSchema.py: -------------------------------------------------------------------------------- 1 | # This script generates a JSON schema for a given data file to 2 | # be used with the 'bq load' command line tool. 3 | # ------------------------------------------------------------- 4 | 5 | import sys 6 | import string 7 | import gzip 8 | 9 | from dateutil.parser import parse 10 | 11 | # ------------------------------------------------------------- 12 | 13 | # INPUT: path to local data file 14 | # OUTPUT: JSON schema to stdout 15 | 16 | # BigQuery data types = ['string','bytes','integer','float','boolean','record','timestamp'] 17 | # BigQuery modes = ['nullable','required','repeated'] , default is nullable 18 | 19 | # ------------------------------------------------------------- 20 | 21 | # function to check is a given value is numeric 22 | def isNumeric(val): 23 | try: 24 | float(val) 25 | return True 26 | except ValueError: 27 | return False 28 | 29 | # -------------------------------------------------------------- 30 | 31 | specialChars = [ ' ', '-', ')', '(', ',', ':', ';', '.', '@', 32 | '#', '$', '^', '&', '*', '[', ']', '{', 33 | '}', '|', '/', '?' ] 34 | 35 | def removeSpecialChars ( aString ): 36 | 37 | bString = '' 38 | for ii in range(len(aString)): 39 | if ( aString[ii] in specialChars ): 40 | if ( len(bString) > 0 ): 41 | if ( bString[-1] != "_" ): bString += '_' 42 | elif ( aString[ii] == '%' ): 43 | bString += 'pct' 44 | else: 45 | bString += aString[ii] 46 | 47 | try: 48 | if ( bString[-1] == "_" ): bString = bString[:-1] 49 | except: 50 | doNothing = 1 51 | 52 | if ( bString != aString ): 53 | print " removeSpecialChars : <%s> <%s> " % ( aString, bString ) 54 | 55 | return ( bString ) 56 | 57 | # -------------------------------------------------------------- 58 | 59 | def letter_or_underscore ( aChar ): 60 | 61 | io = ord(aChar) 62 | if ( io == 95 ): return ( 1 ) 63 | if ( io>=64 and io<=90 ): return ( 1 ) 64 | if ( io>=97 and io<=122 ): return ( 1 ) 65 | return ( 0 ) 66 | 67 | # -------------------------------------------------------------- 68 | 69 | def valid_char ( aChar ): 70 | 71 | io = ord(aChar) 72 | if ( io == 95 ): return ( 1 ) 73 | if ( io>=48 and io<=57 ): return ( 1 ) 74 | if ( io>=64 and io<=90 ): return ( 1 ) 75 | if ( io>=97 and io<=122 ): return ( 1 ) 76 | return ( 0 ) 77 | 78 | # -------------------------------------------------------------- 79 | 80 | def createValidBQfieldName ( aString ): 81 | 82 | ## print " " 83 | ## print " in createValidBQfieldName ... 
<%s> " % aString 84 | 85 | bString = removeSpecialChars ( aString ) 86 | ## print " <%s> " % bString 87 | 88 | ## make sure that the following is satisfied: 89 | ## Fields must contain only letters, numbers, and underscores, start 90 | ## with a letter or underscore, and be at most 128 characters long. 91 | 92 | if ( len(bString) > 128 ): 93 | cString = createValidBQfieldName ( bString[:128] ) 94 | else: 95 | cString = bString 96 | 97 | ## check first character: 98 | ## print " <%s> " % cString 99 | try: 100 | if not letter_or_underscore ( cString[0] ): 101 | print " createValidBQfieldName: first character is not valid <%s> " % cString 102 | sys.exit(-1) 103 | except: 104 | doNothing = 1 105 | 106 | ## check all other characters: 107 | for ii in range(len(cString)): 108 | if not valid_char ( cString[ii] ): 109 | print " createValidBQfieldName: invalid character at position %d <%s> " % ( ii, cString ) 110 | sys.exit(-1) 111 | 112 | return ( cString ) 113 | 114 | # -------------------------------------------------------------- 115 | 116 | def splitListString ( aString ): 117 | 118 | ## print " in splitListString : <%s> " % aString 119 | 120 | aTokens = [] 121 | if ( aString.startswith("u'") ): 122 | ii = 2 123 | while ( ii < len(aString) ): 124 | jj = aString.find("'",ii) 125 | if ( jj > ii ): 126 | aTokens += [ aString[ii:jj] ] 127 | ii = jj 128 | ii = aString.find("'",jj+1) 129 | if ( ii < 0 ): ii = len(aString) 130 | 131 | else: 132 | aTokens = aString.split(',') 133 | 134 | return ( aTokens ) 135 | 136 | # -------------------------------------------------------------- 137 | # this function infers the "types" and "modes" for each 138 | # of the input fields, based on a single input dataRow 139 | 140 | def inferDataTM ( dataRow, dataTypes, dataModes, fieldNames ): 141 | 142 | for ii in range(len(dataRow)): 143 | 144 | item = dataRow[ii].strip() 145 | if ( len(item) < 1 ): continue 146 | 147 | if ( item[0] == '[' and item[-1] == ']' ): 148 | ## print ii, item 149 | ## print " this item looks like a LIST !!! " 150 | aList = item[1:-1] 151 | aToks = splitListString ( aList ) 152 | if ( len(aToks) > 0 ): 153 | if ( dataModes[ii] == 'NA' ): 154 | print " initially setting field #%d (%s) mode to REPEATED " % ( ii, fieldNames[ii] ) 155 | dataModes[ii] = "repeated" 156 | elif ( dataModes[ii] == 'nullable' ): 157 | print " CHANGING field #%d (%s) mode to REPEATED " % ( ii, fieldNames[ii] ) 158 | dataModes[ii] = "repeated" 159 | 160 | else: 161 | aToks = [ item ] 162 | if ( dataModes[ii] == 'NA' ): 163 | print " initially setting field #%d (%s) mode to NULLABLE " % ( ii, fieldNames[ii] ) 164 | dataModes[ii] = "nullable" 165 | 166 | for jtem in aToks: 167 | 168 | if ( jtem == '' or jtem == 'NA'): 169 | ## print " SKIPPING field #%d because it is blank ... " % ii 170 | continue 171 | 172 | elif ( dataTypes[ii] == "string" ): 173 | ## print " SKIPPING field #%d because it is already a STRING " % ii 174 | continue 175 | 176 | elif ( jtem.lower()=="true" or jtem.lower()=="false" ): 177 | if ( dataTypes[ii] == "NA" ): 178 | print " initially setting field #%d (%s) type to BOOLEAN (%s) " % ( ii, fieldNames[ii], jtem ) 179 | dataTypes[ii] = "boolean" 180 | elif ( dataTypes[ii] == "boolean" ): 181 | continue 182 | else: 183 | print " ERROR ??? conflicting data types ??? 
", jtem, dataTypes[ii] 184 | dataTypes[ii] = "string" 185 | 186 | else: 187 | 188 | try: 189 | iVal = int(jtem) 190 | if ( dataTypes[ii] == "NA" ): 191 | print " initially setting field #%d (%s) type to INTEGER (%s) " % ( ii, fieldNames[ii], jtem ) 192 | dataTypes[ii] = "integer" 193 | elif ( dataTypes[ii] == "integer" ): 194 | continue 195 | elif ( dataTypes[ii] == "float" ): 196 | continue 197 | else: 198 | print " ERROR ??? conflicting data types ??? ", jtem, dataTypes[ii] 199 | dataTypes[ii] = "string" 200 | 201 | except: 202 | try: 203 | fVal = float(jtem) 204 | if ( dataTypes[ii] == "NA" ): 205 | print " initially setting field #%d (%s) type to FLOAT (%s) " % ( ii, fieldNames[ii], jtem ) 206 | dataTypes[ii] = "float" 207 | elif ( dataTypes[ii] == "float" ): 208 | continue 209 | elif ( dataTypes[ii] == "integer" ): 210 | print " CHANGING field #%d (%s) from INTEGER to FLOAT (%s) " % ( ii, fieldNames[ii], jtem ) 211 | dataTypes[ii] = "float" 212 | continue 213 | else: 214 | print " ERROR ??? conflicting data types ??? ", jtem, dataTypes[ii] 215 | dataTypes[ii] = "string" 216 | 217 | except: 218 | if ( dataTypes[ii] == "NA" ): 219 | print " initially setting field #%d (%s) type to STRING (%s) " % ( ii, fieldNames[ii], jtem ) 220 | else: 221 | print " CHANGING field #%d (%s) to STRING (%s) " % ( ii, fieldNames[ii], jtem ) 222 | dataTypes[ii] = "string" 223 | 224 | ## print dataTypes 225 | return ( dataTypes, dataModes ) 226 | 227 | # -------------------------------------------------------------- 228 | 229 | # TODO: clean up this code ... also make it capable of handling 230 | # both TSVs and CSVs ... and look at the shlex module/class 231 | 232 | if ( len(sys.argv) == 1 ): 233 | print " " 234 | print " Usage : %s " 235 | print " * nSkip: # of lines skipped between lines that are parsed and checked for data-types; " 236 | print " if the input file is small, you can leave set nSkip to be small, but if the input " 237 | print " file is very large, nSkip should probably be 1000 or more (default value is 1000) " 238 | print " * topSkip: # of lines to be completely skipped at the top of the file (default 0) " 239 | print " " 240 | sys.exit(-1) 241 | 242 | inFilename = sys.argv[1] 243 | 244 | ## this is the # of lines that we'll skip over each time we 245 | ## read and parse a single line of data ... 246 | nSkip = 1000 247 | if ( len(sys.argv) >= 3 ): 248 | nSkip = int ( sys.argv[2] ) 249 | if ( nSkip < 0 ): nSkip = 0 250 | 251 | topSkip = 0 252 | if ( len(sys.argv) >= 4 ): 253 | topSkip = int ( sys.argv[3] ) 254 | if ( topSkip < 0 ): topSkip = 0 255 | 256 | ## scratch file ... 257 | dmpFh = file ( "subsample.tsv", 'w' ) 258 | 259 | # open data file ... 260 | try: 261 | if inFilename.endswith('gz'): 262 | dataFile = gzip.open(inFilename,"r") 263 | else: 264 | dataFile = open(inFilename,"r") 265 | except: 266 | print 'requires input filename as command-line parameter' 267 | if ( len(inFilename) > 0 ): 268 | print ' --> failed to open <%s> ' % inFilename 269 | sys.exit() 270 | 271 | print " " 272 | print "Parsing input file <%s>." % inFilename 273 | print " " 274 | 275 | # start by skipping the specified 'topSkip' lines ... 
276 | for i in range(topSkip): 277 | aLine = dataFile.readline() 278 | print " skipping line %d <%s...> " % ( (i+1), aLine[:16] ) 279 | 280 | # first line is expected to be the header 281 | aLine = dataFile.readline() 282 | dmpFh.write ( '%s' % aLine ) 283 | headerRow = aLine.split('\t') 284 | 285 | # if any numeric values in this first line, it is likely not a header: hence exit 286 | if any([isNumeric(x) for x in headerRow]): 287 | print 'Numeric fields found in the first line. Perhaps the header is missing. Please check input file.' 288 | print headerRow 289 | sys.exit() 290 | 291 | # build up a list of field names based on the header tokens and make sure they 292 | # are all unique 293 | fieldNames = [] 294 | lowerNames = [] 295 | for ii in range(len(headerRow)): 296 | aName = removeSpecialChars ( headerRow[ii].strip() ) 297 | aName = createValidBQfieldName ( headerRow[ii].strip() ) 298 | 299 | if ( aName.lower() in lowerNames ): 300 | print " ERROR: repeated header token <%s> " % aName 301 | print " --> appending 'X' --> <%sX> " % aName 302 | aName = aName + 'X' 303 | ## sys.exit(-1) 304 | 305 | if ( aName == "" ): 306 | print " ERROR: blank header token ??? " 307 | sys.exit(-1) 308 | 309 | fieldNames += [ aName ] 310 | lowerNames += [ aName.lower() ] 311 | 312 | print " " 313 | print fieldNames 314 | print " " 315 | 316 | dataTypes = ['NA'] * len(fieldNames) 317 | dataModes = ['NA'] * len(fieldNames) 318 | 319 | 320 | done = 0 321 | while not done: 322 | 323 | # next, read a data row to infer column data types 324 | aLine = dataFile.readline() 325 | dmpFh.write ( '%s' % aLine ) 326 | dataRow = aLine.split('\t') 327 | 328 | if ( len(dataRow) == 1 ): 329 | done = 1 330 | continue 331 | 332 | if ( len(dataRow) != len(fieldNames) ): 333 | print " ERROR ??? # of values in data row is not as expected ??? ", len(dataRow), len(fieldNames) 334 | print " " 335 | for ii in range(min(len(dataRow),len(fieldNames))): 336 | print " %3d %s %s " % ( ii, fieldNames[ii], dataRow[ii] ) 337 | sys.exit(-1) 338 | 339 | ( dataTypes, dataModes ) = inferDataTM ( dataRow, dataTypes, dataModes, fieldNames ) 340 | 341 | ## skip over a bunch of rows, we don't want to check every single row, 342 | ## just a few of them at random ... 343 | for jj in range(nSkip): 344 | dataRow = dataFile.readline() 345 | if ( len(dataRow) < 1 ): done = 1 346 | 347 | dataFile.close() 348 | dmpFh.close() 349 | 350 | schemaFilename = inFilename + ".json" 351 | try: 352 | fhOut = file ( schemaFilename, 'w' ) 353 | except: 354 | print " ERROR??? failed to open output schema file??? " 355 | print schemaFilename 356 | sys.exit(-1) 357 | 358 | print " " 359 | print " " 360 | print "writing draft JSON schema to <%s> " % schemaFilename 361 | print " " 362 | 363 | # print opening bracket 364 | fhOut.write ( '[\n' ) 365 | 366 | # the available data types are described in detail at: https://cloud.google.com/bigquery/data-types 367 | # and include: STRING, BYTES, INTEGER, FLOAT, BOOLEAN ('true' or 'false'), 368 | # RECORD, and TIMESTAMP 369 | # here we will only try to infer STRING, INTEGER, FLOAT, or BOOLEAN 370 | 371 | # loop through the columns 372 | for ii in range(len(fieldNames)): 373 | 374 | # in case we got this far w/o a dataType getting set ... 
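    # (a column that was blank or 'NA' in every sampled row falls back to a nullable string;
    #  each emitted schema entry then looks like, for a hypothetical integer column "age":
    #  {"name": "age", "type": "integer", "mode": "nullable", "description": ""})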
375 | if ( dataTypes[ii] == "NA" ): dataTypes[ii] = "string" 376 | if ( dataModes[ii] == "NA" ): dataModes[ii] = "nullable" 377 | 378 | outStr = ' {"name": "'+fieldNames[ii]+'", "type": "'+dataTypes[ii]+'", "mode": "'+dataModes[ii]+'", "description": ""}' 379 | if ( ii < len(fieldNames)-1 ): 380 | outStr+=',' 381 | 382 | fhOut.write ( '%s\n' % outStr ) 383 | 384 | # print closing bracket 385 | fhOut.write ( ']\n' ) 386 | 387 | fhOut.close() 388 | 389 | # -------------------------------------------------------------- 390 | -------------------------------------------------------------------------------- /python/createSchemaP3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2015-2019, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | # 18 | # This version of the script has been modified to work with Python 3, and 19 | # can also be called as a function within python 20 | # 21 | 22 | # This script generates a JSON schema for a given data file to 23 | # be used with the 'bq load' command line tool. 24 | # ------------------------------------------------------------- 25 | 26 | import sys 27 | import string 28 | import gzip 29 | 30 | from dateutil.parser import parse 31 | 32 | 33 | # ------------------------------------------------------------- 34 | 35 | # INPUT: path to local data file 36 | # OUTPUT: JSON schema to stdout 37 | 38 | # BigQuery data types = ['string','bytes','integer','float','boolean','record','timestamp'] 39 | # BigQuery modes = ['nullable','required','repeated'] , default is nullable 40 | 41 | # ------------------------------------------------------------- 42 | 43 | # function to check is a given value is numeric 44 | def isNumeric(val): 45 | try: 46 | float(val) 47 | return True 48 | except ValueError: 49 | return False 50 | 51 | 52 | # -------------------------------------------------------------- 53 | 54 | specialChars = [' ', '-', ')', '(', ',', ':', ';', '.', '@', 55 | '#', '$', '^', '&', '*', '[', ']', '{', 56 | '}', '|', '/', '?'] 57 | 58 | 59 | def removeSpecialChars(aString): 60 | bString = '' 61 | for ii in range(len(aString)): 62 | if (aString[ii] in specialChars): 63 | if (len(bString) > 0): 64 | if (bString[-1] != "_"): bString += '_' 65 | elif (aString[ii] == '%'): 66 | bString += 'pct' 67 | else: 68 | bString += aString[ii] 69 | 70 | try: 71 | if (bString[-1] == "_"): bString = bString[:-1] 72 | except: 73 | doNothing = 1 74 | 75 | print(" removeSpecialChars : <%s> <%s> " % (aString, bString)) 76 | return (bString) 77 | 78 | 79 | # -------------------------------------------------------------- 80 | 81 | def letter_or_underscore(aChar): 82 | io = ord(aChar) 83 | if (io == 95): return (1) 84 | if (io >= 64 and io <= 90): return (1) 85 | if (io >= 97 and io <= 122): return (1) 86 | return (0) 87 | 88 | 89 | # -------------------------------------------------------------- 90 | 91 | def valid_char(aChar): 92 | io = ord(aChar) 93 | if (io == 95): 
return (1) 94 | if (io >= 48 and io <= 57): return (1) 95 | if (io >= 64 and io <= 90): return (1) 96 | if (io >= 97 and io <= 122): return (1) 97 | return (0) 98 | 99 | 100 | # -------------------------------------------------------------- 101 | 102 | def createValidBQfieldName(aString): 103 | ## print " " 104 | ## print " in createValidBQfieldName ... <%s> " % aString 105 | 106 | bString = removeSpecialChars(aString) 107 | ## print " <%s> " % bString 108 | 109 | ## make sure that the following is satisfied: 110 | ## Fields must contain only letters, numbers, and underscores, start 111 | ## with a letter or underscore, and be at most 128 characters long. 112 | 113 | if (len(bString) > 128): 114 | cString = createValidBQfieldName(bString[:128]) 115 | else: 116 | cString = bString 117 | 118 | ## check first character: 119 | print(" <%s> " % cString) 120 | try: 121 | if not letter_or_underscore(cString[0]): 122 | print(" createValidBQfieldName: first character is not valid <%s> " % cString) 123 | sys.exit(-1) 124 | except: 125 | doNothing = 1 126 | 127 | ## check all other characters: 128 | for ii in range(len(cString)): 129 | if not valid_char(cString[ii]): 130 | print(" createValidBQfieldName: invalid character at position %d <%s> " % (ii, cString)) 131 | sys.exit(-1) 132 | 133 | return (cString) 134 | 135 | 136 | # -------------------------------------------------------------- 137 | 138 | def inferDataTypes(dataRow, dataTypes, fieldNames): 139 | for ii in range(len(dataRow)): 140 | 141 | item = dataRow[ii].strip() 142 | 143 | if (item == '' or item == 'NA'): 144 | ## print " SKIPPING field #%d because it is blank ... " % ii 145 | continue 146 | 147 | elif (dataTypes[ii] == "string"): 148 | ## print " SKIPPING field #%d because it is already a STRING " % ii 149 | continue 150 | 151 | elif (item.lower() == "true" or item.lower() == "false"): 152 | if (dataTypes[ii] == "NA"): 153 | print(" initially setting field #%d (%s) to BOOLEAN (%s) " % (ii, fieldNames[ii], item)) 154 | dataTypes[ii] = "boolean" 155 | elif (dataTypes[ii] == "boolean"): 156 | continue 157 | else: 158 | print(" ERROR ??? conflicting data types ??? ", item, dataTypes[ii]) 159 | dataTypes[ii] = "string" 160 | 161 | else: 162 | 163 | try: 164 | iVal = int(item) 165 | if (dataTypes[ii] == "NA"): 166 | print(" initially setting field #%d (%s) to INTEGER (%s) " % (ii, fieldNames[ii], item)) 167 | dataTypes[ii] = "integer" 168 | elif (dataTypes[ii] == "integer"): 169 | continue 170 | elif (dataTypes[ii] == "float"): 171 | continue 172 | else: 173 | print(" ERROR ??? conflicting data types ??? ", item, dataTypes[ii]) 174 | dataTypes[ii] = "string" 175 | 176 | except: 177 | try: 178 | fVal = float(item) 179 | if (dataTypes[ii] == "NA"): 180 | print(" initially setting field #%d (%s) to FLOAT (%s) " % (ii, fieldNames[ii], item)) 181 | dataTypes[ii] = "float" 182 | elif (dataTypes[ii] == "float"): 183 | continue 184 | elif (dataTypes[ii] == "integer"): 185 | print(" CHANGING field #%d (%s) from INTEGER to FLOAT (%s) " % (ii, fieldNames[ii], item)) 186 | dataTypes[ii] = "float" 187 | continue 188 | else: 189 | print(" ERROR ??? conflicting data types ??? 
", item, dataTypes[ii]) 190 | dataTypes[ii] = "string" 191 | 192 | except: 193 | if (dataTypes[ii] == "NA"): 194 | print(" initially setting field #%d (%s) to STRING (%s) " % (ii, fieldNames[ii], item)) 195 | else: 196 | print(" CHANGING field #%d (%s) to STRING (%s) " % (ii, fieldNames[ii], item)) 197 | dataTypes[ii] = "string" 198 | 199 | ## print dataTypes) 200 | return (dataTypes) 201 | 202 | 203 | def main(args): 204 | if len(args) == 1: 205 | print(" ") 206 | print(" Usage : %s ") 207 | print(" where nSkip specifies the # of lines skipped between ") 208 | print(" lines that are parsed and checked for data-types; ") 209 | print(" if the input file is small, you can leave set nSkip ") 210 | print(" to be small, but if the input file is very large, nSkip ") 211 | print(" should probably be 1000 or more (default value is 1000) ") 212 | print(" ") 213 | sys.exit(-1) 214 | 215 | ## this is the # of lines that we'll skip over each time we 216 | ## read and parse a single line of data . 217 | 218 | nSkip = 1000 219 | if len(args) == 3: 220 | nSkip = int(args[2]) 221 | if (nSkip < 0): nSkip = 0 222 | 223 | build_schema(args[1], nSkip) 224 | 225 | # -------------------------------------------------------------- 226 | 227 | def build_schema(inFilename, nSkip): 228 | 229 | ## scratch file ... 230 | dmpFh = open("subsample.tsv", 'w') 231 | 232 | # open data file ... 233 | try: 234 | if inFilename.endswith('gz'): 235 | dataFile = gzip.open(inFilename, "r") 236 | else: 237 | dataFile = open(inFilename, "r") 238 | except: 239 | print('requires input filename as command-line parameter') 240 | if (len(inFilename) > 0): 241 | print(' --> failed to open <%s> ' % inFilename) 242 | sys.exit() 243 | 244 | print(" ") 245 | print("Parsing input file <%s>." % inFilename) 246 | print(" ") 247 | 248 | # first line is expected to be the header 249 | aLine = dataFile.readline() 250 | dmpFh.write('%s' % aLine) 251 | headerRow = aLine.split('\t') 252 | 253 | # if any numeric values in this first line, it is likely not a header: hence exit 254 | if any([isNumeric(x) for x in headerRow]): 255 | print('Numeric fields found in the first line. Perhaps the header is missing. Please check input file.') 256 | print(headerRow) 257 | sys.exit() 258 | 259 | # build up a list of field names based on the header tokens and make sure they 260 | # are all unique 261 | fieldNames = [] 262 | lowerNames = [] 263 | for ii in range(len(headerRow)): 264 | aName = removeSpecialChars(headerRow[ii].strip()) 265 | aName = createValidBQfieldName(headerRow[ii].strip()) 266 | 267 | if (aName.lower() in lowerNames): 268 | print(" ERROR: repeated header token <%s> " % aName) 269 | print(" --> appending 'X' --> <%sX> " % aName) 270 | aName = aName + 'X' 271 | ## sys.exit(-1) 272 | 273 | if (aName == ""): 274 | print(" ERROR: blank header token ??? ") 275 | sys.exit(-1) 276 | 277 | fieldNames += [aName] 278 | lowerNames += [aName.lower()] 279 | 280 | print(" ") 281 | print(fieldNames) 282 | print(" ") 283 | 284 | dataTypes = ['NA'] * len(fieldNames) 285 | 286 | done = 0 287 | while not done: 288 | 289 | # next, read a data row to infer column data types 290 | aLine = dataFile.readline() 291 | dmpFh.write('%s' % aLine) 292 | dataRow = aLine.split('\t') 293 | 294 | if (len(dataRow) == 1): 295 | done = 1 296 | continue 297 | 298 | if (len(dataRow) != len(fieldNames)): 299 | print(" ERROR ??? # of values in data row is not as expected ??? 
", len(dataRow), len(fieldNames)) 300 | print(" ") 301 | for ii in range(min(len(dataRow), len(fieldNames))): 302 | print(" %3d %s %s " % (ii, fieldNames[ii], dataRow[ii])) 303 | sys.exit(-1) 304 | 305 | dataTypes = inferDataTypes(dataRow, dataTypes, fieldNames) 306 | 307 | ## skip over a bunch of rows, we don't want to check every single row, 308 | ## just a few of them at random ... 309 | for jj in range(nSkip): 310 | dataRow = dataFile.readline() 311 | if (len(dataRow) < 1): done = 1 312 | 313 | dataFile.close() 314 | dmpFh.close() 315 | 316 | schemaFilename = inFilename + ".json" 317 | try: 318 | fhOut = open(schemaFilename, 'w') 319 | except: 320 | print(" ERROR??? failed to open output schema file??? ") 321 | print(schemaFilename) 322 | sys.exit(-1) 323 | 324 | print(" ") 325 | print(" ") 326 | print("writing draft JSON schema to <%s> " % schemaFilename) 327 | print(" ") 328 | 329 | # print opening bracket) 330 | fhOut.write('[\n') 331 | 332 | # the available data types are described in detail at: https://cloud.google.com/bigquery/data-types 333 | # and include: STRING, BYTES, INTEGER, FLOAT, BOOLEAN ('true' or 'false'), 334 | # RECORD, and TIMESTAMP 335 | # here we will only try to infer STRING, INTEGER, FLOAT, or BOOLEAN 336 | 337 | retval = [] 338 | # loop through the columns 339 | for ii in range(len(fieldNames)): 340 | 341 | # in case we got this far w/o a dataType getting set ... 342 | if (dataTypes[ii] == "NA"): 343 | dataTypes[ii] = "string" 344 | retval.append((fieldNames[ii], dataTypes[ii])) 345 | outStr = ' {"name": "' + fieldNames[ii] + '", "type": "' + dataTypes[ 346 | ii] + '", "mode": "nullable", "description": ""}' 347 | if (ii < len(fieldNames) - 1): 348 | outStr += ',' 349 | 350 | fhOut.write('%s\n' % outStr) 351 | 352 | # print closing bracket 353 | fhOut.write(']\n') 354 | 355 | fhOut.close() 356 | 357 | return retval 358 | 359 | # -------------------------------------------------------------- 360 | 361 | if __name__ == "__main__": 362 | main(sys.argv) -------------------------------------------------------------------------------- /python/gdcCase2Cohort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Copyright 2017, Institute for Systems Biology. 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | 17 | Create ISB-CGC cohorts from a GDC Case JSON file 18 | 19 | From the command line use the following options: 20 | 21 | -c/--credentialsfile OPTIONAL: This defaults to the .isb_credentials file in the user's home directory. If one 22 | does not exist, it will be created even if this option is not used. 23 | 24 | -i/--inputfile REQUIRED: This is the JSON file obtained from the GDC that will be parsed to obtain the 25 | case IDs. 26 | 27 | -n/--cohortname REQUIRED: This is the name that will be assigned to the cohort. While this has no impact on 28 | cohort creation, it should be descriptive enough to be useful. 
29 | 30 | ''' 31 | 32 | from oauth2client.client import OAuth2WebServerFlow 33 | from oauth2client import tools 34 | from oauth2client.file import Storage 35 | from googleapiclient.discovery import build 36 | from googleapiclient.errors import HttpError 37 | import os 38 | import argparse 39 | import httplib2 40 | import json 41 | 42 | # the CLIENT_ID for the ISB-CGC site 43 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 44 | # The google-specified 'installed application' OAuth pattern 45 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 46 | # The google defined scope for authorization 47 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 48 | # where a default credentials file will be stored for use by the endpoints 49 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 50 | 51 | 52 | def get_credentials(credFile): 53 | oauth_flow_args = ['--noauth_local_webserver'] 54 | if credFile is None: 55 | storage = Storage(DEFAULT_STORAGE_FILE) 56 | else: 57 | storage = Storage(credFile) 58 | 59 | credentials = storage.get() 60 | if not credentials or credentials.invalid: 61 | flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, EMAIL_SCOPE) 62 | flow.auth_uri = flow.auth_uri.rstrip('/') + '?approval_prompt=force' 63 | credentials = tools.run_flow(flow, storage, tools.argparser.parse_args(oauth_flow_args)) 64 | return credentials 65 | 66 | 67 | def get_authorized_service(api, version, site, credentials): 68 | discovery_url = '%s/_ah/api/discovery/v1/apis/%s/%s/rest' % (site, api, version) 69 | http = credentials.authorize(httplib2.Http()) 70 | if credentials.access_token_expired or credentials.invalid: 71 | credentials.refresh(http) 72 | authorized_service = build(api, version, discoveryServiceUrl=discovery_url, http=http) 73 | return authorized_service 74 | 75 | def parseGDCCase(filename): 76 | inputfile = open(filename,'r') 77 | data = json.load(inputfile) 78 | uuids = [] 79 | 80 | for entry in data: 81 | uuids.append(entry['case_id']) 82 | 83 | return uuids 84 | 85 | def cohortsCreate(service, name, body): 86 | try: 87 | data = service.cohorts().create(name=name, body=body).execute() 88 | return data 89 | except HttpError as exception: 90 | raise exception 91 | 92 | def main(args): 93 | #Main variables 94 | api = "isb_cgc_tcga_api" 95 | version = "v3" 96 | site = "https://api-dot-isb-cgc.appspot.com" 97 | 98 | #Set up credentials and API service 99 | credentials = get_credentials(args.credentialsfile) 100 | service = get_authorized_service(api, version, site, credentials) 101 | 102 | #Parse the case IDs from the GDC case file 103 | uuids = parseGDCCase(args.inputfile) 104 | 105 | #Create the cohort 106 | query = {"case_gdc_id" : uuids} 107 | try: 108 | data = cohortsCreate(service, args.cohortname, query) 109 | except HttpError as exception: 110 | print exception 111 | 112 | 113 | 114 | 115 | if __name__ == "__main__": 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("-c", "--credentialsfile", nargs = '?', const = None , help="File to use for credentials, will default to ~/.isb_credentials if left blank") 118 | parser.add_argument("-i", "--inputfile", required = True, help = "GDC Case JSON file") 119 | parser.add_argument("-n", "--cohortname", nargs = '?', const = None, required = True, help = "Provide a name for the cohort") 120 | args = parser.parse_args() 121 | 122 | main(args) 123 | -------------------------------------------------------------------------------- /python/isb_auth.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | Authenticates user for accessing the ISB-CGC Endpoint APIs. 17 | 18 | May be run from the command line or in scripts/ipython. 19 | 20 | The credentials file can be copied to any machine from which you want 21 | to access the API. 22 | 23 | 1. Command Line 24 | python ./isb_auth.py saves the user's credentials; 25 | OPTIONAL: 26 | -v for verbose (returns token!) 27 | -s FILE sets credentials file [default: ~/.isb_credentials] 28 | -u URL-only: for use over terminal connections; 29 | gives user a URL to paste into their browser, 30 | and asks for an auth code in return 31 | 32 | 2. Python 33 | import isb_auth 34 | isb_auth.get_credentials() 35 | 36 | # optional: to store credentials in a different location 37 | from oauth2client.file import Storage 38 | import isb_auth 39 | import os 40 | 41 | storage_file = os.path.join(os.path.expanduser("~"), "{USER_CREDENTIALS_FILE_NAME}") 42 | storage = Storage(storage_file) 43 | isb_auth.get_credentials(storage=storage) 44 | ''' 45 | 46 | from argparse import ArgumentParser 47 | import os 48 | 49 | from oauth2client.client import OAuth2WebServerFlow 50 | from oauth2client import tools 51 | from oauth2client.file import Storage 52 | 53 | VERBOSE = False 54 | # for native application - same as settings.INSTALLED_APP_CLIENT_ID 55 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 56 | # NOTE: this is NOT actually a 'secret' -- we're using the 'installed 57 | # application' OAuth pattern here 58 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 59 | 60 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 61 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 62 | 63 | 64 | def maybe_print(msg): 65 | if VERBOSE: 66 | print msg 67 | 68 | 69 | def get_credentials(storage=None, oauth_flow_args=[]): 70 | noweb = '--noauth_local_webserver' 71 | if __name__ != '__main__' and noweb not in oauth_flow_args: 72 | oauth_flow_args.append(noweb) 73 | if storage is None: 74 | storage = Storage(DEFAULT_STORAGE_FILE) 75 | credentials = storage.get() 76 | if not credentials or credentials.invalid: 77 | maybe_print('credentials missing/invalid, kicking off OAuth flow') 78 | flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, EMAIL_SCOPE) 79 | flow.auth_uri = flow.auth_uri.rstrip('/') + '?approval_prompt=force' 80 | credentials = tools.run_flow(flow, storage, tools.argparser.parse_args(oauth_flow_args)) 81 | return credentials 82 | 83 | 84 | def main(): 85 | global VERBOSE 86 | args = parse_args() 87 | oauth_flow_args = [args.noauth_local_webserver] if args.noauth_local_webserver else [] 88 | VERBOSE = args.verbose 89 | maybe_print('--verbose: printing extra information') 90 | storage = Storage(args.storage_file) 91 | credentials = get_credentials(storage, oauth_flow_args) 92 | 
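    # at this point the (possibly newly obtained) credentials are cached in args.storage_file,
    # so the other example scripts can reuse them without repeating the OAuth flow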
maybe_print('credentials stored in ' + args.storage_file) 93 | maybe_print('access_token: ' + credentials.access_token) 94 | maybe_print('refresh_token: ' + credentials.refresh_token) 95 | 96 | def parse_args(): 97 | parser = ArgumentParser() 98 | parser.add_argument('--storage_file', '-s', default=DEFAULT_STORAGE_FILE, help='storage file to use for the credentials (default is {})'.format(DEFAULT_STORAGE_FILE)) 99 | parser.add_argument('--verbose', '-v', dest='verbose', action='store_true', help='display credentials storage location, access token, and refresh token') 100 | parser.set_defaults(verbose=False) 101 | parser.add_argument('--noauth_local_webserver','-u', action='store_const', const='--noauth_local_webserver') 102 | return parser.parse_args() 103 | 104 | if __name__ == '__main__': 105 | main() 106 | 107 | -------------------------------------------------------------------------------- /python/isb_cgc_api_v3_cases.py: -------------------------------------------------------------------------------- 1 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 2 | 3 | ''' 4 | Copyright 2018, Institute for Systems Biology. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 20 | 21 | from argparse import ArgumentParser 22 | from googleapiclient.discovery import build 23 | 24 | import httplib2 25 | import os 26 | import pprint 27 | import sys 28 | 29 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 30 | 31 | # the CLIENT_ID for the ISB-CGC site 32 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 33 | # The google-specified 'installed application' OAuth pattern 34 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 35 | # The google defined scope for authorization 36 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 37 | # where a default credentials file will be stored for use by the endpoints 38 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 39 | 40 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 41 | 42 | def get_unauthorized_service(): 43 | api = 'isb_cgc_tcga_api' 44 | version = 'v3' 45 | site = "https://api-dot-isb-cgc.appspot.com" 46 | discovery_url = '%s/_ah/api/discovery/v1/apis/%s/%s/rest' % (site, api, version) 47 | return build(api, version, discoveryServiceUrl=discovery_url, http=httplib2.Http()) 48 | 49 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 50 | 51 | # the example uses the TCGA-specific endpoint but the same functionality 52 | # exists in the TARGET and CCLE endpoints as well 53 | 54 | def get(service, barcode): 55 | """ 56 | Usage: python isb_cgc_api_v3_cases.py -b TCGA-W5-AA2R 57 | """ 58 | data = service.cases().get(case_barcode=barcode).execute() 59 | print '\nResults from cases().get()' 60 | pprint.pprint(data) 61 | 62 | # 
-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 63 | 64 | def print_usage(): 65 | print " " 66 | print " Usage: python %s -b " % sys.argv[0] 67 | print " " 68 | print " Example: python %s -b TCGA-W5-AA2R " % sys.argv[0] 69 | print " " 70 | 71 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 72 | 73 | def main(): 74 | parser = ArgumentParser() 75 | parser.add_argument('--barcode', '-b', type=str, required=True, 76 | action='store', help='Case barcode. Example: TCGA-W5-AA2R') 77 | try: 78 | args = parser.parse_args() 79 | except: 80 | print_usage() 81 | return 82 | 83 | service = get_unauthorized_service() 84 | get(service, args.barcode) 85 | 86 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 87 | 88 | if __name__ == '__main__': 89 | main() 90 | 91 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 92 | -------------------------------------------------------------------------------- /python/isb_cgc_api_v3_cohorts.py: -------------------------------------------------------------------------------- 1 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 2 | 3 | ''' 4 | Copyright 2018, Institute for Systems Biology. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | ''' 18 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 19 | 20 | from argparse import ArgumentParser 21 | from googleapiclient.discovery import build 22 | from oauth2client.client import OAuth2WebServerFlow 23 | from oauth2client import tools 24 | from oauth2client.file import Storage 25 | 26 | import httplib2 27 | import json 28 | import os 29 | import pprint 30 | import sys 31 | 32 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 33 | 34 | # the CLIENT_ID for the ISB-CGC site 35 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 36 | # The google-specified 'installed application' OAuth pattern 37 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 38 | # The google defined scope for authorization 39 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 40 | # where a default credentials file will be stored for use by the endpoints 41 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 42 | 43 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 44 | # validate the credentials of the current user against the ISB-CGC site 45 | 46 | def get_credentials(): 47 | oauth_flow_args = ['--noauth_local_webserver'] 48 | storage = Storage(DEFAULT_STORAGE_FILE) 49 | credentials = storage.get() 50 | if not credentials or credentials.invalid: 51 | flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, EMAIL_SCOPE) 52 | flow.auth_uri = flow.auth_uri.rstrip('/') + '?approval_prompt=force' 53 | credentials = tools.run_flow(flow, storage, tools.argparser.parse_args(oauth_flow_args)) 54 | return credentials 55 | 56 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 57 | 58 | def get_authorized_service(api_tag): 59 | api = 'isb_cgc{}_api'.format(api_tag) 60 | version = 'v3' 61 | site = "https://api-dot-isb-cgc.appspot.com" 62 | discovery_url = '%s/_ah/api/discovery/v1/apis/%s/%s/rest' % (site, api, version) 63 | 64 | credentials = get_credentials() 65 | http = credentials.authorize(httplib2.Http()) 66 | 67 | if credentials.access_token_expired or credentials.invalid: 68 | credentials.refresh(http) 69 | 70 | authorized_service = build(api, version, discoveryServiceUrl=discovery_url, http=http) 71 | 72 | return authorized_service 73 | 74 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 75 | # resource methods 76 | # the following four APIs can be cross-program and are part of the isb_cgc_api endpoint 77 | 78 | def get(service, cohort_id=None, body=None, name=None): 79 | """ 80 | Usage: python isb_cgc_api_v3_cohorts.py -e get -c 24 81 | """ 82 | data = service.cohorts().get(cohort_id=cohort_id).execute() 83 | print '\nresults from cohorts().get()' 84 | pprint.pprint(data) 85 | 86 | def list(service, cohort_id=None, body=None, name=None): 87 | """ 88 | Usage: python isb_cgc_api_v3_cohorts.py -e list 89 | """ 90 | data = service.cohorts().list().execute() 91 | print '\nresults from cohorts().list()' 92 | pprint.pprint(data) 93 | 94 | return data 95 | 96 | def delete(service, cohort_id=None, body=None, name=None): 97 | """ 98 | Usage: python isb_cgc_api_v3_cohorts.py -e delete -c 24 99 | """ 100 | data = service.cohorts().delete(cohort_id=cohort_id).execute() 101 | print '\nresults from cohorts().delete()' 102 | pprint.pprint(data) 103 | 104 | def cloud_storage_file_paths(service, cohort_id, body=None, name=None): 105 | 106 | """ 107 | Usage: python isb_cgc_api_v3_cohorts.py -e 
cloud_storage_file_paths -c 24 108 | """ 109 | 110 | data = service.cohorts().cloud_storage_file_paths(cohort_id=cohort_id).execute() 111 | print '\nresults from cohorts().cloud_storage_file_paths()' 112 | pprint.pprint(data) 113 | 114 | return data 115 | 116 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 117 | # the following two APIs are specific to a program endpoint -- the examples 118 | # here use the TCGA-specific endpoint, but are also available within the 119 | # TARGET- and CCLE-specific APIs 120 | 121 | def preview(service, cohort_id=None, body=None, name=None): 122 | """ 123 | Usage: python isb_cgc_api_v3_cohorts.py -e preview -b '{"project_short_name": ["TCGA-BRCA", "TCGA-UCS"], "age_at_diagnosis_gte": 90}' 124 | """ 125 | data = service.cohorts().preview(**body).execute() 126 | print '\nresults from cohorts().preview()' 127 | pprint.pprint(data['sample_count']) 128 | 129 | def create(service, cohort_id=None, body=None, name=None): 130 | """ 131 | Usage: python isb_cgc_api_v3_cohorts.py -e create -n mycohortname -b '{"project_short_name": ["TCGA-BRCA", "TCGA-UCS"], "age_at_diagnosis_gte": 90}' 132 | """ 133 | data = service.cohorts().create(name=name, body=body).execute() 134 | print '\nresults from cohorts().create()' 135 | pprint.pprint(data) 136 | 137 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 138 | 139 | def print_usage(): 140 | print " " 141 | print " Usage: python %s --endpoint ... " % sys.argv[0] 142 | print " " 143 | print " Examples: " 144 | print " python %s --endpoint list " % sys.argv[0] 145 | print " python %s --endpoint get --cohort " % sys.argv[0] 146 | print " python %s --endpoint delete --cohort " % sys.argv[0] 147 | print " python %s --endpoint cloud_storage_file_paths --cohort " % sys.argv[0] 148 | bodyString = '{"project_short_name": ["TCGA-BRCA", "TCGA-UCS"], "age_at_diagnosis_gte": 90}' 149 | ## print " python %s --endpoint preview --body '%s' " % ( sys.argv[0], bodyString ) 150 | ## print " python %s --endpoint create --name myNewCohort --body '%s' " % ( sys.argv[0], bodyString ) 151 | print " " 152 | 153 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 154 | 155 | def main(): 156 | parser = ArgumentParser() 157 | parser.add_argument('--endpoint', '-e', 158 | help='Name of cohorts endpoint to execute. ' 159 | 'Options: get, list, preview, create, delete, cloud_storage_file_paths') 160 | parser.add_argument('--cohort_id', '-c', 161 | help='Id of cohort to use in get, delete, or cloud_storage_file_paths') 162 | parser.add_argument('--body', '-b', 163 | help='Payload to use in preview or create endpoints. 
Example: ' 164 | '{"project_short_name": ["TCGA-BRCA", "TCGA-UCS"], "age_at_diagnosis_gte": 90') 165 | parser.add_argument('--name', '-n', 166 | help='The name of the cohort to create in the create endpoint.') 167 | try: 168 | args = parser.parse_args() 169 | except: 170 | print_usage() 171 | return 172 | 173 | if args.endpoint not in ['get', 'list', 'preview', 'create', 'delete', 'cloud_storage_file_paths']: 174 | print_usage() 175 | return 176 | 177 | if args.endpoint not in ['list', 'preview', 'create']: 178 | if ( args.cohort_id is None ): 179 | print " " 180 | print " an integer cohort identifier is required for this endpoint " 181 | print " " 182 | print_usage() 183 | return 184 | 185 | if args.endpoint in ['preview', 'create']: 186 | print " " 187 | print " Cohort preview and create functionality is not currently available from the ISB-CGC APIs. " 188 | print " " 189 | return 190 | if ( args.body is None ): 191 | print " " 192 | print " at least one filter must be specified using the 'body' argument " 193 | print " in order to %s a cohort " % args.endpoint 194 | print " " 195 | print_usage() 196 | return 197 | 198 | api_tag = '_tcga' if args.endpoint in ('preview', 'create') else '' 199 | service = get_authorized_service(api_tag) 200 | cohort_id = args.cohort_id if args.cohort_id is None else int(args.cohort_id) 201 | body = json.loads(args.body) if args.body is not None else args.body 202 | 203 | globals()[args.endpoint](service, cohort_id=cohort_id, body=body, name=args.name) 204 | 205 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 206 | 207 | if __name__ == '__main__': 208 | main() 209 | 210 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 211 | -------------------------------------------------------------------------------- /python/isb_cgc_api_v3_samples.py: -------------------------------------------------------------------------------- 1 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 2 | ''' 3 | Copyright 2018, Institute for Systems Biology. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | ''' 17 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 18 | 19 | from argparse import ArgumentParser 20 | from googleapiclient.discovery import build 21 | from oauth2client.client import OAuth2WebServerFlow 22 | from oauth2client import tools 23 | from oauth2client.file import Storage 24 | 25 | import httplib2 26 | import os 27 | import pprint 28 | import sys 29 | 30 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 31 | 32 | # the CLIENT_ID for the ISB-CGC site 33 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 34 | # The google-specified 'installed application' OAuth pattern 35 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 36 | # The google defined scope for authorization 37 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 38 | # where a default credentials file will be stored for use by the endpoints 39 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 40 | 41 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 42 | # this API does not require the user to be authenticated ... 43 | 44 | def get_unauthorized_service(): 45 | api = 'isb_cgc_tcga_api' 46 | version = 'v3' 47 | site = "https://api-dot-isb-cgc.appspot.com" 48 | discovery_url = '%s/_ah/api/discovery/v1/apis/%s/%s/rest' % (site, api, version) 49 | return build(api, version, discoveryServiceUrl=discovery_url, http=httplib2.Http()) 50 | 51 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 52 | 53 | # the example uses the TCGA-specific endpoint but the same functionality 54 | # exists in the TARGET and CCLE endpoints as well 55 | 56 | def get(service, barcode): 57 | """ 58 | Usage: python isb_cgc_api_v3_samples.py -e get -b TCGA-W5-AA2R-01A 59 | """ 60 | data = service.samples().get(sample_barcode=barcode).execute() 61 | print '\nresults from samples().get()' 62 | pprint.pprint(data) 63 | 64 | def cloud_storage_file_paths(service, barcode=None): 65 | """ 66 | Usage: python isb_cgc_api_v3_samples.py -e cloud_storage_file_paths -b TCGA-01-0642-11A 67 | """ 68 | data = service.samples().cloud_storage_file_paths(sample_barcode=barcode).execute() 69 | print '\nresults from samples().cloud_storage_file_paths()' 70 | pprint.pprint(data) 71 | 72 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 73 | 74 | def print_usage(): 75 | print " " 76 | print " Usage: python %s --endpoint --barcode " % sys.argv[0] 77 | print " " 78 | print " Examples: " 79 | print " python %s --endpoint get --barcode TCGA-W5-AA2R-01A " % sys.argv[0] 80 | print " python %s --endpoint cloud_storage_file_paths --barcode TCGA-W5-AA2R-01A " % sys.argv[0] 81 | print " " 82 | 83 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 84 | 85 | def main(): 86 | parser = ArgumentParser() 87 | parser.add_argument('--endpoint', '-e', 88 | help='Name of samples endpoint to execute. ' 89 | 'Options: get, cloud_storage_file_paths') 90 | parser.add_argument('--barcode', '-b', 91 | help='Sample barcode. 
Examples: TCGA-W5-AA2R-01A, TCGA-01-0642-11A') 92 | args = parser.parse_args() 93 | if args.endpoint not in ['get', 'cloud_storage_file_paths']: 94 | print_usage() 95 | return 96 | 97 | service = get_unauthorized_service() 98 | globals()[args.endpoint](service, barcode=args.barcode) 99 | 100 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 101 | 102 | if __name__ == '__main__': 103 | main() 104 | 105 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 106 | -------------------------------------------------------------------------------- /python/isb_cgc_api_v3_users.py: -------------------------------------------------------------------------------- 1 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 2 | 3 | ''' 4 | Copyright 2018, Institute for Systems Biology. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 19 | 20 | from googleapiclient.discovery import build 21 | from oauth2client.client import OAuth2WebServerFlow 22 | from oauth2client import tools 23 | from oauth2client.file import Storage 24 | import httplib2 25 | import pprint 26 | import os 27 | 28 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 29 | 30 | # the CLIENT_ID for the ISB-CGC site 31 | CLIENT_ID = '907668440978-0ol0griu70qkeb6k3gnn2vipfa5mgl60.apps.googleusercontent.com' 32 | # The google-specified 'installed application' OAuth pattern 33 | CLIENT_SECRET = 'To_WJH7-1V-TofhNGcEqmEYi' 34 | # The google defined scope for authorization 35 | EMAIL_SCOPE = 'https://www.googleapis.com/auth/userinfo.email' 36 | # where a default credentials file will be stored for use by the endpoints 37 | DEFAULT_STORAGE_FILE = os.path.join(os.path.expanduser("~"), '.isb_credentials') 38 | 39 | #------------------------------------------------------------------------------ 40 | # validate the credentials of the current user against the ISB-CGC site 41 | 42 | def get_credentials(): 43 | oauth_flow_args = ['--noauth_local_webserver'] 44 | storage = Storage(DEFAULT_STORAGE_FILE) 45 | credentials = storage.get() 46 | if not credentials or credentials.invalid: 47 | print " " 48 | print " You do not have cached credentials ... 
please follow these instructions: " 49 | print " " 50 | flow = OAuth2WebServerFlow(CLIENT_ID, CLIENT_SECRET, EMAIL_SCOPE) 51 | flow.auth_uri = flow.auth_uri.rstrip('/') + '?approval_prompt=force' 52 | credentials = tools.run_flow(flow, storage, tools.argparser.parse_args(oauth_flow_args)) 53 | return credentials 54 | 55 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 56 | 57 | def get_authorized_service(): 58 | api = 'isb_cgc_tcga_api' 59 | version = 'v3' 60 | site = "https://api-dot-isb-cgc.appspot.com" 61 | discovery_url = '%s/_ah/api/discovery/v1/apis/%s/%s/rest' % (site, api, version) 62 | 63 | credentials = get_credentials() 64 | http = credentials.authorize(httplib2.Http()) 65 | 66 | if credentials.access_token_expired or credentials.invalid: 67 | credentials.refresh(http) 68 | 69 | authorized_service = build(api, version, discoveryServiceUrl=discovery_url, http=http) 70 | 71 | return authorized_service 72 | 73 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 74 | 75 | def get(service): 76 | """ 77 | Usage: python python/isb_cgc_api_v3_users.py 78 | """ 79 | data = service.users().get().execute() 80 | print " " 81 | print " NB: this API will only return YOUR information as a user of ISB-CGC " 82 | print " " 83 | print '\nresult of users().get()' 84 | pprint.pprint(data) 85 | print " " 86 | 87 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 88 | 89 | def main(): 90 | service = get_authorized_service() 91 | get(service) 92 | 93 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 94 | 95 | if __name__ == '__main__': 96 | main() 97 | 98 | # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# 99 | -------------------------------------------------------------------------------- /python/isb_curl.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python2.7 2 | ''' 3 | Copyright 2015, Institute for Systems Biology 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | 18 | 19 | isb_curl can be called by commandline or used as a library 20 | 21 | Use the endpoint URL structure in the API Documentation 22 | https://docs.google.com/document/d/1Jax7HCmGPM7J-52c8AsSbfFcQv8L8AkB612s-50_7GU 23 | 24 | URL = https://isb-cgc.appspot.com/_ah/api/{API-NAME}/{VERSION}/{ENDPOINT}?{QUERYSTRING-PARAMS} 25 | e.g. for the "cohorts_list" endpoint: 26 | https://isb-cgc.appspot.com/_ah/api/cohort_api/v1/cohorts_list 27 | 28 | 29 | A. Command Line: 30 | python isb_auth.py # saves the user's credentials to their root directory 31 | python isb_curl.py URL 32 | note: if the endpoint takes a resource in the request body, such as the save_cohort endpoint, use the following: 33 | python isb_curl.py https://isb-cgc.appspot.com/_ah/api/cohort_api/v1/save_cohort?name={YOUR-COHORT-NAME} \ 34 | -d '{"Study": "BRCA"}' -H "Content-Type: application/json" 35 | 36 | 37 | B. 
Python: 38 | import isb_auth 39 | import isb_curl 40 | import requests 41 | 42 | url = 'https://isb-cgc.appspot.com/_ah/api/cohort_api/v1/cohorts_list' 43 | token = isb_curl.get_access_token() 44 | head = {'Authorization': 'Bearer ' + token} 45 | 46 | # for GET requests 47 | resp = requests.get(url, headers=head) 48 | # querystring parameters can be added to either the url itself... 49 | url += '?cohort_id=1' 50 | resp = requests.get(url, headers=head) 51 | # ... or passed in with the params kwarg 52 | url = 'https://isb-cgc.appspot.com/_ah/api/cohort_api/v1/cohorts_list' 53 | params = {'cohort_id': 1} 54 | resp = requests.get(url, headers=head, params=params) 55 | 56 | # if the endpoint takes a resource in the request body, such as the save_cohort endpoint... 57 | url = https://isb-cgc.appspot.com/_ah/api/cohort_api/v1/save_cohort?name=my-new-cohort' 58 | head.update({'Content-Type': 'application/json'}) 59 | payload = {"SampleBarcode": "TCGA-02-0001-01C,TCGA-02-0001-10A,TCGA-01-0642-11A"} 60 | resp = requests.post(url, headers=head, json=payload) 61 | 62 | # if requests version < 2.4.2 63 | import json 64 | resp = requests.post(url, headers=head, data=json.dumps(payload)) 65 | 66 | ''' 67 | 68 | import httplib2 69 | import os 70 | import sys 71 | from oauth2client.file import Storage 72 | 73 | CREDENTIALS_LOC_ENV = 'ISB_CREDENTIALS' 74 | DEFAULT_CREDENTIALS_LOC = os.path.join(os.path.expanduser("~"), '.isb_credentials') 75 | 76 | 77 | def check(assertion, msg): 78 | if not assertion: 79 | error(msg) 80 | 81 | def error(msg): 82 | sys.stderr.write(msg + '\n') 83 | sys.exit(1) 84 | 85 | def get_credentials_location(): 86 | credentials_location = os.environ.get(CREDENTIALS_LOC_ENV, DEFAULT_CREDENTIALS_LOC) 87 | check(credentials_location, "couldn't find ISB credentials...try running isb_auth.py") 88 | return credentials_location 89 | 90 | def load_credentials(credentials_location): 91 | storage = Storage(credentials_location) 92 | credentials = storage.get() 93 | check(credentials and not credentials.invalid, 'missing/invalid credentials...try running isb_auth.py') 94 | return credentials 95 | 96 | def get_access_token(credentials_location=get_credentials_location()): 97 | credentials = load_credentials(credentials_location) 98 | if credentials.access_token_expired: 99 | credentials.refresh(httplib2.Http()) 100 | return credentials.access_token 101 | 102 | 103 | def main(): 104 | args = sys.argv[1:] 105 | check(args, 'usage: isb_curl.py ') 106 | access_token = get_access_token() 107 | curl_args = ['curl', '-H', 'Authorization: Bearer ' + access_token] + args 108 | os.execvp('curl', curl_args) 109 | 110 | 111 | # this allows us to call this from command line 112 | if __name__ == '__main__': 113 | main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /python/melt_matrix.py: -------------------------------------------------------------------------------- 1 | ## ============================================================================= 2 | ## This script reads a flat, delimited text file containing a 'matrix' of 3 | ## some sort and 'melts' it into a 'tidy' format file. 
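## For illustration, a minimal sketch of the intended output (hypothetical
## values), starting from a matrix with one header row and one header column:
##
##     gene     TCGA-01   TCGA-02
##     ACSM5    5.1       7.3
##     NAP1L4   2.0       0.4
##
## run with --ultraMelt True, each cell becomes its own output row, with the
## column header carried in a 'dataLabel' column and the cell in 'dataValue':
##
##     gene     dataLabel   dataValue
##     ACSM5    TCGA-01     5.1
##     ACSM5    TCGA-02     7.3
##     NAP1L4   TCGA-01     2.0
##     NAP1L4   TCGA-02     0.4
##
## Example invocation (hypothetical file names; the separator is given as the
## two-character string '\t', which the option handling below converts to a
## real tab):
##
##     python melt_matrix.py -f expr_matrix.txt -t expr_tidy.txt -s '\t' -u True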
4 | ## ============================================================================= 5 | 6 | import argparse 7 | import sys 8 | 9 | ## ============================================================================= 10 | 11 | def melt_matrix ( inputFilename, tidyFilename, \ 12 | nSkipRows, nHeadRows, nHeadCols, 13 | separator, \ 14 | mergeRowLabels, mergeColLabels, \ 15 | ultraMelt ): 16 | 17 | print " in melt_matrix ... " 18 | print inputFilename 19 | print tidyFilename 20 | print nSkipRows, nHeadRows, nHeadCols 21 | print separator 22 | print mergeRowLabels, mergeColLabels, ultraMelt 23 | print " " 24 | print " " 25 | 26 | try: 27 | inFh = file ( inputFilename, 'r' ) 28 | except: 29 | print " ERROR: failed to open input file ??? " 30 | print " <%s> " % inputFilename 31 | sys.exit(-1) 32 | 33 | try: 34 | outFh = file ( tidyFilename, 'w' ) 35 | except: 36 | print " ERROR: failed to open output file ??? " 37 | print " <%s> " % tidyFilename 38 | sys.exit(-1) 39 | 40 | # start by skipping any leading rows ... 41 | if ( nSkipRows > 0 ): 42 | for ii in range(nSkipRows): 43 | aLine = inFh.readline() 44 | print " skipping this : ", aLine 45 | 46 | # start by reading the header row(s) 47 | if ( nHeadRows > 0 ): 48 | numTok = -1 49 | hdrRows = [0] * nHeadRows 50 | for ii in range(nHeadRows): 51 | aLine = inFh.readline() 52 | if ( aLine[-1] == '\n' ): aLine = aLine[:-1] 53 | print " --> got aLine : <%s> " % aLine 54 | print " separator : <%s> " % separator, len(separator) 55 | tokenList = aLine.split(separator) 56 | print " --> got tokenList : ", len(tokenList), tokenList 57 | for kk in range(len(tokenList)): 58 | tokenList[kk] = tokenList[kk].strip() 59 | print len(tokenList), tokenList 60 | hdrRows[ii] = tokenList 61 | if ( numTok < 0 ): 62 | numTok = len(tokenList) 63 | else: 64 | if ( numTok != len(tokenList) ): 65 | print " ERROR: inconsistent number of tokens ??? ", numTok, len(tokenList) 66 | print " please check input file " 67 | sys.exit(-1) 68 | ## DELETE for jj in range(nHeadCols): 69 | ## DELETE if ( tokenList[jj] != '' ): 70 | ## DELETE print " WARNING: non-blank token in column %d will be ignored (%s)." % ( jj, tokenList[jj] ) 71 | 72 | print " " 73 | print " hdrRows: " 74 | print hdrRows 75 | 76 | ## and then construct the output headerTokens 77 | ## exactly how these will be constructed depends on whether the row-labels 78 | ## and the column-labels are being merged, and also whether or not ultraMelt is ON 79 | 80 | if ( mergeRowLabels ): 81 | numOutCols = 1 82 | else: 83 | numOutCols = nHeadCols 84 | 85 | if ( ultraMelt ): 86 | numOutCols += 2 87 | else: 88 | numOutCols += (numTok - nHeadCols) 89 | print " numOutCols : ", numOutCols 90 | 91 | outColNames = [0] * numOutCols 92 | 93 | if ( mergeRowLabels ): 94 | tmpLabel = '' 95 | for ii in range(nHeadRows): 96 | print ' ii=%d ' % ii 97 | for kk in range(nHeadCols): 98 | print ' kk=%d ' % kk 99 | if ( tmpLabel != '' ): tmpLabel += '_' 100 | tmpLabel += hdrRows[ii][kk] 101 | outColNames[0] = tmpLabel 102 | oo = 1 103 | else: 104 | for kk in range(nHeadCols): 105 | tmpLabel = '' 106 | for ii in range(nHeadRows): 107 | if ( tmpLabel != '' ): tmpLabel += '_' 108 | tmpLabel += hdrRows[ii][kk] 109 | outColNames[kk] = tmpLabel 110 | oo = nHeadCols 111 | 112 | ## we will need to construct the dataLabel values from the column headers 113 | ## no matter what ... 
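## (for example -- hypothetical headers -- with two header rows where a data
## column is headed 'HiSeq' over 'TCGA-01', its dataLabel becomes the
## underscore-joined string 'HiSeq_TCGA-01')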
114 | dataLabels = [0] * (numTok-nHeadCols) 115 | for kk in range(nHeadCols,numTok): 116 | tmpLabel = '' 117 | for ii in range(nHeadRows): 118 | if ( tmpLabel != '' ): tmpLabel += '_' 119 | tmpLabel += hdrRows[ii][kk] 120 | dataLabels[kk-nHeadCols] = tmpLabel 121 | 122 | if ( ultraMelt ): 123 | outColNames[oo] = "dataLabel" 124 | outColNames[oo+1] = "dataValue" 125 | else: 126 | for kk in range(nHeadCols,numTok): 127 | outColNames[oo+kk-nHeadCols] = dataLabels[kk-nHeadCols] 128 | 129 | print " " 130 | print " output column labels : " 131 | print outColNames 132 | print " " 133 | 134 | # now we can write out a 'dummy' header row for the output file 135 | tidyRow = '\t'.join(outColNames) 136 | print " <%s> " % tidyRow 137 | outFh.write ( "%s\n" % tidyRow ) 138 | 139 | 140 | # 141 | # now read the rest of the file 142 | # 143 | 144 | done = 0 145 | lineNo = 1 146 | while not done: 147 | aLine = inFh.readline() 148 | 149 | ## check if we've gotten to the end of the file ... 150 | if ( len(aLine.strip()) < 1 ): 151 | done = 1 152 | continue 153 | 154 | if ( lineNo%1000 == 1 ): print " handling data row #%d ... " % lineNo 155 | 156 | if ( aLine[-1] == '\n' ): aLine = aLine[:-1] 157 | tokenList = aLine.split(separator) 158 | if ( numTok != len(tokenList) ): 159 | print " ERROR: inconsistent number of tokens ??? ", numTok, len(tokenList) 160 | print " please check input file " 161 | sys.exit(-1) 162 | for kk in range(len(tokenList)): 163 | tokenList[kk] = tokenList[kk].strip() 164 | ## print tokenList 165 | 166 | ## figure out how many output rows per input row ... 167 | if ( ultraMelt ): 168 | nOut = numTok - nHeadCols 169 | else: 170 | nOut = 1 171 | 172 | ## initialize the ouput values ... 173 | outVals = [0] * nOut 174 | for nn in range(nOut): 175 | outVals[nn] = [0] * numOutCols 176 | 177 | ## first decide what to do with the header column(s) 178 | if ( mergeRowLabels ): 179 | tmpLabel = '' 180 | for kk in range(nHeadCols): 181 | if ( tmpLabel != '' ): tmpLabel += '_' 182 | tmpLabel += tokenList[kk] 183 | for nn in range(nOut): 184 | outVals[nn][0] = tmpLabel 185 | oo = 1 186 | else: 187 | for kk in range(nHeadCols): 188 | for nn in range(nOut): 189 | outVals[nn][kk] = tokenList[kk] 190 | oo = nHeadCols 191 | 192 | ## and then decide what to do with the rest of the tokens 193 | if ( ultraMelt ): 194 | ## in this case, we will write out several rows for each input row 195 | for nn in range(nOut): 196 | outVals[nn][oo] = dataLabels[nn] 197 | outVals[nn][oo+1] = tokenList[nHeadCols+nn] 198 | else: 199 | for kk in range(nHeadCols,numTok): 200 | outVals[0][oo+kk-nHeadCols] = tokenList[kk] 201 | 202 | for nn in range(nOut): 203 | ## print outVals[nn] 204 | tidyRow = '\t'.join(outVals[nn]) 205 | outFh.write ( "%s\n" % tidyRow ) 206 | 207 | lineNo += 1 208 | 209 | print " " 210 | print " DONE! %d data rows processed " % lineNo 211 | print " " 212 | 213 | outFh.close() 214 | inFh.close() 215 | 216 | ## ============================================================================= 217 | 218 | def str2bool(v): 219 | ## print " in str2bool ... 
<%s> " % v 220 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 221 | return True 222 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 223 | return False 224 | else: 225 | raise argparse.ArgumentTypeError('Boolean value expected.') 226 | 227 | ## ============================================================================= 228 | 229 | if __name__ == '__main__': 230 | 231 | parser = argparse.ArgumentParser ( description='Melt 2D data matrix into a tidy-format file.' ) 232 | parser.add_argument ( '--inputFilename', '-f', action='store', required=True, type=str ) 233 | parser.add_argument ( '--tidyFilename', '-t', action='store', required=True, type=str ) 234 | parser.add_argument ( '--nSkipRows', '-k', action='store', default=0, type=int ) 235 | parser.add_argument ( '--nHeadRows', '-n', action='store', default=1, type=int ) 236 | parser.add_argument ( '--nHeadCols', '-m', action='store', default=1, type=int ) 237 | parser.add_argument ( '--separator', '-s', action='store', default='\t', type=str ) 238 | parser.add_argument ( '--mergeRowLabels', '-mr', action='store', default=False, type=str2bool ) 239 | parser.add_argument ( '--mergeColLabels', '-mc', action='store', default=True, type=str2bool ) 240 | parser.add_argument ( '--ultraMelt', '-u', action='store', default=False, type=str2bool ) 241 | 242 | args = parser.parse_args ( ) 243 | ## print args 244 | 245 | if ( args.nHeadRows < 1 ): 246 | print " ERROR: your input matrix must have at least one row (the first) containing column labels. " 247 | sys.exit(-1) 248 | 249 | if ( args.nHeadCols < 1 ): 250 | print " ERROR: your input matrix must have at least one column (the first) containing a row label. " 251 | sys.exit(-1) 252 | 253 | if ( args.nHeadRows>1 and (not args.mergeColLabels) ): 254 | print args.nHeadRows, args.mergeColLabels 255 | print (args.nHeadRows>1) 256 | print (not args.mergeColLabels) 257 | print " ERROR: if you have more than one header row, the column labels must be merged to produce one label per output column. " 258 | sys.exit(-1) 259 | 260 | if ( args.separator == '\\t' ): 261 | args.separator = chr(9) 262 | elif ( args.separator == ',' ): 263 | args.separator = ',' 264 | else: 265 | print " ERROR: unknown args.separator ... <%s> " % args.separator 266 | sys.exit(-1) 267 | 268 | melt_matrix ( args.inputFilename, args.tidyFilename, \ 269 | args.nSkipRows, args.nHeadRows, args.nHeadCols, \ 270 | args.separator, \ 271 | args.mergeRowLabels, args.mergeColLabels, \ 272 | args.ultraMelt ) 273 | 274 | ## ============================================================================= 275 | -------------------------------------------------------------------------------- /python/pairwise/README: -------------------------------------------------------------------------------- 1 | 2 | #### BQPairwise #### 3 | 4 | To run BQPairwise, you must define two 'filter files'. 5 | Each filter-file produces a sub-table from a larger table. 6 | 7 | Run: python3 bqpairwise.py [your-project-id] filter-file-1 filter-file-2 8 | ------------------------------------------------------------------------------ 9 | 10 | How to define a filter file. 
11 | ---------------------------- 12 | 13 | ***Example File*** 14 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 15 | tablejoin:case_barcode 16 | tablevar:project_short_name 17 | valuevar:normalized_count 18 | annot:isb-cgc.QotM.WikiPathways_20170425_Annotated 19 | annotkey:Symbol 20 | annotvar:pathway 21 | tablekey:HGNC_gene_symbol 22 | filter:project_short_name='TCGA-BRCA' 23 | filter:pathway='Apoptosis' 24 | tablegroup:HGNC_gene_symbol 25 | 26 | ***Definitions*** 27 | table: [the source of the data] 28 | tablejoin: [how the data can be 'paired' with another table] 29 | annotjoin: [if the joining variable is found in the annotation table, only one *join statement needed] 30 | valuevar: [the variable of interest, pairwise will be computed on this variable] 31 | tablevar: [variables that will be used in filtering or grouping must be included here, part of SELECT] 32 | tablevar: [there can be multiple listings of tablevars or related *-vars] 33 | annot: [a table used for annotation] 34 | annotkey: [variables for linking the table to the annot] 35 | tablekey: [variables for linking the table to the annot] 36 | filter: [a portion of the WHERE statement in the query] 37 | filter: [there can be multiple filter listings] 38 | tablegroup:[how the valuevar will be grouped, calculations will take place *within* the group] 39 | [tablegroup *** must be in *var or *key] 40 | 41 | -------------------------------------------------------------------------------- /python/pairwise/archive/bq_filter_file.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | ****************************************************** 17 | Using python to generate bigqueries. 18 | 19 | Here we use the 'filter file' to create subusets of data 20 | to download. 21 | ****************************************************** 22 | 23 | First need to install the BigQuery API 24 | pip3 install --upgrade google-cloud-bigquery 25 | 26 | The first time I ran the installer, there was an error. But just running pip3 27 | again seemed to work. 28 | 29 | Also we need to get authenticated. 
At the command line we: 30 | gcloud auth application-default login 31 | 32 | # table:isb-cgc.tcga_201510_alpha.DNA_Methylation_betas 33 | # tablevar:Probe_Id 34 | # annot:isb-cgc.platform_reference.methylation_annotation 35 | # annotvar:IlmnID 36 | # idvar:ParticipantBarcode 37 | # valvar:Beta_Value 38 | # pivot:UCSC.RefGene_Name # after the annotation join 39 | # filter:SampleTypeLetterCode='TP' 40 | # filter:Study='BRCA' 41 | # filter:UCSC.RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 42 | # limit:100 43 | 44 | ''' 45 | 46 | from google.cloud import bigquery 47 | import argparse 48 | import sys 49 | 50 | ko = ['idvar', 'valvar', 'pivot', 'table', 'annot', 'tablevar', 'annotvar', 'filter', 'limit'] 51 | 52 | # Some queries must be annoated before running pairwise 53 | ## to this point, some annotation fields are nested 54 | ## so we need to check the schema first. 55 | def checkSchemas(client,ffd): 56 | # have to use a client pointed to the table that you want to query 57 | ts = ffd['table'].split('.') 58 | d1 = client.dataset(ts[1]) 59 | t1 = d1.table(ts[2]) 60 | t1.reload() 61 | # then t1 contains a list of schema fields 62 | print(t1.schema[0].description) 63 | print(t1.schema[0].name) 64 | print(t1.schema[0].field_type) 65 | print(t1.schema[0].mode) 66 | # will have to check if any of the fields are records 67 | # or structs or arrays. 68 | 69 | 70 | # check that dictionary names are 71 | # in the allowed set. 72 | def checkQuery(client, ffd): 73 | # make sure the query contains only allowed keys in KO. 74 | ks = list(ffd.keys()) 75 | if any([x not in ko for x in ks]): 76 | print("Removing items from the filter file:") 77 | print([x for x in ks if x not in ko]) 78 | filtered_dict = {key: value for key, value in ffd.items() if key in ko} 79 | filtered_dict = checkSchemas(client, filtered_dict) 80 | return(filtered_dict) 81 | 82 | 83 | def keyOrder(ffdict): 84 | ks = list(ffdict.keys()) 85 | kd = [x for x in ko if x in ks] 86 | return(kd) 87 | 88 | 89 | def readFilterFile(filepath): 90 | # build a dictionary of query terms 91 | fin = open(filepath, 'r') 92 | ffdict = {} 93 | for line in fin: 94 | strings = line.strip().split(':') 95 | k, v = [s.strip() for s in strings] 96 | if k not in ffdict: 97 | ffdict[k] = v 98 | else: 99 | ffdict[k] = ffdict[k] + " AND " + v 100 | fin.close() 101 | return(ffdict) 102 | 103 | 104 | def buildQuery(client, filename): 105 | ffd = readFilterFile(filename) 106 | ffd = checkQuery(client, ffd) 107 | query = "SELECT \n" 108 | for key in keyOrder(ffd): # queries need to have a particular order 109 | if key in ['idvar', 'valvar']: 110 | query += ffd[key] + ",\n" 111 | elif key == 'table': 112 | query += "FROM `" + ffd[key] + "`\n WHERE \n" 113 | elif key == 'limit': 114 | query += "LIMIT " + ffd[key] + " \n" 115 | else: 116 | query += ffd[key] + " \n" 117 | return(query) 118 | 119 | 120 | def bq(args): 121 | client = bigquery.Client(project=args.proj) 122 | queryString = buildQuery(client, args.ff1) 123 | print("*****************************************") 124 | print(queryString) 125 | print("*****************************************") 126 | #query_results = client.run_sync_query(queryString) 127 | #query_results.use_legacy_sql = False 128 | #query_results.run() 129 | #print(query_results.total_rows) 130 | #for qi in query_results.rows: 131 | # print(qi) 132 | print("done") 133 | 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser(description="BigQuery PairWise") 137 | parser.add_argument("prj", help="google project ID") 138 | 
parser.add_argument("ff1", help="filter file") 139 | args = parser.parse_args() 140 | bq(args) 141 | -------------------------------------------------------------------------------- /python/pairwise/archive/bq_filter_file_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | ****************************************************** 17 | Using python to generate bigqueries. 18 | 19 | Here we use the 'filter file' to create subusets of data 20 | to download. 21 | ****************************************************** 22 | 23 | First need to install the BigQuery API 24 | pip3 install --upgrade google-cloud-bigquery 25 | 26 | The first time I ran the installer, there was an error. But just running pip3 27 | again seemed to work. 28 | 29 | Also we need to get authenticated. At the command line we: 30 | gcloud auth application-default login 31 | 32 | table:isb-cgc.tcga_201607_beta.DNA_Methylation_chr11 33 | tablevar:Probe_Id 34 | tablevar:ParticipantBarcode 35 | tablevar:Beta_Value 36 | tablekey:Probe_Id 37 | annot:isb-cgc.platform_reference.methylation_annotation 38 | annotvar:IlmnID 39 | annotvar:UCSC.RefGene_Name 40 | annotkey:IlmnID 41 | filter:SampleTypeLetterCode='TP' 42 | filter:Study='BRCA' 43 | filter:UCSC.RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 44 | limit:100 45 | 46 | 47 | 48 | ''' 49 | 50 | from google.cloud import bigquery 51 | import argparse 52 | import sys 53 | 54 | # main key order 55 | mko = ['tablevar', 'table' ] 56 | # annotation key order 57 | ako = [ 'annotvar','annot','recordflatten'] 58 | # join key order 59 | jko = ['bothvars', 'joinkey', 'filter', 'limit'] 60 | 61 | 62 | # Some queries must be annoated before running pairwise 63 | ## to this point, some annotation fields are nested 64 | ## so we need to check the schema first. 65 | def checkSchemas(client,ffd): 66 | # have to use a client pointed to the table that you want to query 67 | ks = list(ffd.keys()) 68 | for x in ['table', 'annot']: 69 | if x in ks: 70 | ts = ffd[x].split('.') 71 | d1 = client.dataset(ts[1]) 72 | t1 = d1.table(ts[2]) 73 | t1.reload() 74 | # then t1 contains a list of schema fields 75 | for i in range(0,len(t1.schema)): 76 | if t1.schema[i].field_type == 'RECORD': 77 | ffd['recordflatten'] = t1.schema[i].name 78 | for y in ks: 79 | # then we need to edit that entry and remove the prefix. 80 | if t1.schema[i].name in ffd[y] and (y not in ['filter','pivot']): 81 | searchString = t1.schema[i].name + '.' 82 | z = str(ffd[y]) 83 | print("search string: " + searchString) 84 | print("type: " + str(type(z))) 85 | print("remove prefix for " + z) 86 | z = z.replace(searchString, '') 87 | print(z) 88 | ffd[y] = z 89 | return(ffd) 90 | 91 | 92 | # check that dictionary names are 93 | # in the allowed set. 
94 | def checkFilterFile(client, ffd): 95 | # check schemas for records 96 | ffd = checkSchemas(client, ffd) 97 | return(ffd) 98 | 99 | 100 | def keyOrder(ffdict, mode): 101 | ks = list(ffdict.keys()) 102 | if mode == 'maintable': 103 | kd = [x for x in mko if x in mko] 104 | elif mode == 'annottable': 105 | kd = [x for x in ako if x in ako] 106 | elif mode == 'jointable': 107 | kd = [x for x in jko if x in jko] 108 | else: 109 | kd = [] 110 | return(kd) 111 | 112 | 113 | def readFilterFile(filepath): 114 | # build a dictionary of query terms 115 | # the filter entries are concatenated 116 | fin = open(filepath, 'r') 117 | ffdict = {} 118 | for line in fin: 119 | strings = line.strip().split(':') 120 | k, v = [s.strip() for s in strings] 121 | if k not in ffdict: 122 | ffdict[k] = v 123 | elif k in ffdict and k in ['idvar', 'valvar', 'annotvar', 'tablevar']: 124 | ffdict[k] = ffdict[k] + ",\n" + v 125 | else: 126 | ffdict[k] = ffdict[k] + " AND " + v 127 | fin.close() 128 | return(ffdict) 129 | 130 | 131 | def buildQuery(client, ffd, mode): 132 | query = "SELECT \n" 133 | thisKeyOrder = keyOrder(ffd, mode) 134 | for key in thisKeyOrder: # queries need to have a particular order 135 | if key in ['idvar', 'valvar', 'annotvar', 'tablevar']: 136 | query += ffd[key] + "\n" 137 | elif key == 'bothvars': 138 | query += ffd['tablevar'] + ',\n' + ffd['annotvar'] +'\n' 139 | elif key == 'joinkey': 140 | query += ' FROM T1 JOIN A1 ON T1.' + ffd['tablekey'] + '= A1.' +ffd['annotkey'] +'\n' 141 | elif key == 'filter': 142 | query += "WHERE \n" + ffd[key] +'\n' 143 | elif (key == 'table' or key == 'annot') and 'filter' not in thisKeyOrder: 144 | query += "FROM `" + ffd[key] + "`\n" 145 | elif key == 'limit': 146 | query += "LIMIT " + ffd[key] + " \n" 147 | elif key == 'recordflatten': 148 | query += ", UNNEST(" + ffd[key] +")\n" 149 | else: 150 | query += ffd[key] + " \n" 151 | return(query) 152 | 153 | 154 | def buildAnnotQuery(q1,q2,q3): 155 | x = ( 156 | "WITH\n" + 157 | "T1 AS (\n" + 158 | q1 + 159 | "),\n" + 160 | "A1 AS (\n" + 161 | q2 + 162 | ") \n" + 163 | q3 164 | ) 165 | return(x) 166 | 167 | 168 | def buildFilterQuery(args): 169 | client = bigquery.Client(project=args.prj) 170 | ffdict = readFilterFile(args.ff1) 171 | ffdict = checkFilterFile(client, ffdict) 172 | q1 = buildQuery(client, ffdict, "maintable") 173 | if 'annot' in ffdict.keys(): 174 | # prepare the annotation table, and perform a join 175 | q2 = buildQuery(client, ffdict, "annottable") 176 | q3 = buildQuery(client, ffdict, "jointable") 177 | queryString = buildAnnotQuery(q1,q2,q3) 178 | else: 179 | # just query the main table with filters. 
180 | q2 = '' # no annotation 181 | q3 = '' # no joins 182 | queryString = q1 183 | print("*****************************************") 184 | print(queryString) 185 | print("*****************************************") 186 | #query_results = client.run_sync_query(queryString) 187 | #query_results.use_legacy_sql = False 188 | #query_results.run() 189 | #print(query_results.total_rows) 190 | #for qi in query_results.rows: 191 | # print(qi) 192 | print("done") 193 | 194 | 195 | if __name__ == "__main__": 196 | parser = argparse.ArgumentParser(description="BigQuery PairWise") 197 | parser.add_argument("prj", help="google project ID") 198 | parser.add_argument("ff1", help="filter file") 199 | args = parser.parse_args() 200 | buildFilterQuery(args) 201 | -------------------------------------------------------------------------------- /python/pairwise/archive/filter_file_test_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16 2 | tablevar:probe_id 3 | tablevar:project_short_name 4 | tablevar:case_barcode 5 | tablevar:beta_value 6 | tablekey:Probe_Id 7 | annot:isb-cgc.platform_reference.methylation_annotation 8 | annotvar:IlmnID 9 | annotvar:UCSC.RefGene_Name 10 | annotkey:IlmnID 11 | filter:project_short_name='TCGA-BRCA' 12 | filter:RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 13 | limit:100 14 | -------------------------------------------------------------------------------- /python/pairwise/archive/ref_query.txt: -------------------------------------------------------------------------------- 1 | This filter file should produce the following query: 2 | 3 | table:isb-cgc.tcga_201607_beta.DNA_Methylation_chr11 4 | tablevar:Probe_Id 5 | tablevar:ParticipantBarcode 6 | tablevar:Beta_Value 7 | tablekey:Probe_Id 8 | annot:isb-cgc.platform_reference.methylation_annotation 9 | annotvar:IlmnID 10 | annotvar:UCSC.RefGene_Name 11 | annotkey:IlmnID 12 | pivot:UCSC.RefGene_Name 13 | filter:SampleTypeLetterCode='TP' 14 | filter:Study='BRCA' 15 | filter:UCSC.RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 16 | limit:100 17 | 18 | 19 | WITH 20 | 21 | # FIRST we are getting the methylation data, 22 | # NO filters here. 23 | 24 | T1 AS ( 25 | SELECT 26 | ParticipantBarcode, 27 | Probe_Id, 28 | Beta_Value 29 | FROM 30 | `isb-cgc.tcga_201607_beta.DNA_Methylation_chr11` 31 | ), 32 | 33 | # THEN if there's an annotation table requested 34 | #&& check if there's a record present... like UCSC 35 | # --> always has a dot? #&& 36 | A1 AS ( 37 | SELECT 38 | IlmnID, 39 | RefGene_Name 40 | FROM 41 | `isb-cgc.platform_reference.methylation_annotation`, 42 | UNNEST(UCSC)) 43 | 44 | # LAST, 45 | && IF NEEDED, we join in the annotation 46 | 47 | SELECT 48 | IlmnID, 49 | RefGene_Name ParticipantBarcode, 50 | Probe_Id, 51 | Beta_Value 52 | FROM 53 | T1 54 | JOIN 55 | A1 56 | ON 57 | T1.Probe_Id = A1.IlmnID 58 | WHERE 59 | SampleTypeLetterCode='TP' 60 | AND Study='BRCA' 61 | AND RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 62 | LIMIT 63 | 100 64 | 65 | 66 | 1.) get which variables are records. 67 | 2.) 68 | -------------------------------------------------------------------------------- /python/pairwise/bqpairwise.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | ****************************************************** 17 | Using python to generate bigqueries. 18 | 19 | Here we use the 'filter file' to create SQL 20 | ****************************************************** 21 | 22 | First need to install the BigQuery API 23 | pip3 install --upgrade google-cloud-bigquery 24 | 25 | The first time I ran the installer, there was an error. But just running pip3 26 | again seemed to work. 27 | 28 | Also we need to get authenticated. At the command line we: 29 | gcloud auth application-default login 30 | 31 | 32 | table:isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16 33 | tablekey:probe_id 34 | tablevar:project_short_name 35 | joinkey:case_barcode 36 | valuevar:beta_value 37 | annot:isb-cgc.platform_reference.methylation_annotation 38 | annotkey:IlmnID 39 | groupby:UCSC.RefGene_Name 40 | filter:project_short_name='TCGA-BRCA' 41 | filter:RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 42 | limit:100 43 | 44 | 45 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 46 | tablejoin:case_barcode 47 | valuevar:normalized_count 48 | filter:project_short_name='TCGA-BRCA' 49 | filter:HGNC_gene_symbol IN ('ACSM5','NAP1L4','SULF2') 50 | limit:100 51 | 52 | 53 | 54 | python3 bqpairwise.py isb-cgc filter_file_1.txt filter_file_2.txt 55 | ''' 56 | 57 | import filter_and_annot as fa 58 | import pairwise_fun as pf 59 | from google.cloud import bigquery 60 | import argparse 61 | import sys 62 | 63 | 64 | def mainJoin(ffd1, ffd2): 65 | # joins the two filter queries 66 | q = 'mainjoin AS ( \nSELECT ' 67 | q += ffd1['valuevar2'] + ',\n' 68 | q += ffd2['valuevar2'] + ',\n' 69 | q += ffd1['groupby2'] + ',\n' 70 | q += ffd2['groupby2'] + ' \n' # both need a groupby # 71 | q += 'FROM' + '\n' 72 | q += 'J1 JOIN J2 ON \n' 73 | q += 'J1.'+ffd1['joinkey'] + ' = ' + 'J2.' + ffd2['joinkey'] + ' AND \n' 74 | q += 'J1.'+ffd1['groupby2'] + ' < ' + 'J2.' 
+ ffd2['groupby2'] + '\n),\n' # will be another two tables 75 | return(q) 76 | 77 | 78 | def mainFun(args): 79 | # constructs each query, then joins the two queries, 80 | q1,ffd1 = fa.buildFilterQuery(args, "1") 81 | q2,ffd2 = fa.buildFilterQuery(args, "2") 82 | q3 = 'WITH\n' + q1 + ',\n' + q2 + ',\n' + mainJoin(ffd1,ffd2) 83 | q4 = pf.selectTest(q3, ffd1, ffd2) 84 | print(q4) 85 | client = bigquery.Client(project=args.prj) 86 | query_results = client.run_sync_query(q4) 87 | query_results.use_legacy_sql = False 88 | query_results.run() 89 | print(query_results.total_rows) 90 | print(query_results.rows[0]) 91 | print(query_results.rows[1]) 92 | print(query_results.rows[2]) 93 | #for qi in query_results.rows: 94 | # print(qi) 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser(description="BigQuery PairWise") 99 | parser.add_argument("prj", help="google project ID") 100 | parser.add_argument("ff1", help="filter file") 101 | parser.add_argument("ff2", help="filter file") 102 | args = parser.parse_args() 103 | mainFun(args) 104 | -------------------------------------------------------------------------------- /python/pairwise/filter_and_annot.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | ****************************************************** 17 | Using python to generate bigqueries. 18 | 19 | Here we use the 'filter file' to create SQL 20 | ****************************************************** 21 | 22 | First need to install the BigQuery API 23 | pip3 install --upgrade google-cloud-bigquery 24 | 25 | The first time I ran the installer, there was an error. But just running pip3 26 | again seemed to work. 27 | 28 | Also we need to get authenticated. At the command line we: 29 | gcloud auth application-default login 30 | 31 | table:isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16 32 | tablekey:probe_id 33 | tablevar:project_short_name 34 | joinkey:case_barcode 35 | valuevar:beta_value 36 | annot:isb-cgc.platform_reference.methylation_annotation 37 | annotkey:IlmnID 38 | groupby:UCSC.RefGene_Name 39 | filter:project_short_name='TCGA-BRCA' 40 | filter:RefGene_Name IN ('ACSM5','NAP1L4','SULF2') 41 | limit:100 42 | 43 | ''' 44 | 45 | from google.cloud import bigquery 46 | import argparse 47 | import sys 48 | 49 | # main key order 50 | mko = ['tablevar', 'table'] 51 | # annotation key order 52 | ako = [ 'annotvar', 'annot','recordflatten'] 53 | # join key order 54 | jko = ['bothvar', 'joinkey', 'filter', 'limit'] 55 | # join-with-no-annot key order 56 | nko = ['renamevar', 't1key', 'filter', 'limit'] 57 | 58 | 59 | ## Some queries must be annotated before running pairwise 60 | ## to this point, some annotation fields are nested 61 | ## so we need to check the schema first. 
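## Concretely: if the annotation table has a RECORD-type column (e.g. the UCSC
## record in isb-cgc.platform_reference.methylation_annotation), its name is
## stored in ffd['recordflatten'] so the generated SQL can append a
## ", UNNEST(...)" clause, and the record prefix is stripped from entries such
## as 'UCSC.RefGene_Name' (filter entries are left as written).  The BigQuery
## type of the 'valuevar' column is also recorded in ffd['valuetype'], which
## selectTest() in pairwise_fun.py later uses to choose the statistical test.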
62 | def checkSchemas(client,ffd): 63 | # have to use a client pointed to the table that you want to query 64 | ks = list(ffd.keys()) 65 | for x in ['table', 'annot']: 66 | if x in ks: 67 | ts = ffd[x].split('.') # get the table (or annot table) name 68 | d1 = client.dataset(ts[1]) # get the dataset 69 | t1 = d1.table(ts[2]) 70 | t1.reload() # get the schema in t1 71 | for i in range(0,len(t1.schema)): # for each item in the schema 72 | if t1.schema[i].name == ffd['valuevar']: 73 | ffd['valuetype'] = t1.schema[i].field_type 74 | if t1.schema[i].field_type == 'RECORD': # if it's a record then we need extra attention 75 | ffd['recordflatten'] = t1.schema[i].name # keep track of which one it is. 76 | for y in ks: 77 | # then we need to edit that entry and remove the prefix. 78 | if t1.schema[i].name in ffd[y] and (y not in ['filter']): 79 | searchString = t1.schema[i].name + '.' 80 | z = str(ffd[y]) 81 | z = z.replace(searchString, '') 82 | ffd[y] = z 83 | return(ffd) 84 | 85 | 86 | def addItem(ffdict, mode, ki, qid): 87 | if mode == 'tablevar': 88 | if 'tablevar' not in ffdict.keys(): 89 | ffdict['tablevar'] = ffdict[ki] 90 | else: 91 | ffdict['tablevar'] = ffdict['tablevar'] + ",\n" + ffdict[ki] 92 | 93 | if mode == 'tablevar2': 94 | if 'tablevar2' not in ffdict.keys(): 95 | ffdict['tablevar2'] = ffdict[ki] + " AS " + ffdict[ki] + "_J" + qid 96 | ffdict['tablevar_rename'] = ffdict[ki] + "_J" + qid 97 | else: 98 | ffdict['tablevar2'] = ffdict['tablevar2'] + ",\n" + ffdict[ki] + " AS " + ffdict[ki] + "_J" + qid 99 | ffdict['tablevar_rename'] = ffdict['tablevar_rename'] + ",\n" + ffdict[ki] + "_J" + qid 100 | 101 | if mode == 'annotvar': 102 | if 'annotvar' not in ffdict.keys(): 103 | ffdict['annotvar'] = ffdict[ki] 104 | else: 105 | ffdict['annotvar'] = ffdict['annotvar'] + ",\n" + ffdict[ki] 106 | 107 | if mode == 'annotvar2': 108 | if 'annotvar2' not in ffdict.keys(): 109 | ffdict['annotvar2'] = ffdict[ki] + " AS " + ffdict[ki] + "_J" + qid 110 | ffdict['annotvar_rename'] = ffdict[ki] + "_J" + qid 111 | else: 112 | ffdict['annotvar2'] = ffdict['annotvar2'] + ",\n" + ffdict[ki] + " AS " + ffdict[ki] + "_J" + qid 113 | ffdict['annotvar_rename'] = ffdict['annotvar_rename'] + ",\n" + ffdict[ki] + "_J" + qid 114 | 115 | if mode == 'groupby': 116 | if 'groupby' not in ffdict.keys(): 117 | ffdict['groupby'] = ffdict[ki] 118 | else: 119 | ffdict['groupby'] = ffdict['groupby'] + ",\n" + ffdict[ki] 120 | 121 | if mode == 'groupby2': 122 | if 'groupby2' not in ffdict.keys(): 123 | ffdict['groupby2'] = ffdict[ki] + "_J" + qid 124 | else: 125 | ffdict['groupby2'] = ffdict['groupby2'] + ",\n" + ffdict[ki] + "_J" + qid 126 | 127 | return(ffdict) 128 | 129 | 130 | def updateFFdict(ffdict, qid): 131 | ks = list(ffdict.keys()) 132 | for ki in ks: 133 | #if ki in ['tablekey','tablejoin','tablegroup', 'valuevar']: 134 | if ki in ['tablekey', 'tablejoin', 'valuevar']: 135 | ffdict = addItem(ffdict, 'tablevar', ki, qid) # if it's a tablegroup... 
treat like filter item 136 | ffdict = addItem(ffdict, 'tablevar2', ki, qid) 137 | #if ki in ['annotkey', 'annotjoin', 'annotgroup']: 138 | if ki in ['annotkey', 'annotjoin']: 139 | ffdict = addItem(ffdict, 'annotvar', ki, qid) 140 | ffdict = addItem(ffdict, 'annotvar2', ki, qid) 141 | if ki in ['annotgroup','tablegroup']: 142 | ffdict = addItem(ffdict, 'groupby', ki, qid) 143 | ffdict = addItem(ffdict, 'groupby2', ki, qid) 144 | if ki in ['annotjoin', 'tablejoin']: 145 | ffdict['joinkey'] = ffdict[ki] 146 | ffdict['joinkey'] = ffdict[ki] + "_J" + qid 147 | if ki == 'valuevar': 148 | ffdict['valuevar2'] = ffdict['valuevar'] + "_J" + qid 149 | return(ffdict) 150 | 151 | 152 | 153 | # check that dictionary names are 154 | # in the allowed set. 155 | def checkFilterFile(client, ffd, qid): 156 | # check schemas for records 157 | ffd = updateFFdict(ffd, qid) 158 | ffd = checkSchemas(client, ffd) 159 | return(ffd) 160 | 161 | 162 | def keyOrder(ffdict, mode): 163 | ks = list(ffdict.keys()) 164 | if mode == 'maintable': 165 | kd = [x for x in mko if x in ks] 166 | elif mode == 'annottable': 167 | kd = [x for x in ako if x in ks] 168 | elif mode == 'jointable': 169 | kd = [x for x in jko if x in ks] 170 | elif mode == 'noannotjoin': 171 | kd = [x for x in nko if x in ks] 172 | else: 173 | kd = [] 174 | return(kd) 175 | 176 | 177 | def readFilterFile(filepath): 178 | # build a dictionary of query terms 179 | # the filter entries are concatenated 180 | fin = open(filepath, 'r') 181 | ffdict = {} 182 | for line in fin: 183 | strings = line.strip().split(':') 184 | k, v = [s.strip() for s in strings] 185 | if k not in ffdict: 186 | ffdict[k] = v 187 | elif k in ffdict and k in ['idvar', 'valuevar', 'annotvar', 'tablevar', 'tablegroup']: 188 | ffdict[k] = ffdict[k] + ",\n" + v 189 | else: 190 | ffdict[k] = ffdict[k] + " AND " + v 191 | fin.close() 192 | return(ffdict) 193 | 194 | 195 | def buildQuery(client, ffd, mode, qid): 196 | query = "SELECT \n" 197 | thisKeyOrder = keyOrder(ffd, mode) 198 | for key in thisKeyOrder: # queries need to have a particular order as specified in above lists 199 | if key in ['idvar', 'valuevar', 'annotvar', 'tablevar']: 200 | query += ffd[key] + "\n" 201 | elif key == 'renamevar': 202 | query += ffd['tablevar2'] 203 | elif key == 'bothvar': 204 | query += ffd['tablevar2'] + ",\n" + ffd['annotvar2'] +'\n' 205 | elif key == 'joinkey': 206 | query += ' FROM T'+ qid +' JOIN A'+ qid +' ON T'+ qid +'.' + ffd['tablekey'] + '= A'+ qid +'.' 
+ffd['annotkey'] +'\n' 207 | elif key == 't1key': 208 | query += ' FROM T'+ qid +'\n' 209 | elif key == 'filter': 210 | query += "WHERE \n" + ffd[key] +'\n' 211 | elif (key == 'table' or key == 'annot') and 'filter' not in thisKeyOrder: 212 | query += "FROM `" + ffd[key] + "`\n" 213 | elif key == 'limit': 214 | query += "LIMIT " + ffd[key] + " \n" 215 | elif key == 'recordflatten': 216 | query += ", UNNEST(" + ffd[key] +")\n" 217 | else: 218 | query += ffd[key] + " \n" 219 | return(query) 220 | 221 | 222 | def buildAnnotQuery(q1,q2,q3,qid): 223 | x = ( 224 | "T"+qid+" AS (\n" + 225 | q1 + 226 | "),\n" + 227 | 228 | "A"+qid+" AS (\n" + 229 | q2 + 230 | "), \n" + 231 | 232 | "J"+qid+" AS (\n" + 233 | q3 + 234 | ") \n" 235 | ) 236 | return(x) 237 | 238 | 239 | def buildNoAnnotQuery(q1,q3,qid): 240 | x = ( 241 | "T"+qid+" AS (\n" + 242 | q1 + 243 | "),\n" + 244 | 245 | "J"+qid+" AS (\n" + 246 | q3 + 247 | ") \n" 248 | ) 249 | return(x) 250 | 251 | 252 | def buildFilterQuery(args, qid): 253 | if qid == "1": 254 | ffdict = readFilterFile(args.ff1) 255 | else: 256 | ffdict = readFilterFile(args.ff2) 257 | thisproject = (ffdict['table'].split('.'))[0] 258 | client = bigquery.Client(project=thisproject) 259 | ffdict = checkFilterFile(client, ffdict, qid) 260 | q1 = buildQuery(client, ffdict, "maintable", qid) 261 | if 'annot' in ffdict.keys(): 262 | # prepare the annotation table, and perform a join 263 | q2 = buildQuery(client, ffdict, "annottable", qid) 264 | ffdict['bothvar'] = True # we are going to use both sets of vars (table and annot) 265 | ffdict['renamevar'] = True 266 | q3 = buildQuery(client, ffdict, "jointable", qid) 267 | queryString = buildAnnotQuery(q1,q2,q3,qid) 268 | else: 269 | # just query the main table with filters. 270 | q2 = '' # no annotation 271 | ffdict['t1key'] = True # not using annotvars 272 | ffdict['renamevar'] = True 273 | q3 = buildQuery(client, ffdict, "noannotjoin", qid) 274 | queryString = buildNoAnnotQuery(q1,q3,qid) 275 | return(queryString, ffdict) 276 | 277 | -------------------------------------------------------------------------------- /python/pairwise/pairwise_fun.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 2015, Institute for Systems Biology 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | ''' 16 | 17 | #here we're going to keep the actual pairwise functions. 18 | 19 | def selectTest(q3, ffd1, ffd2): 20 | type1 = ffd1['valuetype'] 21 | type2 = ffd2['valuetype'] 22 | if type1 == 'FLOAT' and type2 == 'FLOAT': 23 | q4 = spearmans(q3, ffd1, ffd2) 24 | else: 25 | q4 = 'ERROR: you have failed at pairwise.' 
26 | return(q4) 27 | 28 | 29 | # spearman's correlation 30 | # for float vs float 31 | 32 | def spearmans(q3, ffd1, ffd2): 33 | # first rank the data 34 | thisq = 'ranktable AS (\nSELECT \n ' 35 | thisq += ffd1['groupby2'] + ',\n' 36 | thisq += ffd2['groupby2'] + ',\n' 37 | thisq += ' DENSE_RANK() OVER (PARTITION BY ' + ffd1['groupby2'] + ' ORDER BY ' + ffd1['valuevar2'] + ' ASC) as rankvar1, \n' 38 | thisq += ' DENSE_RANK() OVER (PARTITION BY ' + ffd2['groupby2'] + ' ORDER BY ' + ffd2['valuevar2'] + ' ASC) as rankvar2 \n' 39 | thisq += 'FROM\nmainjoin \n' 40 | thisq += ')\n' 41 | # then correlate the ranks 42 | thisq += 'SELECT \n' 43 | thisq += ffd1['groupby2'] + ',\n' 44 | thisq += ffd2['groupby2'] + ',\n' 45 | thisq += ' CORR( rankvar1, rankvar2 ) as Spearmans \n' 46 | thisq += 'FROM\n ' 47 | thisq += ' ranktable \n' 48 | thisq += 'GROUP BY \n' 49 | thisq += ffd1['groupby2'] + ',\n' 50 | thisq += ffd2['groupby2'] + '\n' 51 | thisq += 'ORDER BY \n Spearmans DESC \n' 52 | return(q3 + thisq) 53 | 54 | 55 | # t-test 56 | # for float vs binary 57 | 58 | 59 | # anova 60 | # for float vs. more than one 61 | 62 | 63 | # Chi-sq or fisher's ? 64 | # for categorical vs categorical 65 | 66 | -------------------------------------------------------------------------------- /python/pairwise/tests/test1/filter_file_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:case_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | filter:project_short_name='TCGA-BRCA' 7 | filter:HGNC_gene_symbol='LARP1' 8 | -------------------------------------------------------------------------------- /python/pairwise/tests/test1/filter_file_2.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16 2 | tablekey:probe_id 3 | tablevar:project_short_name 4 | tablejoin:case_barcode 5 | valuevar:beta_value 6 | annot:isb-cgc.platform_reference.methylation_annotation 7 | annotkey:IlmnID 8 | annotgroup:UCSC.RefGene_Name 9 | filter:project_short_name='TCGA-BRCA' 10 | filter:RefGene_Name = 'GSG1L' 11 | -------------------------------------------------------------------------------- /python/pairwise/tests/test1/filter_file_test_query.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | T1 AS ( 3 | SELECT 4 | project_short_name, 5 | normalized_count, 6 | case_barcode, 7 | HGNC_gene_symbol 8 | FROM `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` 9 | ), 10 | J1 AS ( 11 | SELECT 12 | normalized_count AS normalized_count_J1, 13 | case_barcode AS case_barcode_J1, 14 | HGNC_gene_symbol AS HGNC_gene_symbol_J1 FROM T1 15 | WHERE 16 | project_short_name='TCGA-BRCA' AND HGNC_gene_symbol='LARP1' 17 | ) 18 | , 19 | T2 AS ( 20 | SELECT 21 | project_short_name, 22 | beta_value, 23 | case_barcode, 24 | probe_id 25 | FROM `isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16` 26 | ), 27 | A2 AS ( 28 | SELECT 29 | IlmnID, 30 | RefGene_Name 31 | FROM `isb-cgc.platform_reference.methylation_annotation` 32 | , UNNEST(UCSC) 33 | ), 34 | J2 AS ( 35 | SELECT 36 | beta_value AS beta_value_J2, 37 | case_barcode AS case_barcode_J2, 38 | probe_id AS probe_id_J2, 39 | IlmnID AS IlmnID_J2, 40 | RefGene_Name AS RefGene_Name_J2 41 | FROM T2 JOIN A2 ON T2.probe_id= A2.IlmnID 42 | WHERE 43 | project_short_name='TCGA-BRCA' AND RefGene_Name = 'GSG1L' 44 | ) 45 | , 
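-- Note: mainjoin pairs the two filtered subqueries (J1, J2) on the shared join key
-- (case_barcode in this test); the '>' comparison on the grouping columns keeps only
-- one ordering of each (group1, group2) pair, which appears intended to avoid duplicate
-- and self pairings when both sides draw from the same column (as in test2, where both
-- grouping columns are HGNC_gene_symbol).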
46 | mainjoin AS ( 47 | SELECT normalized_count_J1, 48 | beta_value_J2, 49 | HGNC_gene_symbol_J1, 50 | RefGene_Name_J2 51 | FROM 52 | J1 JOIN J2 ON 53 | J1.case_barcode_J1 = J2.case_barcode_J2 AND 54 | J1.HGNC_gene_symbol_J1 > J2.RefGene_Name_J2 55 | ), 56 | ranktable AS ( 57 | SELECT 58 | HGNC_gene_symbol_J1, 59 | RefGene_Name_J2, 60 | DENSE_RANK() OVER (PARTITION BY HGNC_gene_symbol_J1 ORDER BY normalized_count_J1 ASC) as rankvar1, 61 | DENSE_RANK() OVER (PARTITION BY RefGene_Name_J2 ORDER BY beta_value_J2 ASC) as rankvar2 62 | FROM 63 | mainjoin 64 | ) 65 | SELECT 66 | HGNC_gene_symbol_J1, 67 | RefGene_Name_J2, 68 | CORR( rankvar1, rankvar2 ) as Spearmans 69 | FROM 70 | ranktable 71 | GROUP BY 72 | HGNC_gene_symbol_J1, 73 | RefGene_Name_J2 74 | ORDER BY 75 | Spearmans DESC 76 | -------------------------------------------------------------------------------- /python/pairwise/tests/test2/filter_file_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:sample_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | filter:project_short_name='TCGA-BRCA' 7 | filter:HGNC_gene_symbol > 'GZ' 8 | filter:HGNC_gene_symbol < 'HB' 9 | -------------------------------------------------------------------------------- /python/pairwise/tests/test2/filter_file_2.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:sample_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | filter:project_short_name='TCGA-BRCA' 7 | filter:HGNC_gene_symbol > 'AZ' 8 | filter:HGNC_gene_symbol < 'BC' 9 | -------------------------------------------------------------------------------- /python/pairwise/tests/test2/filter_file_test_query.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | T1 AS ( 3 | SELECT 4 | project_short_name, 5 | HGNC_gene_symbol, 6 | normalized_count, 7 | sample_barcode 8 | FROM `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` 9 | ), 10 | J1 AS ( 11 | SELECT 12 | HGNC_gene_symbol AS HGNC_gene_symbol_J1, 13 | normalized_count AS normalized_count_J1, 14 | sample_barcode AS sample_barcode_J1 FROM T1 15 | WHERE 16 | project_short_name='TCGA-BRCA' AND HGNC_gene_symbol > 'GZ' AND HGNC_gene_symbol < 'HZ' 17 | ) 18 | , 19 | T2 AS ( 20 | SELECT 21 | project_short_name, 22 | HGNC_gene_symbol, 23 | normalized_count, 24 | sample_barcode 25 | FROM `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` 26 | ), 27 | J2 AS ( 28 | SELECT 29 | HGNC_gene_symbol AS HGNC_gene_symbol_J2, 30 | normalized_count AS normalized_count_J2, 31 | sample_barcode AS sample_barcode_J2 FROM T2 32 | WHERE 33 | project_short_name='TCGA-BRCA' AND HGNC_gene_symbol > 'AZ' AND HGNC_gene_symbol < 'BZ' 34 | ) 35 | , 36 | mainjoin AS ( 37 | SELECT normalized_count_J1, 38 | normalized_count_J2, 39 | HGNC_gene_symbol_J1, 40 | HGNC_gene_symbol_J2 41 | FROM 42 | J1 JOIN J2 ON 43 | J1.sample_barcode_J1 = J2.sample_barcode_J2 AND 44 | J1.HGNC_gene_symbol_J1 > J2.HGNC_gene_symbol_J2 45 | ), 46 | ranktable AS ( 47 | SELECT 48 | HGNC_gene_symbol_J1, 49 | HGNC_gene_symbol_J2, 50 | DENSE_RANK() OVER (PARTITION BY HGNC_gene_symbol_J1 ORDER BY normalized_count_J1 ASC) as rankvar1, 51 | DENSE_RANK() OVER (PARTITION BY HGNC_gene_symbol_J2 ORDER BY normalized_count_J2 ASC) as 
rankvar2 52 | FROM 53 | mainjoin 54 | ) 55 | SELECT 56 | HGNC_gene_symbol_J1, 57 | HGNC_gene_symbol_J2, 58 | CORR( rankvar1, rankvar2 ) as Spearmans 59 | FROM 60 | ranktable 61 | GROUP BY 62 | HGNC_gene_symbol_J1, 63 | HGNC_gene_symbol_J2 64 | ORDER BY 65 | Spearmans DESC 66 | 67 | -------------------------------------------------------------------------------- /python/pairwise/tests/test3/filter_file_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:sample_barcode 3 | valuevar:normalized_count 4 | tablegroup:project_short_name 5 | tablevar:HGNC_gene_symbol 6 | filter:HGNC_gene_symbol > 'GZ' 7 | filter:HGNC_gene_symbol < 'HB' 8 | -------------------------------------------------------------------------------- /python/pairwise/tests/test3/filter_file_2.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:sample_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | filter:project_short_name IN ('TCGA-BRCA', 'TCGA-PAAD', 'TCGA-GBM') 7 | filter:HGNC_gene_symbol > 'AY' 8 | filter:HGNC_gene_symbol < 'BCC' 9 | -------------------------------------------------------------------------------- /python/pairwise/tests/test4/filter_file_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.Protein_Expression 2 | tablejoin:sample_barcode 3 | tablevar:project_short_name 4 | valuevar:protein_expression 5 | tablegroup:gene_name 6 | filter:project_short_name='TCGA-BRCA' 7 | -------------------------------------------------------------------------------- /python/pairwise/tests/test4/filter_file_2.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:sample_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | filter:project_short_name='TCGA-BRCA' 7 | -------------------------------------------------------------------------------- /python/pairwise/tests/test4/filter_file_test_query.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | T1 AS ( 3 | SELECT 4 | project_short_name, 5 | normalized_count, 6 | case_barcode, 7 | HGNC_gene_symbol 8 | FROM `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` 9 | ), 10 | J1 AS ( 11 | SELECT 12 | normalized_count AS normalized_count_J1, 13 | case_barcode AS case_barcode_J1, 14 | HGNC_gene_symbol AS HGNC_gene_symbol_J1 FROM T1 15 | WHERE 16 | project_short_name='TCGA-BRCA' AND HGNC_gene_symbol='LARP1' 17 | ) 18 | , 19 | T2 AS ( 20 | SELECT 21 | project_short_name, 22 | beta_value, 23 | case_barcode, 24 | probe_id 25 | FROM `isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16` 26 | ), 27 | A2 AS ( 28 | SELECT 29 | IlmnID, 30 | RefGene_Name 31 | FROM `isb-cgc.platform_reference.methylation_annotation` 32 | , UNNEST(UCSC) 33 | ), 34 | J2 AS ( 35 | SELECT 36 | beta_value AS beta_value_J2, 37 | case_barcode AS case_barcode_J2, 38 | probe_id AS probe_id_J2, 39 | IlmnID AS IlmnID_J2, 40 | RefGene_Name AS RefGene_Name_J2 41 | FROM T2 JOIN A2 ON T2.probe_id= A2.IlmnID 42 | WHERE 43 | project_short_name='TCGA-BRCA' AND RefGene_Name = 'GSG1L' 44 | ) 45 | , 46 | mainjoin AS ( 47 | SELECT normalized_count_J1, 48 | 
beta_value_J2, 49 | HGNC_gene_symbol_J1, 50 | RefGene_Name_J2 51 | FROM 52 | J1 JOIN J2 ON 53 | J1.case_barcode_J1 = J2.case_barcode_J2 AND 54 | J1.HGNC_gene_symbol_J1 > J2.RefGene_Name_J2 55 | ), 56 | ranktable AS ( 57 | SELECT 58 | HGNC_gene_symbol_J1, 59 | RefGene_Name_J2, 60 | DENSE_RANK() OVER (PARTITION BY HGNC_gene_symbol_J1 ORDER BY normalized_count_J1 ASC) as rankvar1, 61 | DENSE_RANK() OVER (PARTITION BY RefGene_Name_J2 ORDER BY beta_value_J2 ASC) as rankvar2 62 | FROM 63 | mainjoin 64 | ) 65 | SELECT 66 | HGNC_gene_symbol_J1, 67 | RefGene_Name_J2, 68 | CORR( rankvar1, rankvar2 ) as Spearmans 69 | FROM 70 | ranktable 71 | GROUP BY 72 | HGNC_gene_symbol_J1, 73 | RefGene_Name_J2 74 | ORDER BY 75 | Spearmans DESC 76 | -------------------------------------------------------------------------------- /python/pairwise/tests/test5/filter_file_1.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM 2 | tablejoin:case_barcode 3 | tablevar:project_short_name 4 | valuevar:normalized_count 5 | tablegroup:HGNC_gene_symbol 6 | annot:isb-cgc.QotM.WikiPathways_20170425_Annotated 7 | annotkey:Symbol 8 | annotvar:pathway 9 | tablekey:HGNC_gene_symbol 10 | filter:project_short_name='TCGA-BRCA' 11 | filter:pathway='Apoptosis' 12 | -------------------------------------------------------------------------------- /python/pairwise/tests/test5/filter_file_2.txt: -------------------------------------------------------------------------------- 1 | table:isb-cgc.TCGA_hg19_data_v0.DNA_Methylation_chr16 2 | tablekey:probe_id 3 | tablevar:project_short_name 4 | tablejoin:case_barcode 5 | valuevar:beta_value 6 | annot:isb-cgc.platform_reference.methylation_annotation 7 | annotkey:IlmnID 8 | annotvar:RefGene_Name 9 | annotgroup:probe_id 10 | filter:project_short_name='TCGA-BRCA' 11 | filter:RefGene_Name IN (select Symbol FROM `isb-cgc.QotM.WikiPathways_20170425_Annotated` WHERE pathway = 'Apoptosis') 12 | -------------------------------------------------------------------------------- /python/tsv2json.py: -------------------------------------------------------------------------------- 1 | # This script generates a JSON schema for a given data file to 2 | # be used with the 'bq load' command line tool. 
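# In practice the script reads an existing BigQuery JSON schema (second argument) and
# converts a tab-delimited data file into newline-delimited JSON rows for 'bq load'.
# Usage: python tsv2json.py <input.tsv[.gz]> <schema.json> <output.json>
#   <input.tsv[.gz]>  tab-delimited data; the first row must be the column header
#   <schema.json>     BigQuery JSON schema (name / type / mode per field)
#   <output.json>     newline-delimited JSON written one object per input row
# Note: this script targets Python 2 (print statements, file()).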
3 | # ------------------------------------------------------------- 4 | 5 | import copy 6 | import gzip 7 | import sys 8 | ##import string 9 | 10 | # ------------------------------------------------------------- 11 | 12 | # INPUT: path to local data file 13 | # OUTPUT: JSON schema to stdout 14 | 15 | # BigQuery data types = ['string','bytes','integer','float','boolean','record','timestamp'] 16 | # BigQuery modes = ['nullable','required','repeated'] , default is nullable 17 | 18 | # ------------------------------------------------------------- 19 | 20 | def readJSONschemaFromFile ( jsonSchemaFilename ): 21 | 22 | schemaInfo = [] 23 | 24 | try: 25 | fh = file ( jsonSchemaFilename ) 26 | for aLine in fh: 27 | aLine = aLine.strip() 28 | if ( aLine.find ( '"name"' ) >= 0 ): 29 | aTokens = aLine.split('"') 30 | bTokens = [] 31 | for a in aTokens: 32 | a = a.strip() 33 | if ( a!='' and a!=',' and a!=':' ): bTokens += [ a ] 34 | 35 | ## ['{', 'name', 'gs_url', 'type', 'string', 'mode', 'nullable', 'description', '', '},'] 36 | if ( bTokens[1] == 'name' ): 37 | if ( bTokens[3] == 'type' ): 38 | if ( bTokens[5] == 'mode' ): 39 | schemaInfo += [ [ bTokens[2], bTokens[4], bTokens[6] ] ] 40 | 41 | except: 42 | pass 43 | 44 | print " in readJSONschemaFromFile ... ", jsonSchemaFilename 45 | for ii in range(len(schemaInfo)): 46 | print ii, schemaInfo[ii] 47 | 48 | return ( schemaInfo ) 49 | 50 | # ------------------------------------------------------------- 51 | 52 | def splitListString ( aString ): 53 | 54 | # print " in splitListString : <%s> " % aString 55 | 56 | aTokens = [] 57 | if ( aString.startswith("u'") ): 58 | ii = 2 59 | while ( ii < len(aString) ): 60 | jj = aString.find("'",ii) 61 | if ( jj > ii ): 62 | aTokens += [ aString[ii:jj] ] 63 | ii = jj 64 | ii = aString.find("'",jj+1) 65 | if ( ii < 0 ): ii = len(aString) 66 | 67 | else: 68 | aTokens = aString.split(',') 69 | 70 | return ( aTokens ) 71 | 72 | # -------------------------------------------------------------- 73 | 74 | def translateInputRow ( dataRow, schemaInfo ): 75 | 76 | # print " in translateInputRow ... " 77 | # print dataRow 78 | # print len(dataRow) 79 | 80 | ## start the output row with an open curly brace ... 81 | outRow = '{' 82 | 83 | ## now loop over the 'tokens' in the input 'dataRow' list ... 84 | for ii in range(len(dataRow)): 85 | 86 | # print ii, dataRow[ii], schemaInfo[ii] 87 | 88 | ## first handle NON repeated fields ... 89 | if ( schemaInfo[ii][2] != 'repeated' ): 90 | if ( schemaInfo[ii][1] == 'string' ): 91 | try: 92 | outRow += '"%s":"%s",' % ( schemaInfo[ii][0], dataRow[ii].strip() ) 93 | except: 94 | print " FAILED TO WRITE string ??? ", schemaInfo[ii][0], dataRow[ii].strip() 95 | sys.exit(-1) 96 | elif ( schemaInfo[ii][1] == 'integer' ): 97 | try: 98 | outRow += '"%s":%d,' % ( schemaInfo[ii][0], int(dataRow[ii].strip()) ) 99 | except: 100 | print " FAILED TO WRITE integer ??? ", schemaInfo[ii][0], dataRow[ii].strip() 101 | sys.exit(-1) 102 | elif ( schemaInfo[ii][1] == 'float' ): 103 | try: 104 | outRow += '"%s":%f,' % ( schemaInfo[ii][0], float(dataRow[ii].strip()) ) 105 | except: 106 | print " FAILED TO WRITE float ??? ", schemaInfo[ii][0], dataRow[ii].strip() 107 | sys.exit(-1) 108 | elif ( schemaInfo[ii][1] == 'boolean' ): 109 | print " BOOLEAN type TO BE IMPLEMENTED ... " 110 | sys.exit(-1) 111 | elif ( schemaInfo[ii][1] == 'bytes' ): 112 | print " BYTES type TO BE IMPLEMENTED ... " 113 | sys.exit(-1) 114 | elif ( schemaInfo[ii][1] == 'record' ): 115 | print " RECORD type TO BE IMPLEMENTED ... 
" 116 | sys.exit(-1) 117 | elif ( schemaInfo[ii][1] == 'timestamp' ): 118 | print " TIMESTAMP type TO BE IMPLEMENTED ... " 119 | sys.exit(-1) 120 | 121 | else: 122 | 123 | ## now we handle a REPEATED field ... 124 | 125 | ## print " handle a repeated field !!! " 126 | ## print schemaInfo[ii] 127 | ## print dataRow[ii] 128 | 129 | ## it might be empty ... 130 | if ( len(dataRow[ii]) == 0 ): 131 | outRow += '"%s":null,' % schemaInfo[ii][0] 132 | 133 | elif ( dataRow[ii][0] == '[' ): 134 | 135 | outRow += '"%s":[' % schemaInfo[ii][0] 136 | 137 | ## dTok = dataRow[ii][1:-1].split(',') 138 | hasSingleQ = 0 139 | hasDoubleQ = 0 140 | if ( dataRow[ii].find("'") > 0 ): hasSingleQ = 1 141 | if ( dataRow[ii].find('"') > 0 ): hasDoubleQ = 1 142 | if ( hasSingleQ and hasDoubleQ ): 143 | print " FATAL ERROR ??? !!! single and double quotes ??? !!! " 144 | sys.exit(-1) 145 | 146 | if ( hasSingleQ ): 147 | ## print " Handling repeated field with single quotes ... " 148 | ## print dataRow[ii] 149 | dTok = dataRow[ii][1:-1].split("'") 150 | ## print len(dTok), dTok 151 | d2 = [] 152 | for d in dTok: 153 | d = d.strip() 154 | if ( d!='u' and d!=', u' and len(d) > 0 ): 155 | d2 += [ d ] 156 | ## print len(d2), d2 157 | dTok = d2 158 | ## print " " 159 | ## if ( len(dTok) > 2 ): sys.exit(-1) 160 | 161 | elif ( hasDoubleQ ): 162 | print " Handling repeated field with double quotes ... " 163 | print dataRow[ii] 164 | sys.exit(-1) 165 | 166 | else: 167 | dTok = dataRow[ii][1:-1].split(',') 168 | 169 | sTok = copy.deepcopy(dTok) 170 | dTok.sort() 171 | if ( 0 ): 172 | if ( sTok[0] != dTok[0] ): 173 | print " sorting changed things !!! " 174 | print dTok 175 | print sTok 176 | sys.exit(-1) 177 | 178 | # print dataRow[ii] 179 | # print dTok 180 | for d in dTok: 181 | d = d.strip() 182 | if ( schemaInfo[ii][1] == 'string' ): 183 | if ( d.startswith("u'") ): d = d[2:-1] 184 | if ( d == ',' ): continue 185 | outRow += '"%s",' % d 186 | 187 | outRow = outRow[:-1] + '],' 188 | # print " " 189 | # print outRow 190 | 191 | else: 192 | print " ------ " 193 | print " hmmmmmmmm ... what do I do now ??? " 194 | print schemaInfo[ii] 195 | print " <%s> " % ( dataRow[ii] ) 196 | print " ------ " 197 | sys.exit(-1) 198 | 199 | outRow = outRow[:-1] 200 | outRow += '}' 201 | 202 | # print " " 203 | # print " FINAL ROW " 204 | # print outRow 205 | 206 | return ( outRow ) 207 | 208 | # -------------------------------------------------------------- 209 | # -------------------------------------------------------------- 210 | 211 | if ( len(sys.argv) != 4 ): 212 | print " " 213 | print " Usage : %s " 214 | sys.exit(-1) 215 | 216 | inFilename = sys.argv[1] 217 | jsonSchemaFilename = sys.argv[2] 218 | outFilename = sys.argv[3] 219 | 220 | # first we need to read in the JSON schema ... 221 | schemaInfo = readJSONschemaFromFile ( jsonSchemaFilename ) 222 | 223 | # open data file ... 224 | try: 225 | if inFilename.endswith('gz'): 226 | dataFile = gzip.open(inFilename,"r") 227 | else: 228 | dataFile = open(inFilename,"r") 229 | except: 230 | print 'requires input filename as command-line parameter' 231 | if ( len(inFilename) > 0 ): 232 | print ' --> failed to open <%s> ' % inFilename 233 | sys.exit() 234 | 235 | print " " 236 | print "Parsing input file <%s>." % inFilename 237 | print " " 238 | 239 | # first line is expected to be the header 240 | aLine = dataFile.readline() 241 | aLine = aLine.strip() 242 | headerRow = aLine.split('\t') 243 | ## print headerRow 244 | 245 | # make sure the headerRow matches the JSON schema ... 
246 | if ( len(headerRow) != len(schemaInfo) ): 247 | print " ERROR: number of tokens in the first row does not match input schema ... " 248 | sys.exit(-1) 249 | 250 | else: 251 | allMatch = True 252 | for ii in range(len(headerRow)): 253 | ## print " comparing <%s> and <%s> " % ( headerRow[ii], schemaInfo[ii][0] ) 254 | if ( headerRow[ii] != schemaInfo[ii][0] ): allMatch = False 255 | if ( not allMatch ): 256 | print " field names do not match, but that might be ok: " 257 | for ii in range(len(headerRow)): 258 | print headerRow[ii], " : ", schemaInfo[ii] 259 | 260 | # open the output file ... 261 | jsonFile = open(outFilename,"w") 262 | 263 | done = 0 264 | numRows = 0 265 | while not done: 266 | 267 | # now we're going to read and 'translate' each line, one-by-one ... 268 | aLine = dataFile.readline() 269 | if ( len(aLine) == 0 ): 270 | done = 1 271 | continue 272 | 273 | try: 274 | if ( ord(aLine[-1]) < 32 ): aLine = aLine[:-1] 275 | except: 276 | pass 277 | 278 | ## print len(aLine) 279 | ## print " %d <%s> " % ( len(aLine), aLine ) 280 | dataRow = aLine.split('\t') 281 | if ( len(dataRow) == 0 ): continue 282 | 283 | if ( len(dataRow) != len(schemaInfo) ): 284 | print " ERROR ??? # of values in data row is not as expected ??? ", len(dataRow), len(schemaInfo) 285 | print " " 286 | for ii in range(min(len(dataRow),len(schemaInfo))): 287 | print " %3d %s %s " % ( ii, schemaInfo[ii][0], dataRow[ii] ) 288 | sys.exit(-1) 289 | 290 | outRow = translateInputRow ( dataRow, schemaInfo ) 291 | jsonFile.write ( "%s\n" % outRow ) 292 | 293 | numRows += 1 294 | if ( numRows % 10000 == 0 ): print numRows, " ... " 295 | 296 | dataFile.close() 297 | jsonFile.close() 298 | 299 | # -------------------------------------------------------------- 300 | --------------------------------------------------------------------------------
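As a closing illustration of the filter-file format used throughout python/pairwise/tests, the snippet below is a minimal standalone sketch (not a repository file): it re-implements the parsing rule of readFilterFile() from the pairwise code above so the mapping from 'key:value' filter lines to query fragments can be tried without BigQuery access. The example text mirrors tests/test1/filter_file_1.txt and the comma-joined key list is copied from readFilterFile(); everything else here is illustrative only, and split(':', 1) is used so a ':' inside a value is preserved (the original splits on every ':').

# Keys whose repeated entries become a comma-separated SELECT list; all other
# repeated keys (notably 'filter') are joined with ' AND ' into one WHERE clause.
CONCAT_WITH_COMMA = ['idvar', 'valuevar', 'annotvar', 'tablevar', 'tablegroup']

def parse_filter_text(text):
    """Build the query-term dictionary the same way readFilterFile() does."""
    ffdict = {}
    for line in text.strip().splitlines():
        k, v = [s.strip() for s in line.split(':', 1)]
        if k not in ffdict:
            ffdict[k] = v
        elif k in CONCAT_WITH_COMMA:
            ffdict[k] = ffdict[k] + ",\n" + v
        else:
            ffdict[k] = ffdict[k] + " AND " + v
    return ffdict

example = """\
table:isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM
tablejoin:case_barcode
tablevar:project_short_name
valuevar:normalized_count
tablegroup:HGNC_gene_symbol
filter:project_short_name='TCGA-BRCA'
filter:HGNC_gene_symbol='LARP1'
"""

if __name__ == '__main__':
    for key, value in parse_filter_text(example).items():
        print(key, '->', repr(value))
    # The two 'filter' lines are concatenated into a single WHERE clause:
    #   "project_short_name='TCGA-BRCA' AND HGNC_gene_symbol='LARP1'"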