├── Demo.ipynb
├── Gene_Annotation
│   └── gencode.v22.genes.txt
├── Images
│   └── TCGA Barcode.png
├── README.md
├── Test_Manifest.txt
├── gdc-rnaseq-tool.py
└── requirements.txt
/Demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## GDC RNA-Seq Tool Example\n",
8 | "### This notebook lets users upload a manifest file, and download Merged RNA-Seq Data"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import base64\n",
18 | "from IPython.display import HTML, display\n",
19 | "from ipyupload import FileUpload\n",
20 | "import subprocess\n",
21 | "import pandas as pd\n",
22 | "\n",
23 | "def Write_Manifest_File(w):\n",
24 | " File_Name = list(w.value.keys())[0]\n",
25 | " file = open(File_Name,'w') \n",
26 | " file.write(w.value[File_Name]['content'].decode('UTF-8')) \n",
27 | " file.close()\n",
28 | " return File_Name"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Step 1: Upload a Manifest File"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "w = FileUpload()\n",
45 | "display('Upload a Manifest File:' , w)"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## Step 2: Run GDC-RNASeq-Tool On Manifest"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "Manifest_Name = Write_Manifest_File(w)\n",
62 | "print('Running GDC-RNA-Seq Tool on Manifest File called ' + Manifest_Name + '....')\n",
63 | "p = subprocess.run(['python','gdc-rnaseq-tool.py',Manifest_Name,'--hugo'],stdout=subprocess.PIPE)\n",
64 | "print('Finished Running')"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "## Step 3: Download Merged Data"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# Read in Data Frame that was created\n",
81 | "File_Location = p.stdout.decode('UTF-8').split('\\n')[1].split(': ')[1]\n",
82 | "\n",
83 | "Files = []\n",
84 | "for line in p.stdout.decode('UTF-8').split('\\n'):\n",
85 | " if 'Creating merged' in line:\n",
86 | " Files.append(line.split(' ')[7])\n",
87 | "\n",
88 | "def create_download_link( df, title , filename ): \n",
89 | " csv = df.to_csv(sep='\\t')\n",
90 | " b64 = base64.b64encode(csv.encode())\n",
91 | " payload = b64.decode()\n",
92 | " html = '<a download=\"{filename}\" href=\"data:text/csv;base64,{payload}\" target=\"_blank\">{title}</a>'\n",
93 | " html = html.format(payload=payload,title=title,filename=filename)\n",
94 | " return HTML(html)\n",
95 | "\n",
96 | "for file in Files:\n",
97 | " df = pd.read_csv(File_Location + file,sep='\\t')\n",
98 | " display(create_download_link(df, file, file))"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": []
107 | }
108 | ],
109 | "metadata": {
110 | "kernelspec": {
111 | "display_name": "Python 3",
112 | "language": "python",
113 | "name": "python3"
114 | },
115 | "language_info": {
116 | "codemirror_mode": {
117 | "name": "ipython",
118 | "version": 3
119 | },
120 | "file_extension": ".py",
121 | "mimetype": "text/x-python",
122 | "name": "python",
123 | "nbconvert_exporter": "python",
124 | "pygments_lexer": "ipython3",
125 | "version": "3.7.3"
126 | }
127 | },
128 | "nbformat": 4,
129 | "nbformat_minor": 2
130 | }
131 |
--------------------------------------------------------------------------------
/Images/TCGA Barcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cpreid2/gdc-rnaseq-tool/085aeb532bb0dd7b2b84ecb9cd960f2c491e5d6d/Images/TCGA Barcode.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gdc-rnaseq-tool
2 | Tool to download / merge individual RNASeq files from the [GDC Portal](https://portal.gdc.cancer.gov) into matrices identified by [TCGA barcode](https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode).
3 |
4 | 
5 |
6 | __Description__:
7 |
8 | The `gdc-rnaseq-tool` performs the following:
9 |
10 | 1. Downloads RNA-Seq / miRNA-Seq data files using a GDC manifest file
11 | 2. Unzips the files into separate folders identified by experimental strategy and bioinformatics workflow
12 | 3. Merges the files into separate matrix files identified in the table below
13 |
14 | *The script will ignore any files in the manifest file that are not [Transcriptome Profiling files](https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Transcriptome%20Profiling%22%5D%7D%7D%5D%7D) generated from the GDC RNA-Seq / miRNA-Seq bioinformatics pipelines located on the [GDC Main Portal](https://portal.gdc.cancer.gov):*
15 |
16 | [RNA-Seq / miRNA-Seq Files](https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Transcriptome%20Profiling%22%5D%7D%7D%5D%7D)
17 |
18 | [RNA-Seq Bioinformatics Pipeline Documentation](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/)
19 |
20 | [miRNA-Seq Bioinformatics Pipeline Documentation](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/miRNA_Pipeline/)
21 |
22 | __Inputs and Outputs__:
23 |
24 | | I/O | File |
25 | |---|---|
26 | | Input | [GDC Manifest File](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload/#obtaining-a-manifest-file-for-data-download) |
27 | | Output | Merged_Counts.tsv ([HTSeq - Counts](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#upper-quartile-fpkm)) |
28 | | | Merged_FPKM.tsv ([HTSeq - FPKM](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#fpkm)) |
29 | | | Merged_FPKM-UQ.tsv ([HTSeq - FPKM-UQ](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#upper-quartile-fpkm)) |
30 | | | Merged_miRNA_Counts.tsv |
31 | | | Merged_miRNA_rpmm.tsv |
32 |
33 |
34 | __Requirements__:
35 |
36 | - Python 3+
37 | - pandas ( https://pandas.pydata.org/pandas-docs/stable/install.html ): `pip3 install pandas`
38 | - requests (http://python-requests.org): `pip3 install requests`
39 |
40 | __Quick Start__:
41 |
42 | 1. [Download](https://github.com/cpreid2/gdc-rnaseq-tool/releases/download/1.0/gdc-rnaseq-tool.py) `gdc-rnaseq-tool.py` python script
43 | 2. Download manifest containing RNA/miRNA expression files from https://portal.gdc.cancer.gov/
44 | 3. `python3 gdc-rnaseq-tool.py <manifest_file>`
45 |
46 | Optional: Add `--hugo` to the command to include the HUGO gene symbol as a separate column.
47 |
48 | `python3 gdc-rnaseq-tool.py <manifest_file> --hugo`
49 |
50 | ---
51 |
52 | The GDC RNASeq tool produces matrices of merged RNA/MiRNA expression data given a manifest file.
53 |
54 | Usage: `python3 gdc-rnaseq-tool.py <manifest_file>`
55 |
56 | Notes:
57 | * A test manifest is provided for troubleshooting: `python3 gdc-rnaseq-tool.py Test_Manifest.txt`
58 | * Files are by default downloaded to the same folder as the manifest file that was provided
59 |
60 | **Release Notes:**
61 |
62 | Version 1.0: Feb 8, 2018
63 |
64 | * Initial release
65 |
66 | Known Issues:
67 | N/A
68 |
--------------------------------------------------------------------------------
/Test_Manifest.txt:
--------------------------------------------------------------------------------
1 | id filename md5 size state
2 | a27546f4-0850-4b84-9af9-89c760474037 14a5b723-8762-4bcd-82bf-5cb50daf80bc.htseq.counts.gz eeab2a853fe0029d3dbe73e475494ff9 245378 live
3 | d72f41c9-f94b-4880-8fd7-394828df6e6b 9361d2a0-761f-42a3-b672-5c6c0a964fd6.mirbase21.isoforms.quantification.txt 94512564c095f5701ce6f6fc6f00bfd2 373138 live
4 | a2b0e1cf-859a-4ee8-8275-6b01b81d8355 1f495645-b2ce-4726-a3fb-4fdd898e423a.htseq.counts.gz 7ca66098ca549454858900f59d2eee48 251678 live
5 | bcfdcdac-c2e5-448f-a2f4-38af70c9bca4 7794958e-853e-4599-a30c-1eee64e28739.FPKM-UQ.txt.gz d7c6057daa0304b26c41de1afc005752 468986 live
6 | c6221cb9-94e3-466f-b6d3-c862db204e1d ce0e7450-7b99-42e7-bb62-4077b53253e2.mirbase21.mirnas.quantification.txt 654f9dc754e42eb2773c852f5007bf5d 50338 live
7 | ca3dab87-6bd9-405e-8810-34c5d095a2a2 ce58ee65-6131-496e-b29a-b5f97a26eb9d.htseq.counts.gz 2d708a10efad30e593fa5bad9cd7c7ef 250853 live
8 | 79d488fe-c1ab-4935-935c-ae80d726bad9 5d418e71-4e7d-4111-8aad-4ea782fd87b6.htseq.counts.gz d7f0282068f0b78f0e77068c86cde07d 248105 live
9 | 96173454-3d93-4569-bdf4-2fae6e091a84 4a6db870-fe49-4e39-822a-f513e05d3f61.mirbase21.isoforms.quantification.txt ec0b04ce0953f2822cd6040678a8d638 319096 live
10 | de09e9ab-56c6-4ade-ba33-26ed46a2bce2 78e5d21b-87ac-4557-8089-77d532039762.mirbase21.isoforms.quantification.txt 66e38557517c8c1ee91a3f9cc6d940bb 434178 live
11 | 1c923966-fac1-4571-b310-11295d14fcdc 7f75084d-fa0a-4a63-9bb0-9a0917fc149e.htseq.counts.gz 21daa282621d83a5884873812e704446 240470 live
12 | 2841c244-29b2-46c3-8f1d-c96ceda9d52a 79d9c719-ff0d-4574-8b43-db11bb355c0d.FPKM-UQ.txt.gz a548234817bc9d8ab64eefaa627505fd 467863 live
13 | 93fee0ad-4995-43b5-9b3d-4e51e529fc4e 87c98a63-24f5-49c7-b452-1d7185fb16a9.htseq.counts.gz e04e110b985308dd9c75083721e63d4e 248201 live
14 | 4a2ae4f2-1525-46c9-bc29-58fa40b4e905 ef4efb01-b06d-4e06-a5a8-74191a2a4779.mirbase21.mirnas.quantification.txt fbdf1345d72141d0080fb07b0124ed6a 50518 live
15 | 7eb56a73-88fa-411e-bd2e-aed4a1dc6a64 659581b9-cd77-4f3b-8fc9-46c5c806888d.mirbase21.isoforms.quantification.txt 5beaf3f493b2e7e37f3649579a311990 358750 live
16 | 67fa2520-5cd9-4c27-8e3e-2d47e69e6dd5 415cbd80-d98d-45c3-beda-e00cd585c2a4.mirbase21.mirnas.quantification.txt 6700dd5df811e5c5f60df8071985dff2 50527 live
17 | 664bf371-a42b-4563-8e2c-b49864bb4f0f 3d5ba239-49aa-4958-a873-06277c7f3999.mirbase21.isoforms.quantification.txt 7818742fc83d2da8f5d1c3fec413922e 384451 live
18 | 9ddc9b46-b158-44b0-9e32-253b1d1fa85c 7bd62b90-1973-491b-8c70-b0f20deaecfd.htseq.counts.gz 5332e483862fd9b63b58060143335d37 244374 live
19 | 731e3393-2409-4f74-aa23-93c56aeccfa3 3e9b3575-a067-4cb2-adf1-13a98c08f6ca.htseq.counts.gz 7ee8dac667363004ee5d5538a2c031a0 243728 live
20 | c0d965cb-072f-4f7e-9385-6441aa8af1ca 82d61f67-f56e-4855-aa2e-44350982f8af.FPKM-UQ.txt.gz 4a796a1885bee1128cb6d51ae71d9393 484825 live
21 | 2e783233-f860-4c57-9ee4-a69e45a1f092 986291b6-5c83-4862-852d-4c591927a986.mirbase21.isoforms.quantification.txt 0bfa74b13c02e08d890c97b971f724f8 299608 live
22 | c3e730ea-5dba-49cb-ad2f-d8e5c640c25d cc16a3ce-4569-47c1-8155-1f6d1e64f999.FPKM-UQ.txt.gz 766d665fdb25bf9078312d0d26acc674 451371 live
23 | daca9eac-4696-4e5f-bcc9-757e485d7073 8b14a5e1-cccc-4596-a367-bbd76e66bdde.mirbase21.isoforms.quantification.txt d5bfe8ae396be19d581eabe75a5963b9 423648 live
24 | 313e9ed7-0684-4711-8dcc-dbb36a64b9a8 0eb17745-aa52-4097-8368-119e27a052cd.mirbase21.isoforms.quantification.txt f5d4e85054a2210d833096ae692fb03a 450157 live
25 | 70b6ce0e-b01b-4e1a-af27-5a0d24bfe11d c668e123-05fe-4983-862d-10b413b4138d.mirbase21.mirnas.quantification.txt 163628c8acabf1d04576373cb7b1c98d 50262 live
26 | 260ca241-b3ca-4d80-9530-3bd27cc86db8 6b0df792-5601-4d5d-8844-4a66391dbcaf.FPKM.txt.gz 4a66e935cb542f3ddfc8cab7d2557e8b 504086 live
27 | 056a99cb-53ab-4146-9277-33caeb9d1a02 25bfcac0-ba7b-41cb-8343-34bb425857aa.FPKM.txt.gz 90e64d1ba9171ca3a18baa3ed1db91c6 498889 live
28 | 0a63413a-4839-4adf-b1c1-52166fe02d80 57817aa5-3bb6-47e7-a344-2857668dc485.FPKM-UQ.txt.gz 9477d36020fed556ba25dcc3bb2c52f7 491429 live
29 | 29426006-e5c3-4b10-a649-b9dda2dac9ba 21dcd167-85e9-46a7-94b2-18ac56852e15.FPKM.txt.gz a0f51b81c5d403a7f64d45325b998a5c 486525 live
30 | 5ddc5a1b-1cc3-4afc-addb-47ff357873ee d5d1ce0c-e908-4890-87d0-0a7b3b1bbb00.htseq.counts.gz 5f91db9440f711fab895d6266e4550b0 239099 live
31 | a85657aa-9ea4-4c8e-aac8-9258cdcc7fc7 fd181f89-6985-469d-93a9-5c108a81f8df.mirbase21.mirnas.quantification.txt 67d158ecd63462ee3c7f63dc84d44ce8 50343 live
32 | 9b54a57d-e0b5-4dc0-bc14-cf8d4be91bf2 55cc5090-826a-4e3b-bdfc-d4db8b69071f.FPKM-UQ.txt.gz 7ef24d19014825c3dd050de107f7e2e9 483165 live
33 | 7f8ccd20-fec1-4b49-b266-9b2057a12e35 fd181f89-6985-469d-93a9-5c108a81f8df.mirbase21.isoforms.quantification.txt 1e081d00e9bc07d61eaadff087056ce2 328729 live
34 | a8b3929e-a417-425a-8601-60c625702b2b 4d76c001-f594-45e4-ae99-3f9a97707485.mirbase21.mirnas.quantification.txt 56832362c899722a2450f4620d2b7c29 50407 live
35 | ae2b5eb3-721a-496b-83f0-9c079f26ed91 52c5b59a-b6e5-46e8-bef4-df855b908f55.mirbase21.isoforms.quantification.txt 4c68942f1fd11082f68a5d1564a2a1f2 415792 live
36 | 677d7e53-20ca-4af9-a6cc-a0db302882f6 fc4e8ec0-39a2-490d-8f37-3d9c44e40675.mirbase21.isoforms.quantification.txt 6d95417a95e9028c080653c4a99f1fe2 339941 live
37 | f2c0e755-05e9-4922-b02e-657b482eae7e 89e5198e-8a80-4cdc-a457-17368563bdeb.FPKM-UQ.txt.gz e3569554171b6fadc9aac5382dd331d4 512024 live
38 | 704fcff5-5a13-4ace-96a2-5fff3202e5d6 986291b6-5c83-4862-852d-4c591927a986.mirbase21.mirnas.quantification.txt f373917d4d3fd83db1ce4cb4a628a806 50249 live
39 | 4b2eaf7a-ab80-4d9c-957f-d8ad1f0abf79 c415aef2-a409-4950-9374-3e2fec218c28.htseq.counts.gz c64698d643f8780786c1c08602da28a3 244452 live
40 | faea4eb7-738f-49fb-ad7a-891b2fbbe041 8196eb2a-ebc9-40ae-ac8c-0ff365a17e50.mirbase21.mirnas.quantification.txt dc4bc95101bbac389ae136903a8268e4 50280 live
41 | 5f3dd1e6-163c-4c26-95b1-76ad2f15d5ba f1fddf0c-babe-4c67-bd3b-5633e17086a4.FPKM-UQ.txt.gz 2389bd06439826ef4ea8bab4fd2cb5d1 487462 live
42 | 79b9dfde-1346-4686-8bd3-801b7ba97103 8ba8fe41-cc4b-4bbc-974e-6b75a3ac3be6.FPKM-UQ.txt.gz fb2a2adc96c58ff1b25810acf76daf14 504102 live
43 | 82bbf244-5cf2-4c7e-81e5-4e3b2055cb97 47416a55-dadc-4cfa-9115-670abd7690b5.mirbase21.mirnas.quantification.txt e2956c61c8f1b02b53069c4ff1d5380d 50438 live
44 | 36e03a23-09b7-4ac1-9226-f7664ae630be 08608b79-aa23-4993-a295-7a6a71b807e9.htseq.counts.gz 27df09349dd9d7f364dc1ae292673c7f 241397 live
45 | 88dea443-cbab-4941-a44c-9bc84f2e6b41 9361d2a0-761f-42a3-b672-5c6c0a964fd6.mirbase21.mirnas.quantification.txt 78837c65db25894c9dad32e64e376753 50337 live
46 | d8425e1d-e91d-4e38-86c1-ea295378cb46 4d1c7dee-2663-4b64-8d5d-685c5c42b2c8.mirbase21.mirnas.quantification.txt 305acc2b154ed76ee57c4cd8228c99f3 50498 live
47 | abb729f5-5bc5-4b43-84ed-10aa3fc5170a 071c48ec-7410-42e7-af93-1575152e8e48.htseq.counts.gz d175914e3eb03e4c10901a867b014082 248942 live
48 | 9a304e4a-d833-4c5f-8312-691753943b82 7d1546b5-f408-4d76-9621-88f49b4310e4.FPKM.txt.gz bf23c77d5d1df5361b545effc9a073d3 493308 live
49 | b70ba737-dad3-4d10-a494-4ae72ca89a71 2e22ed16-230b-4695-a634-dcdbb6b9b2c6.FPKM-UQ.txt.gz 6468049503cf41cacf4a5a87dd7ef515 468098 live
50 | c9fa73ae-192e-4a8f-ae98-41bf52ce9aa7 835ae143-fbbd-426b-b072-6ed8ca004174.htseq.counts.gz afec9753f45289cda12a366507eb25fa 250638 live
51 | f2b9f9ef-927e-443b-bfee-335beb92ffd8 1b27c21f-792f-4691-a7c9-7b2eccba9425.mirbase21.isoforms.quantification.txt bebde2c28d74506798497bb2fed54d29 295064 live
52 | cf1ea66d-7817-4eca-931f-46aae260e394 3aeafc0b-b413-4dff-b633-6faef3a32bc3.FPKM-UQ.txt.gz 44c0bbb7c94f49da67432a13ca10fe15 512022 live
53 | 1f3dd1d2-5efb-4b31-99bb-264c8c0092f8 0a2b79c6-657b-46ec-b1b5-fa46f1ff9d6b.mirbase21.mirnas.quantification.txt 2df0bd8b8e240a46bbf278f198d5da50 50352 live
54 | 224aa26c-c313-4ad3-a645-75c367801f18 e586c24f-6627-4c1d-abcd-e97701612b40.mirbase21.mirnas.quantification.txt ac4f523579ba1941a4659919ec9730de 50404 live
55 | 409d0cb5-b494-4443-a1fe-d76b283212dc 5d418e71-4e7d-4111-8aad-4ea782fd87b6.FPKM-UQ.txt.gz 96f1063b37e4527615a94787c0dee726 495106 live
56 | b6e0804b-8f19-4615-ba89-a8e1ce1d2d2a b8217881-f8a2-4f50-a2bd-4f641b3a23d8.mirbase21.isoforms.quantification.txt 59af979ef22ddac5208d8e4769b1e86c 354398 live
--------------------------------------------------------------------------------
/gdc-rnaseq-tool.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import urllib
4 | import pandas as pd
5 | import sys
6 | import hashlib
7 | import argparse
8 | import os, fnmatch, gzip, shutil, tarfile
9 | from pathlib import Path
10 | import time
11 |
12 | ## -------------- JSON Filters constructor :
class Filter(object):
    """Builder for a GDC API filter expression.

    Conditions are collected one at a time with ``add_filter`` and are all
    nested under a single top-level ``"and"`` operator.
    """

    def __init__(self):
        # Root node: an "and" over every condition appended later.
        self.filter = {"op": "and", "content": []}

    def add_filter(self, Field, Value, Operator):
        """Append one condition applying ``Operator`` to ``Field`` and ``Value``."""
        condition = {
            "op": Operator,
            "content": {"field": Field, "value": Value},
        }
        self.filter['content'].append(condition)

    def create_filter(self):
        """Serialize the filter to compact JSON, store it on ``final_filter`` and return it."""
        serialized = json.dumps(self.filter, separators=(',', ':'))
        self.final_filter = serialized
        return serialized
24 |
25 | ## -------------- Function for downloading files :
def download(uuid, name, md5, ES, WF, DT, retry=0):
    """Download one GDC file, save it to disk and verify its MD5 checksum.

    The destination path comes from the module-level ``OFILE['data']``
    template and the source URL from ``PARAM['url-data']``. Any failed
    attempt (network error, write error or checksum mismatch) is retried
    until the attempt counter reaches ``PARAM['max retry']``.

    Args:
        uuid: GDC file UUID; used in the URL and in the output path.
        name: File name to save the download under.
        md5: Expected hex MD5 digest of the file contents.
        ES: Experimental strategy component of the output path.
        WF: Workflow type component of the output path.
        DT: Data type component of the output path.
        retry: Attempt number to start counting from (0 for the first try).

    Returns:
        Tuple ``(uuid, attempt, True)`` where ``attempt`` is the number of
        the attempt that succeeded.

    Raises:
        Exception: The error from the last attempt, once the attempt counter
            has reached ``PARAM['max retry']``.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another package doing so.
    import urllib.request

    attempt = retry
    # Iterate rather than recurse so failed attempts do not pile up as nested
    # exception contexts and stack frames.
    while True:
        try:
            fout = OFILE['data'].format(ES=ES, WF=WF, DT=DT, uuid=uuid, name=name)

            print("Downloading (attempt {}): {}".format(attempt, uuid))
            url = PARAM['url-data'].format(uuid=uuid)

            with urllib.request.urlopen(url) as response:
                data = response.read()

            os.makedirs(os.path.dirname(fout), exist_ok=True)

            with open(fout, 'wb') as f:
                f.write(data)

            # Hash what was actually written, once, so the check covers the
            # file on disk.
            with open(fout, 'rb') as f:
                digest = hashlib.md5(f.read()).hexdigest()

            if digest != md5:
                # Do not leave a corrupt file behind for the merge step.
                os.remove(fout)
                raise ValueError('MD5 Sum Error on ' + uuid)

            return (uuid, attempt, True)
        except Exception as e:
            print("Error (attempt {}): {}".format(attempt, e))
            if attempt >= PARAM['max retry']:
                raise
            attempt += 1
54 |
55 | ## -------------- Function for reading manifest file :
def read_manifest(manifest_loc):
    """Read the file UUIDs from a GDC manifest file.

    The first line must start with ``id`` (the manifest header). Every later
    line contributes the text before its first tab as a UUID.

    Args:
        manifest_loc: Path to the manifest file.

    Returns:
        List of UUID strings in file order.

    Raises:
        ValueError: If the first line does not start with ``id``.
    """
    uuid_list = []
    with open(manifest_loc, 'r') as myfile:
        if myfile.readline()[0:2] != 'id':
            raise ValueError('Bad Manifest File')
        for row in myfile:
            # Strip so a row without tabs does not keep its newline, and skip
            # blank lines (e.g. a trailing newline at end of file) instead of
            # emitting an empty UUID.
            uuid = row.split('\t')[0].strip()
            if uuid:
                uuid_list.append(uuid)
    return uuid_list
65 |
66 | ## -------------- Function that unpacks gz files into another directory :
def gunzip(file_path, output_path):
    """Decompress the gzip file at ``file_path`` and write the bytes to ``output_path``."""
    with gzip.open(file_path, "rb") as compressed:
        with open(output_path, "wb") as target:
            shutil.copyfileobj(compressed, target)
70 |
71 | ## -------------- Argument Parser Function :
def arg_parse(argv=None):
    """Parse the command-line arguments of the tool.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what the script relies on.

    Returns:
        ``argparse.Namespace`` with ``manifest_file`` (str) and ``hugo`` (bool).
    """
    parser = argparse.ArgumentParser(
        description='----GDC RNA Seq File Merging Tool v0.1----',
        # Show the optional flag in the usage line as well.
        usage='python3 gdc-rnaseq-tool.py MANIFEST_FILE [--hugo]')
    # The help text previously advertised a "-u" option that is not defined.
    parser.add_argument('manifest_file', action="store", help='Path to manifest file')
    parser.add_argument('-g', '--hugo', action="store_true", help='Add Hugo Symbol Name')
    return parser.parse_args(argv)
80 |
81 | ## -------------- Errors when passing incorrect name :
def error_parse(code):
    '''
    Print the error message registered under ``code`` and exit with status 2.
    '''
    messages = {
        "bad_mani": "Input must be valid GDC Manifest. "
                    "\n\tGo to https://portal.gdc.cancer.gov/ to download a manifest",
    }
    message = messages[code]
    print("ERROR : " + message)
    sys.exit(2)
92 |
93 | ## -------------- Main function :
def main(args):
    """Publish the parsed options as the module-level globals ``manifest_file`` and ``hugo``."""
    global manifest_file, hugo
    # Copy each option from the namespace into its module-level name.
    manifest_file = args.manifest_file
    hugo = args.hugo
99 |
# 0. Run Program
# -------------------------------------------------------
# Parse the command line; main() stores the result in the module-level
# globals `manifest_file` and `hugo` used below.
main(arg_parse())

# Timestamp that gives every run its own output folder
timestr = time.strftime("%Y%m%d-%H%M%S")

# 1. Read in manifest and location of folder
# -------------------------------------------------------
#Location = os.path.dirname(os.path.abspath(__file__)) + '/'
File = manifest_file
# Remove backslashes and surrounding whitespace from the given path
# (presumably shell escapes from a pasted path; verify against usage).
Manifest_Loc = str(File.replace('\\', '').strip())
# Derive the output folder from the cleaned path so it is created next to the
# manifest that is actually read. The trailing '/' is required: later steps
# append relative paths to Location by plain string concatenation.
Location = str(Path(Manifest_Loc).parents[0]) + '/Merged_RNASeq_' + timestr + '/'

os.makedirs(Location)

print('Reading Manifest File from: ' + Manifest_Loc)
print('Downloading Files to: ' + Location)

UUIDs = read_manifest(Manifest_Loc)
120 |
# 2. Get info about files in manifest
# -------------------------------------------------------
# Restrict the query to the manifest's UUIDs and to the workflow types this
# tool knows how to merge.
File_Filter = Filter()
File_Filter.add_filter("files.file_id",UUIDs,"in")
File_Filter.add_filter("files.analysis.workflow_type",["HTSeq - Counts","HTSeq - FPKM","HTSeq - FPKM-UQ","BCGSC miRNA Profiling"],"in")

EndPoint = 'files'
Fields = 'cases.samples.portions.analytes.aliquots.submitter_id,file_name,cases.samples.sample_type,file_id,md5sum,experimental_strategy,analysis.workflow_type,data_type'
# NOTE(review): presumably a manifest with more than 10000 matching files
# would be truncated to this page size - confirm against the GDC API.
Size = '10000'

Payload = {'filters':File_Filter.create_filter(),
           'format':'json',
           'fields':Fields,
           'size':Size}
r = requests.post('https://api.gdc.cancer.gov/' + EndPoint, json=Payload)
# Fail here with the HTTP error instead of a confusing KeyError / JSON error
# further down when the API rejects the request.
r.raise_for_status()
data = json.loads(r.text)
file_list = data['data']['hits']

# Dictionary: file UUID -> metadata needed for download and folder layout.
# TCGA_Barcode_Dict: file name -> one-element set holding the aliquot barcode
# (the merge steps read it back with list(...)[0]).
Dictionary = {}
TCGA_Barcode_Dict = {}
for file in file_list:
    UUID = file['file_id']
    # Only the first case/sample/portion/analyte/aliquot of each hit is used.
    Barcode = file['cases'][0]['samples'][0]['portions'][0]['analytes'][0]['aliquots'][0]['submitter_id']
    File_Name = file['file_name']

    Dictionary[UUID] = {'File Name': File_Name,
                        'TCGA Barcode':Barcode,
                        'MD5': file['md5sum'],
                        'Sample Type': file['cases'][0]['samples'][0]['sample_type'],
                        'Experimental Strategy': file['experimental_strategy'],
                        'Workflow Type': file['analysis']['workflow_type'],
                        'Data Type': file['data_type']}

    TCGA_Barcode_Dict[File_Name] = {Barcode}
156 |
# 3. Download files
# -------------------------------------------------------

# Path template for each downloaded file, rooted at this run's output folder
OFILE = {'data': Location + "{ES}/{WF}/{DT}/{uuid}/{name}"}

PARAM = {
    # URL
    'url-data': "https://api.gdc.cancer.gov/data/{uuid}",
    # Persistence upon error
    'max retry': 10,
}

# Fetch every file described in Dictionary (UUID -> metadata)
for key in Dictionary:
    value = Dictionary[key]
    download(uuid=key,
             name=value['File Name'],
             md5=value['MD5'],
             ES=value['Experimental Strategy'],
             WF=value['Workflow Type'],
             DT=value['Data Type'])
179 |
# 4. Merge the RNA Seq files
# -------------------------------------------------------
# For each RNA-Seq workflow: unzip every downloaded .gz file, read it as a
# two-column table, and write one merged matrix with a column per barcode.

RNASeq_WFs = ['HTSeq - Counts', 'HTSeq - FPKM-UQ','HTSeq - FPKM']

# One download folder per workflow, matching the layout used by the download step
GZipLocs = [Location + 'RNA-Seq/' + WF for WF in RNASeq_WFs]

# Add Hugo Symbol
# Load the gene_id -> gene_name table (fetched over the network) only when
# --hugo was requested.
if hugo == True:
    url = 'https://github.com/cpreid2/gdc-rnaseq-tool/raw/master/Gene_Annotation/gencode.v22.genes.txt'
    gene_map = pd.read_csv(url,sep='\t')
    gene_map = gene_map[['gene_id','gene_name']]
    gene_map = gene_map.set_index('gene_id')

for i in range(len(RNASeq_WFs)):

    print('--------------')
    # Find all .gz files and ungzip into the folder
    pattern = '*.gz'
    Files = []

    # Create .gz directory in subfolder
    # (start from an empty UnzippedFiles folder; this also creates the
    # workflow folder when nothing was downloaded for it)
    if os.path.exists(GZipLocs[i] + '/UnzippedFiles/'):
        shutil.rmtree(GZipLocs[i] + '/UnzippedFiles/')
        os.makedirs(GZipLocs[i] + '/UnzippedFiles/')
    else:
        os.makedirs(GZipLocs[i] + '/UnzippedFiles/')

    for root, dirs, files in os.walk(GZipLocs[i]):
        for filename in fnmatch.filter(files, pattern):
            OldFilePath = os.path.join(root, filename)
            NewFilePath = os.path.join(GZipLocs[i] + '/UnzippedFiles/', filename.replace(".gz",".tsv"))

            gunzip(OldFilePath, NewFilePath) # unzip to New file path

            Files.append(NewFilePath) # append file to list of files

    # Barcode -> tuple of values, one entry per unzipped file
    Matrix = {}

    for file in Files:
        p = Path(file)
        # Rebuild the original .gz file name to look up the barcode
        Name = str(p.name).replace('.tsv','')
        Name = Name + '.gz'
        Name = TCGA_Barcode_Dict[Name]
        # TCGA_Barcode_Dict values are one-element sets; take the barcode out
        Name = str(list(Name)[0])
        Counts_DataFrame = pd.read_csv(file,sep='\t',header=None,names=['GeneId', Name])
        Matrix[Name] = tuple(Counts_DataFrame[Name])

    # Merge Matrices to dataframes and write to files
    if len(Matrix) > 0:
        Merged_File_Name = 'Merged_'+ RNASeq_WFs[i].replace('HTSeq - ','') + '.tsv'
        print('Creating merged ' + RNASeq_WFs[i] + ' File... ' + '( ' + Merged_File_Name + ' )')
        # The row index is the GeneId column of the last file read above.
        # NOTE(review): this assumes every file lists the same genes in the
        # same order - confirm.
        Counts_Final_Df = pd.DataFrame(Matrix, index=tuple((Counts_DataFrame['GeneId'])))
        if hugo == True:
            # Outer join on the index, so rows present on only one side are kept
            Counts_Final_Df = gene_map.merge(Counts_Final_Df, how='outer', left_index=True, right_index=True)
        Counts_Final_Df.to_csv(str(Location) + '/' + Merged_File_Name,sep='\t',index=True)
236 |
# 5. Merge the miRNA Seq files
# -------------------------------------------------------
# Collect the downloaded miRNA quantification files and write two merged
# matrices (read counts and reads-per-million), one column per barcode.
miRNASeq_WF = ['BCGSC miRNA Profiling']
miRNASeq_DTs = ['Isoform Expression Quantification','miRNA Expression Quantification']
# One download folder per data type, matching the layout used by the download step
miRNALocs = [Location + 'miRNA-Seq/BCGSC miRNA Profiling/' + DT for DT in miRNASeq_DTs]

print('--------------')

for i in range(len(miRNASeq_DTs)):

    # Find all files matching the pattern below under this data-type folder
    # (these files are read as-is; nothing is unzipped in this step).
    # NOTE(review): '*.isoforms.quantification.txt' files do not match this
    # pattern, so isoform files are presumably never merged - confirm intended.
    pattern = '*.mirnas.quantification.txt'
    Files = []

    for root, dirs, files in os.walk(miRNALocs[i]):
        for filename in fnmatch.filter(files, pattern):
            FilePath = os.path.join(root, filename)

            Files.append(FilePath) # append file to list of files

    # Barcode -> tuple of values, one entry per file
    miRNA_count_Matrix = {}
    miRNA_rpmm_Matrix = {}

    for file in Files:
        p = Path(file)
        Name = str(p.name)
        Name = TCGA_Barcode_Dict[Name]
        # TCGA_Barcode_Dict values are one-element sets; take the barcode out
        Name = str(list(Name)[0])

        miRNA_DataFrame = pd.read_csv(file,sep='\t')

        # Keep the ID plus one value column, renamed to the barcode
        miRNA_count_DataFrame = miRNA_DataFrame[['miRNA_ID','read_count']]
        miRNA_count_DataFrame.columns = ['miRNA_ID',Name]

        miRNA_rpmm_DataFrame = miRNA_DataFrame[['miRNA_ID','reads_per_million_miRNA_mapped']]
        miRNA_rpmm_DataFrame.columns = ['miRNA_ID',Name]

        miRNA_count_Matrix[Name] = tuple(miRNA_count_DataFrame[Name])
        miRNA_rpmm_Matrix[Name] = tuple(miRNA_rpmm_DataFrame[Name])

    # The row index of each merged matrix is the miRNA_ID column of the last
    # file read above.
    # NOTE(review): this assumes every file lists the same miRNAs in the same
    # order - confirm.
    if len(miRNA_count_Matrix) > 0:
        print('Creating merged miRNASeq Counts File... ( Merged_miRNA_Counts.tsv )')
        miRNA_Count_Final_Df = pd.DataFrame(miRNA_count_Matrix, index=tuple((miRNA_count_DataFrame['miRNA_ID'])))
        miRNA_Count_Final_Df.to_csv(str(Location) + '/Merged_miRNA_Counts.tsv',sep='\t',index=True)
    if len(miRNA_rpmm_Matrix) > 0:
        print('Creating merged miRNASeq rpmm File... ( Merged_miRNA_rpmm.tsv )')
        miRNA_rpmm_Final_Df = pd.DataFrame(miRNA_rpmm_Matrix, index=tuple((miRNA_rpmm_DataFrame['miRNA_ID'])))
        miRNA_rpmm_Final_Df.to_csv(str(Location) + '/Merged_miRNA_rpmm.tsv',sep='\t',index=True)
285 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | ipyupload
3 | requests
4 | 
--------------------------------------------------------------------------------