├── Demo.ipynb
├── Gene_Annotation
    └── gencode.v22.genes.txt
├── Images
    └── TCGA Barcode.png
├── README.md
├── Test_Manifest.txt
├── gdc-rnaseq-tool.py
└── requirements.txt


/Demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## GDC RNA-Seq Tool Example\n",
  8 |     "### This notebook lets users upload a manifest file, and download Merged RNA-Seq Data"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {},
 15 |    "outputs": [],
 16 |    "source": [
 17 |     "import base64\n",
 18 |     "from IPython.display import HTML, display\n",
 19 |     "from ipyupload import FileUpload\n",
 20 |     "import subprocess\n",
 21 |     "import pandas as pd\n",
 22 |     "\n",
 23 |     "def Write_Manifest_File(w):\n",
 24 |     "    File_Name = list(w.value.keys())[0]\n",
 25 |     "    file = open(File_Name,'w') \n",
 26 |     "    file.write(w.value[File_Name]['content'].decode('UTF-8')) \n",
 27 |     "    file.close()\n",
 28 |     "    return File_Name"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Step 1: Upload a Manifest File"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "w = FileUpload()\n",
 45 |     "display('Upload a Manifest File:' , w)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "## Step 2: Run GDC-RNASeq-Tool On Manifest"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "Manifest_Name = Write_Manifest_File(w)\n",
 62 |     "print('Running GDC-RNA-Seq Tool on Manifest File called ' + Manifest_Name + '....')\n",
 63 |     "p = subprocess.run(['python','gdc-rnaseq-tool.py',Manifest_Name,'--hugo'],stdout=subprocess.PIPE)\n",
 64 |     "print('Finished Running')"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "## Step 3: Download Merged Data"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# Read in Data Frame that was created\n",
 81 |     "File_Location = p.stdout.decode('UTF-8').split('\\n')[1].split(': ')[1]\n",
 82 |     "\n",
 83 |     "Files = []\n",
 84 |     "for line in p.stdout.decode('UTF-8').split('\\n'):\n",
 85 |     "    if 'Creating merged' in line:\n",
 86 |     "        Files.append(line.split(' ')[7])\n",
 87 |     "\n",
 88 |     "def create_download_link( df, title , filename ): \n",
 89 |     "    csv = df.to_csv(sep='\\t')\n",
 90 |     "    b64 = base64.b64encode(csv.encode())\n",
 91 |     "    payload = b64.decode()\n",
 92 |     "    html = '<a download=\"{filename}\" href=\"data:text/tsv;base64,{payload}\" target=\"_blank\">{title}</a>'\n",
 93 |     "    html = html.format(payload=payload,title=title,filename=filename)\n",
 94 |     "    return HTML(html)\n",
 95 |     "\n",
 96 |     "for file in Files:\n",
 97 |     "    df = pd.read_csv(File_Location + file,sep='\\t')\n",
 98 |     "    display(create_download_link(df, file, file))"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": []
107 |   }
108 |  ],
109 |  "metadata": {
110 |   "kernelspec": {
111 |    "display_name": "Python 3",
112 |    "language": "python",
113 |    "name": "python3"
114 |   },
115 |   "language_info": {
116 |    "codemirror_mode": {
117 |     "name": "ipython",
118 |     "version": 3
119 |    },
120 |    "file_extension": ".py",
121 |    "mimetype": "text/x-python",
122 |    "name": "python",
123 |    "nbconvert_exporter": "python",
124 |    "pygments_lexer": "ipython3",
125 |    "version": "3.7.3"
126 |   }
127 |  },
128 |  "nbformat": 4,
129 |  "nbformat_minor": 2
130 | }
131 | 


--------------------------------------------------------------------------------
/Images/TCGA Barcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cpreid2/gdc-rnaseq-tool/085aeb532bb0dd7b2b84ecb9cd960f2c491e5d6d/Images/TCGA Barcode.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # gdc-rnaseq-tool
Tool to download / merge individual RNASeq files from the [GDC Portal](https://portal.gdc.cancer.gov) into matrices identified by [TCGA barcode](https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode).
 3 | 
 4 | ![Image](https://raw.githubusercontent.com/cpreid2/gdc-rnaseq-tool/master/Images/TCGA%20Barcode.png)
 5 | 
 6 | __Description__:
 7 | 
 8 | The `gdc-rnaseq-tool` performs the following:
 9 | 
10 | 1. Downloads RNA-Seq / miRNA-Seq data files using a GDC manifest file
11 | 2. Unzips the files into separate folders identified by experimental strategy and bioinformatics workflow
12 | 3. Merges the files into separate matrix files identified in the table below
13 | 
14 | *The script will ignore any files in the manifest file that are not [Transcriptome Profiling files](https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Transcriptome%20Profiling%22%5D%7D%7D%5D%7D) generated from the GDC RNA-Seq / miRNA-Seq bioinformatics pipelines located on the [GDC Main Portal](https://portal.gdc.cancer.gov):*
15 | 
16 | [RNA-Seq / miRNA-Seq Files](https://portal.gdc.cancer.gov/repository?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Transcriptome%20Profiling%22%5D%7D%7D%5D%7D)
17 | 
18 | [RNA-Seq Bioinformatics Pipeline Documentation](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/)
19 | 
20 | [miRNA-Seq Bioinformatics Pipeline Documentation](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/miRNA_Pipeline/)
21 | 
22 | __Inputs and Outputs__:
23 | 
24 | | I/O | File |
25 | |---|---|
26 | | Input | [GDC Manifest File](https://docs.gdc.cancer.gov/Data_Transfer_Tool/Users_Guide/Preparing_for_Data_Download_and_Upload/#obtaining-a-manifest-file-for-data-download) |
| Output | Merged_Counts.tsv ([HTSeq - Counts](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/)) |
28 | |  | Merged_FPKM.tsv ([HTSeq - FPKM](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#fpkm)) |
29 | |  | Merged_FPKM-UQ.tsv ([HTSeq - FPKM-UQ](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#upper-quartile-fpkm)) |
30 | |  | Merged_miRNA_Counts.tsv |
31 | |  | Merged_miRNA_rpmm.tsv |
32 | 
33 | 
34 | __Requirements__:
35 | 
36 | - Python 3+
37 | - pandas ( https://pandas.pydata.org/pandas-docs/stable/install.html ): `pip3 install pandas`
38 | - requests (http://python-requests.org): `pip3 install requests`
39 | 
40 | __Quick Start__:
41 | 
42 | 1. [Download](https://github.com/cpreid2/gdc-rnaseq-tool/releases/download/1.0/gdc-rnaseq-tool.py) `gdc-rnaseq-tool.py` python script
43 | 2. Download manifest containing RNA/miRNA expression files from https://portal.gdc.cancer.gov/
44 | 3. `python3 gdc-rnaseq-tool.py <manifest_file>`
45 | 
46 | Optional: Add `--hugo` to the command to include the HUGO gene symbol as a separate column.  
47 | 
48 | `python3 gdc-rnaseq-tool.py <manifest_file> --hugo`
49 | 
50 | ---
51 | 
The GDC RNASeq tool produces matrices of merged RNA/miRNA expression data given a manifest file.
53 | 
54 | Usage: `python3 gdc-rnaseq-tool.py <manifest_file>`
55 | 
56 | Notes:
57 | * A test manifest is provided for troubleshooting:  `python3 gdc-rnaseq-tool.py Test_Manifest.txt`
* Files are downloaded to a new `Merged_RNASeq_<timestamp>` folder that is created in the same folder as the manifest file that was provided
59 | 
60 | **Release Notes:**
61 | 
62 | Version 1.0: Feb 8, 2018
63 | 
64 | * Initial release
65 | 
66 | Known Issues:
67 | N/A
68 | 


--------------------------------------------------------------------------------
/Test_Manifest.txt:
--------------------------------------------------------------------------------
 1 | id	filename	md5	size	state
 2 | a27546f4-0850-4b84-9af9-89c760474037	14a5b723-8762-4bcd-82bf-5cb50daf80bc.htseq.counts.gz	eeab2a853fe0029d3dbe73e475494ff9	245378	live
 3 | d72f41c9-f94b-4880-8fd7-394828df6e6b	9361d2a0-761f-42a3-b672-5c6c0a964fd6.mirbase21.isoforms.quantification.txt	94512564c095f5701ce6f6fc6f00bfd2	373138	live
 4 | a2b0e1cf-859a-4ee8-8275-6b01b81d8355	1f495645-b2ce-4726-a3fb-4fdd898e423a.htseq.counts.gz	7ca66098ca549454858900f59d2eee48	251678	live
 5 | bcfdcdac-c2e5-448f-a2f4-38af70c9bca4	7794958e-853e-4599-a30c-1eee64e28739.FPKM-UQ.txt.gz	d7c6057daa0304b26c41de1afc005752	468986	live
 6 | c6221cb9-94e3-466f-b6d3-c862db204e1d	ce0e7450-7b99-42e7-bb62-4077b53253e2.mirbase21.mirnas.quantification.txt	654f9dc754e42eb2773c852f5007bf5d	50338	live
 7 | ca3dab87-6bd9-405e-8810-34c5d095a2a2	ce58ee65-6131-496e-b29a-b5f97a26eb9d.htseq.counts.gz	2d708a10efad30e593fa5bad9cd7c7ef	250853	live
 8 | 79d488fe-c1ab-4935-935c-ae80d726bad9	5d418e71-4e7d-4111-8aad-4ea782fd87b6.htseq.counts.gz	d7f0282068f0b78f0e77068c86cde07d	248105	live
 9 | 96173454-3d93-4569-bdf4-2fae6e091a84	4a6db870-fe49-4e39-822a-f513e05d3f61.mirbase21.isoforms.quantification.txt	ec0b04ce0953f2822cd6040678a8d638	319096	live
10 | de09e9ab-56c6-4ade-ba33-26ed46a2bce2	78e5d21b-87ac-4557-8089-77d532039762.mirbase21.isoforms.quantification.txt	66e38557517c8c1ee91a3f9cc6d940bb	434178	live
11 | 1c923966-fac1-4571-b310-11295d14fcdc	7f75084d-fa0a-4a63-9bb0-9a0917fc149e.htseq.counts.gz	21daa282621d83a5884873812e704446	240470	live
12 | 2841c244-29b2-46c3-8f1d-c96ceda9d52a	79d9c719-ff0d-4574-8b43-db11bb355c0d.FPKM-UQ.txt.gz	a548234817bc9d8ab64eefaa627505fd	467863	live
13 | 93fee0ad-4995-43b5-9b3d-4e51e529fc4e	87c98a63-24f5-49c7-b452-1d7185fb16a9.htseq.counts.gz	e04e110b985308dd9c75083721e63d4e	248201	live
14 | 4a2ae4f2-1525-46c9-bc29-58fa40b4e905	ef4efb01-b06d-4e06-a5a8-74191a2a4779.mirbase21.mirnas.quantification.txt	fbdf1345d72141d0080fb07b0124ed6a	50518	live
15 | 7eb56a73-88fa-411e-bd2e-aed4a1dc6a64	659581b9-cd77-4f3b-8fc9-46c5c806888d.mirbase21.isoforms.quantification.txt	5beaf3f493b2e7e37f3649579a311990	358750	live
16 | 67fa2520-5cd9-4c27-8e3e-2d47e69e6dd5	415cbd80-d98d-45c3-beda-e00cd585c2a4.mirbase21.mirnas.quantification.txt	6700dd5df811e5c5f60df8071985dff2	50527	live
17 | 664bf371-a42b-4563-8e2c-b49864bb4f0f	3d5ba239-49aa-4958-a873-06277c7f3999.mirbase21.isoforms.quantification.txt	7818742fc83d2da8f5d1c3fec413922e	384451	live
18 | 9ddc9b46-b158-44b0-9e32-253b1d1fa85c	7bd62b90-1973-491b-8c70-b0f20deaecfd.htseq.counts.gz	5332e483862fd9b63b58060143335d37	244374	live
19 | 731e3393-2409-4f74-aa23-93c56aeccfa3	3e9b3575-a067-4cb2-adf1-13a98c08f6ca.htseq.counts.gz	7ee8dac667363004ee5d5538a2c031a0	243728	live
20 | c0d965cb-072f-4f7e-9385-6441aa8af1ca	82d61f67-f56e-4855-aa2e-44350982f8af.FPKM-UQ.txt.gz	4a796a1885bee1128cb6d51ae71d9393	484825	live
21 | 2e783233-f860-4c57-9ee4-a69e45a1f092	986291b6-5c83-4862-852d-4c591927a986.mirbase21.isoforms.quantification.txt	0bfa74b13c02e08d890c97b971f724f8	299608	live
22 | c3e730ea-5dba-49cb-ad2f-d8e5c640c25d	cc16a3ce-4569-47c1-8155-1f6d1e64f999.FPKM-UQ.txt.gz	766d665fdb25bf9078312d0d26acc674	451371	live
23 | daca9eac-4696-4e5f-bcc9-757e485d7073	8b14a5e1-cccc-4596-a367-bbd76e66bdde.mirbase21.isoforms.quantification.txt	d5bfe8ae396be19d581eabe75a5963b9	423648	live
24 | 313e9ed7-0684-4711-8dcc-dbb36a64b9a8	0eb17745-aa52-4097-8368-119e27a052cd.mirbase21.isoforms.quantification.txt	f5d4e85054a2210d833096ae692fb03a	450157	live
25 | 70b6ce0e-b01b-4e1a-af27-5a0d24bfe11d	c668e123-05fe-4983-862d-10b413b4138d.mirbase21.mirnas.quantification.txt	163628c8acabf1d04576373cb7b1c98d	50262	live
26 | 260ca241-b3ca-4d80-9530-3bd27cc86db8	6b0df792-5601-4d5d-8844-4a66391dbcaf.FPKM.txt.gz	4a66e935cb542f3ddfc8cab7d2557e8b	504086	live
27 | 056a99cb-53ab-4146-9277-33caeb9d1a02	25bfcac0-ba7b-41cb-8343-34bb425857aa.FPKM.txt.gz	90e64d1ba9171ca3a18baa3ed1db91c6	498889	live
28 | 0a63413a-4839-4adf-b1c1-52166fe02d80	57817aa5-3bb6-47e7-a344-2857668dc485.FPKM-UQ.txt.gz	9477d36020fed556ba25dcc3bb2c52f7	491429	live
29 | 29426006-e5c3-4b10-a649-b9dda2dac9ba	21dcd167-85e9-46a7-94b2-18ac56852e15.FPKM.txt.gz	a0f51b81c5d403a7f64d45325b998a5c	486525	live
30 | 5ddc5a1b-1cc3-4afc-addb-47ff357873ee	d5d1ce0c-e908-4890-87d0-0a7b3b1bbb00.htseq.counts.gz	5f91db9440f711fab895d6266e4550b0	239099	live
31 | a85657aa-9ea4-4c8e-aac8-9258cdcc7fc7	fd181f89-6985-469d-93a9-5c108a81f8df.mirbase21.mirnas.quantification.txt	67d158ecd63462ee3c7f63dc84d44ce8	50343	live
32 | 9b54a57d-e0b5-4dc0-bc14-cf8d4be91bf2	55cc5090-826a-4e3b-bdfc-d4db8b69071f.FPKM-UQ.txt.gz	7ef24d19014825c3dd050de107f7e2e9	483165	live
33 | 7f8ccd20-fec1-4b49-b266-9b2057a12e35	fd181f89-6985-469d-93a9-5c108a81f8df.mirbase21.isoforms.quantification.txt	1e081d00e9bc07d61eaadff087056ce2	328729	live
34 | a8b3929e-a417-425a-8601-60c625702b2b	4d76c001-f594-45e4-ae99-3f9a97707485.mirbase21.mirnas.quantification.txt	56832362c899722a2450f4620d2b7c29	50407	live
35 | ae2b5eb3-721a-496b-83f0-9c079f26ed91	52c5b59a-b6e5-46e8-bef4-df855b908f55.mirbase21.isoforms.quantification.txt	4c68942f1fd11082f68a5d1564a2a1f2	415792	live
36 | 677d7e53-20ca-4af9-a6cc-a0db302882f6	fc4e8ec0-39a2-490d-8f37-3d9c44e40675.mirbase21.isoforms.quantification.txt	6d95417a95e9028c080653c4a99f1fe2	339941	live
37 | f2c0e755-05e9-4922-b02e-657b482eae7e	89e5198e-8a80-4cdc-a457-17368563bdeb.FPKM-UQ.txt.gz	e3569554171b6fadc9aac5382dd331d4	512024	live
38 | 704fcff5-5a13-4ace-96a2-5fff3202e5d6	986291b6-5c83-4862-852d-4c591927a986.mirbase21.mirnas.quantification.txt	f373917d4d3fd83db1ce4cb4a628a806	50249	live
39 | 4b2eaf7a-ab80-4d9c-957f-d8ad1f0abf79	c415aef2-a409-4950-9374-3e2fec218c28.htseq.counts.gz	c64698d643f8780786c1c08602da28a3	244452	live
40 | faea4eb7-738f-49fb-ad7a-891b2fbbe041	8196eb2a-ebc9-40ae-ac8c-0ff365a17e50.mirbase21.mirnas.quantification.txt	dc4bc95101bbac389ae136903a8268e4	50280	live
41 | 5f3dd1e6-163c-4c26-95b1-76ad2f15d5ba	f1fddf0c-babe-4c67-bd3b-5633e17086a4.FPKM-UQ.txt.gz	2389bd06439826ef4ea8bab4fd2cb5d1	487462	live
42 | 79b9dfde-1346-4686-8bd3-801b7ba97103	8ba8fe41-cc4b-4bbc-974e-6b75a3ac3be6.FPKM-UQ.txt.gz	fb2a2adc96c58ff1b25810acf76daf14	504102	live
43 | 82bbf244-5cf2-4c7e-81e5-4e3b2055cb97	47416a55-dadc-4cfa-9115-670abd7690b5.mirbase21.mirnas.quantification.txt	e2956c61c8f1b02b53069c4ff1d5380d	50438	live
44 | 36e03a23-09b7-4ac1-9226-f7664ae630be	08608b79-aa23-4993-a295-7a6a71b807e9.htseq.counts.gz	27df09349dd9d7f364dc1ae292673c7f	241397	live
45 | 88dea443-cbab-4941-a44c-9bc84f2e6b41	9361d2a0-761f-42a3-b672-5c6c0a964fd6.mirbase21.mirnas.quantification.txt	78837c65db25894c9dad32e64e376753	50337	live
46 | d8425e1d-e91d-4e38-86c1-ea295378cb46	4d1c7dee-2663-4b64-8d5d-685c5c42b2c8.mirbase21.mirnas.quantification.txt	305acc2b154ed76ee57c4cd8228c99f3	50498	live
47 | abb729f5-5bc5-4b43-84ed-10aa3fc5170a	071c48ec-7410-42e7-af93-1575152e8e48.htseq.counts.gz	d175914e3eb03e4c10901a867b014082	248942	live
48 | 9a304e4a-d833-4c5f-8312-691753943b82	7d1546b5-f408-4d76-9621-88f49b4310e4.FPKM.txt.gz	bf23c77d5d1df5361b545effc9a073d3	493308	live
49 | b70ba737-dad3-4d10-a494-4ae72ca89a71	2e22ed16-230b-4695-a634-dcdbb6b9b2c6.FPKM-UQ.txt.gz	6468049503cf41cacf4a5a87dd7ef515	468098	live
50 | c9fa73ae-192e-4a8f-ae98-41bf52ce9aa7	835ae143-fbbd-426b-b072-6ed8ca004174.htseq.counts.gz	afec9753f45289cda12a366507eb25fa	250638	live
51 | f2b9f9ef-927e-443b-bfee-335beb92ffd8	1b27c21f-792f-4691-a7c9-7b2eccba9425.mirbase21.isoforms.quantification.txt	bebde2c28d74506798497bb2fed54d29	295064	live
52 | cf1ea66d-7817-4eca-931f-46aae260e394	3aeafc0b-b413-4dff-b633-6faef3a32bc3.FPKM-UQ.txt.gz	44c0bbb7c94f49da67432a13ca10fe15	512022	live
53 | 1f3dd1d2-5efb-4b31-99bb-264c8c0092f8	0a2b79c6-657b-46ec-b1b5-fa46f1ff9d6b.mirbase21.mirnas.quantification.txt	2df0bd8b8e240a46bbf278f198d5da50	50352	live
54 | 224aa26c-c313-4ad3-a645-75c367801f18	e586c24f-6627-4c1d-abcd-e97701612b40.mirbase21.mirnas.quantification.txt	ac4f523579ba1941a4659919ec9730de	50404	live
55 | 409d0cb5-b494-4443-a1fe-d76b283212dc	5d418e71-4e7d-4111-8aad-4ea782fd87b6.FPKM-UQ.txt.gz	96f1063b37e4527615a94787c0dee726	495106	live
56 | b6e0804b-8f19-4615-ba89-a8e1ce1d2d2a	b8217881-f8a2-4f50-a2bd-4f641b3a23d8.mirbase21.isoforms.quantification.txt	59af979ef22ddac5208d8e4769b1e86c	354398	live


--------------------------------------------------------------------------------
/gdc-rnaseq-tool.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import json
  3 | import urllib
  4 | import pandas as pd
  5 | import sys
  6 | import hashlib
  7 | import argparse
  8 | import os, fnmatch, gzip, shutil, tarfile
  9 | from pathlib import Path
 10 | import time
 11 | 
 12 | ## -------------- JSON Filters constructor :
 13 | class Filter(object):
 14 | 
 15 |     def __init__(self):
 16 |         self.filter = {"op": "and","content": []}
 17 | 
 18 |     def add_filter(self, Field, Value, Operator):
 19 |         self.filter['content'].append({"op":Operator,"content":{"field":Field,"value":Value}})
 20 | 
 21 |     def create_filter(self):
 22 |         self.final_filter = json.dumps(self.filter,separators=(',',':'))
 23 |         return self.final_filter
 24 | 
 25 | ## -------------- Function for downloading files :
 26 | def download(uuid, name, md5, ES, WF, DT, retry=0):
 27 |     try :
 28 |         fout = OFILE['data'].format(ES=ES, WF=WF, DT=DT, uuid=uuid, name=name)
 29 |         def md5_ok() :
 30 |             with open(fout, 'rb') as f :
 31 |                 return (md5 == hashlib.md5(f.read()).hexdigest())
 32 | 
 33 |         print("Downloading (attempt {}): {}".format(retry, uuid))
 34 |         url = PARAM['url-data'].format(uuid=uuid)
 35 | 
 36 |         with urllib.request.urlopen(url) as response :
 37 |             data = response.read()
 38 | 
 39 |         os.makedirs(os.path.dirname(fout), exist_ok=True)
 40 | 
 41 |         with open(fout, 'wb') as f :
 42 |             f.write(data)
 43 | 
 44 |         if md5_ok():
 45 |             return (uuid, retry, md5_ok())
 46 |         else:
 47 |             os.remove(fout)
 48 |             raise ValueError('MD5 Sum Error on ' + uuid)
 49 |     except Exception as e :
 50 |         print("Error (attempt {}): {}".format(retry, e))
 51 |         if (retry >= PARAM['max retry']) :
 52 |             raise e
 53 |         return download(uuid, name, md5, ES, WF, DT, retry + 1)
 54 | 
 55 | ## -------------- Function for reading manifest file :
 56 | def read_manifest(manifest_loc):
 57 |     uuid_list = []
 58 |     with open(manifest_loc,'r') as myfile:
 59 |         if myfile.readline()[0:2] != 'id': raise ValueError('Bad Manifest File')
 60 |         else:
 61 |         	for x in myfile:
 62 |         		uuid = x.split('\t')[0]
 63 |         		uuid_list.append(uuid)
 64 |     return uuid_list
 65 | 
 66 | ## -------------- Function that unpacks gz files into another directory :
 67 | def gunzip(file_path,output_path):
 68 |     with gzip.open(file_path,"rb") as f_in, open(output_path,"wb") as f_out:
 69 |         shutil.copyfileobj(f_in, f_out)
 70 | 
 71 | ## -------------- Argument Parser Function :
 72 | def arg_parse():
 73 |     parser = argparse.ArgumentParser(
 74 | 		description='----GDC RNA Seq File Merging Tool v0.1----',
 75 | 		usage= 'python3 gdc-rnaseq-tool.py MANIFEST_FILE')
 76 |     parser.add_argument('manifest_file', action="store",help='Path to manifest file (or UUID List with -u)')
 77 |     parser.add_argument('-g','--hugo', action="store_true",help='Add Hugo Symbol Name')
 78 |     args = parser.parse_args()
 79 |     return args
 80 | 
 81 | ## -------------- Errors when passing incorrect name :
 82 | def error_parse(code):
 83 | 	'''
 84 | 	Generates the error messages
 85 | 	'''
 86 | 	error = {
 87 | 		"bad_mani":"Input must be valid GDC Manifest. " \
 88 | 		"\n\tGo to https://portal.gdc.cancer.gov/ to download a manifest",
 89 | 	}
 90 | 	print("ERROR : " + error[code])
 91 | 	sys.exit(2)
 92 | 
 93 | ## -------------- Main function :
 94 | def main(args):
 95 |     global manifest_file
 96 |     global hugo
 97 |     manifest_file = args.manifest_file
 98 |     hugo = args.hugo
 99 | 
# 0. Run Program
# -------------------------------------------------------
# Parse the command line; main() stores the result in the globals
# ``manifest_file`` and ``hugo`` that the rest of the script reads.
main(arg_parse())

# Timestamp that gives every run its own output folder
timestr = time.strftime("%Y%m%d-%H%M%S")

# 1. Read in manifest and location of folder
# -------------------------------------------------------
File = manifest_file
Manifest_Loc = str(File).strip()
if not os.path.exists(Manifest_Loc):
    # Fall back to the historical clean-up that drops every backslash
    # (e.g. a pasted, shell-escaped path). It is only applied when the path
    # does not exist as given, so real paths containing backslashes still work.
    Manifest_Loc = Manifest_Loc.replace('\\', '')

# Output folder next to the manifest. It is derived from the same cleaned path
# that is opened below, so both always refer to the same directory.
# The trailing '/' is part of the printed value.
Location = str(Path(Manifest_Loc).parents[0]) + '/Merged_RNASeq_' + timestr + '/'

os.makedirs(Location)

print('Reading Manifest File from: ' + Manifest_Loc)
print('Downloading Files to: ' + Location)

UUIDs = read_manifest(Manifest_Loc)
120 | 
# 2. Get info about files in manifest
# -------------------------------------------------------
# Restrict the query to the manifest's files that were produced by one of the
# supported RNA-Seq / miRNA-Seq workflows.
File_Filter = Filter()
File_Filter.add_filter("files.file_id",UUIDs,"in")
File_Filter.add_filter("files.analysis.workflow_type",["HTSeq - Counts","HTSeq - FPKM","HTSeq - FPKM-UQ","BCGSC miRNA Profiling"],"in")

EndPoint = 'files'
Fields = 'cases.samples.portions.analytes.aliquots.submitter_id,file_name,cases.samples.sample_type,file_id,md5sum,experimental_strategy,analysis.workflow_type,data_type'
# Ask for at least as many hits as there are manifest entries so that
# manifests longer than the previous fixed limit of 10000 are not cut short.
Size = str(max(10000, len(UUIDs)))

Payload = {'filters':File_Filter.create_filter(),
           'format':'json',
           'fields':Fields,
           'size':Size}
r = requests.post('https://api.gdc.cancer.gov/' + EndPoint, json=Payload)
# Stop with a clear HTTP error instead of failing later on a missing key.
r.raise_for_status()
data = r.json()
file_list = data['data']['hits']

# Dictionary:        file UUID -> metadata needed to download and sort the file
# TCGA_Barcode_Dict: file name -> one-element set holding the aliquot barcode
Dictionary = {}
TCGA_Barcode_Dict = {}
for file in file_list:
    UUID = file['file_id']
    # Only the first case/sample/portion/analyte/aliquot of each hit is used.
    Barcode = file['cases'][0]['samples'][0]['portions'][0]['analytes'][0]['aliquots'][0]['submitter_id']
    File_Name = file['file_name']

    Dictionary[UUID] = {'File Name': File_Name,
    'TCGA Barcode':Barcode,
    'MD5': file['md5sum'],
    'Sample Type': file['cases'][0]['samples'][0]['sample_type'],
    'Experimental Strategy': file['experimental_strategy'],
    'Workflow Type': file['analysis']['workflow_type'],
    'Data Type': file['data_type']}

    # Kept as a set: the merge steps read it back with list(...)[0].
    TCGA_Barcode_Dict[File_Name] = {Barcode}
156 | 
# 3. Download files
# -------------------------------------------------------

# Path template of every downloaded file, rooted at this run's output folder
OFILE = dict(data=Location + "{ES}/{WF}/{DT}/{uuid}/{name}")

# Settings read by download()
PARAM = {
    'url-data': "https://api.gdc.cancer.gov/data/{uuid}",  # download URL template
    'max retry': 10,                                        # persistence upon error
}

# Fetch each file described by the metadata query, one after another.
for file_uuid, file_info in Dictionary.items():
    download(
        file_uuid,
        file_info['File Name'],
        file_info['MD5'],
        file_info['Experimental Strategy'],
        file_info['Workflow Type'],
        file_info['Data Type'],
    )
179 | 
# 4. Merge the RNA Seq files
# -------------------------------------------------------

RNASeq_WFs = ['HTSeq - Counts', 'HTSeq - FPKM-UQ','HTSeq - FPKM']

# One download folder per workflow (see the OFILE path template)
GZipLocs = [Location + 'RNA-Seq/' + WF for WF in RNASeq_WFs]

# Add Hugo Symbol
if hugo:
    url = 'https://github.com/cpreid2/gdc-rnaseq-tool/raw/master/Gene_Annotation/gencode.v22.genes.txt'
    gene_map = pd.read_csv(url,sep='\t')
    gene_map = gene_map[['gene_id','gene_name']]
    gene_map = gene_map.set_index('gene_id')

for WF, GZipLoc in zip(RNASeq_WFs, GZipLocs):

    print('--------------')

    # No file of this workflow was downloaded: nothing to unzip or merge,
    # and no empty folders should be left behind.
    if not os.path.isdir(GZipLoc):
        continue

    # Start from an empty folder for the unzipped copies
    UnzippedLoc = os.path.join(GZipLoc, 'UnzippedFiles')
    if os.path.exists(UnzippedLoc):
        shutil.rmtree(UnzippedLoc)
    os.makedirs(UnzippedLoc)

    # Unzip every .gz file, remembering the original file name because that
    # name is the key of TCGA_Barcode_Dict.
    Files = []
    for root, dirs, files in os.walk(GZipLoc):
        for filename in fnmatch.filter(files, '*.gz'):
            NewFilePath = os.path.join(UnzippedLoc, filename[:-len('.gz')] + '.tsv')
            gunzip(os.path.join(root, filename), NewFilePath)
            Files.append((NewFilePath, filename))

    # One column per sample, keyed by barcode. Each column keeps its gene ids
    # as index so that pandas lines the rows up by gene id when the columns
    # are combined, rather than by position in the file.
    Matrix = {}
    for NewFilePath, gz_name in Files:
        Name = str(list(TCGA_Barcode_Dict[gz_name])[0])
        Counts_DataFrame = pd.read_csv(NewFilePath,sep='\t',header=None,names=['GeneId', Name])
        Matrix[Name] = Counts_DataFrame.set_index('GeneId')[Name]

    # Merge Matrices to dataframes and write to files
    if len(Matrix) > 0:
        Merged_File_Name = 'Merged_'+ WF.replace('HTSeq - ','') + '.tsv'
        print('Creating merged ' + WF + ' File... ' + '( ' + Merged_File_Name + ' )')
        # rename_axis(None) keeps the first header cell of the output empty.
        Counts_Final_Df = pd.DataFrame(Matrix).rename_axis(None)
        if hugo:
            Counts_Final_Df = gene_map.merge(Counts_Final_Df, how='outer', left_index=True, right_index=True)
        Counts_Final_Df.to_csv(os.path.join(Location, Merged_File_Name),sep='\t',index=True)
236 | 
# 5. Merge the miRNA Seq files
# -------------------------------------------------------
miRNASeq_WF = ['BCGSC miRNA Profiling']
miRNASeq_DTs = ['Isoform Expression Quantification','miRNA Expression Quantification']
miRNALocs = [Location + 'miRNA-Seq/BCGSC miRNA Profiling/' + DT for DT in miRNASeq_DTs]

print('--------------')

for miRNALoc in miRNALocs:

    # Collect the miRNA quantification tables below this folder. These files
    # are read as plain text; nothing is unzipped here.
    # NOTE: the pattern matches only "*.mirnas.quantification.txt", so
    # "*.isoforms.quantification.txt" files are downloaded but not merged.
    pattern = '*.mirnas.quantification.txt'
    Files = []
    for root, dirs, files in os.walk(miRNALoc):
        for filename in fnmatch.filter(files, pattern):
            Files.append(os.path.join(root, filename))

    # One column per sample, keyed by barcode. Each column keeps its miRNA ids
    # as index so that pandas lines the rows up by miRNA id when the columns
    # are combined, rather than by position in the file.
    miRNA_count_Matrix = {}
    miRNA_rpmm_Matrix = {}

    for file in Files:
        Name = str(list(TCGA_Barcode_Dict[Path(file).name])[0])

        miRNA_DataFrame = pd.read_csv(file,sep='\t').set_index('miRNA_ID')

        miRNA_count_Matrix[Name] = miRNA_DataFrame['read_count']
        miRNA_rpmm_Matrix[Name] = miRNA_DataFrame['reads_per_million_miRNA_mapped']

    # rename_axis(None) keeps the first header cell of each output empty.
    if len(miRNA_count_Matrix) > 0:
        print('Creating merged miRNASeq Counts File... ( Merged_miRNA_Counts.tsv )')
        miRNA_Count_Final_Df = pd.DataFrame(miRNA_count_Matrix).rename_axis(None)
        miRNA_Count_Final_Df.to_csv(os.path.join(Location, 'Merged_miRNA_Counts.tsv'),sep='\t',index=True)
    if len(miRNA_rpmm_Matrix) > 0:
        print('Creating merged miRNASeq rpmm File... ( Merged_miRNA_rpmm.tsv )')
        miRNA_rpmm_Final_Df = pd.DataFrame(miRNA_rpmm_Matrix).rename_axis(None)
        miRNA_rpmm_Final_Df.to_csv(os.path.join(Location, 'Merged_miRNA_rpmm.tsv'),sep='\t',index=True)
285 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas
requests
ipyupload
3 | 


--------------------------------------------------------------------------------