├── CODE_OF_CONDUCT.md ├── LICENSE ├── NVIDIA Clara Parabricks on Azure ├── NVIDIA_Clara_Parabricks_on_Azure.ipynb └── README.md ├── README.md ├── SECURITY.md ├── customdsvmbioconductor └── README.md ├── docs ├── Genomics_Data_Lake_Azure_Storage_Explorer.pdf ├── azurenotebooks.JPG ├── azurenotebooks_2_LI.jpg ├── azurenotebooks_3_LI.jpg ├── bioc_arch_1.JPG ├── bioc_arch_2.JPG ├── clinvar_chr1_sample_mini.txt ├── create-workspace.gif ├── fhir_long_read_1.JPG ├── genomics_notebook_codespaces.mp4 ├── image328.png ├── initial.md ├── utils.py └── video.JPG ├── fhirgenomics ├── 1-data-export.ipynb ├── 2-clustering.ipynb ├── 3-pharmacogenomics-confidential.ipynb ├── 3-simple-breast-cancer-module.json └── intro.md ├── genomics-data-science-vm └── README.md ├── sample-notebooks ├── 1000-genomes_Azure_Genomics_Data_Lake.ipynb ├── AzureNotebooks-azure-storage-genomics-giab.ipynb ├── Bioconductor.ipynb ├── SnpEff.ipynb ├── fhir-vcf-clustering.ipynb ├── fhir_long_read.ipynb ├── genomics-clinvar.ipynb ├── genomics-encode.ipynb ├── genomics-gatk-resource-bundle.ipynb ├── genomics-opencravat.ipynb ├── genomics-platinum-genomes.ipynb ├── genomics-prereqs.ipynb ├── genomics-reference-genomes.ipynb ├── genomics.ipynb ├── genomicsML.ipynb ├── graphragforgenomics.ipynb ├── igv_jupyter_extension_sample.ipynb ├── initial-notebook.md ├── radiogenomics.ipynb └── simtotable.ipynb └── vcf2parquet-conversion ├── 1000genomes ├── README.md ├── sql │ ├── sampleQueriesFlattened.sql │ ├── sampleQueriesNested.sql │ └── setup.sql └── vcf2parquet-1000genomes.ipynb ├── README.md ├── gnomad ├── README.md ├── sql │ └── sampleQueries.sql └── vcf2parquet-gnomad.ipynb └── vcf2parquet-walkthrough.ipynb /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /NVIDIA Clara Parabricks on Azure/NVIDIA_Clara_Parabricks_on_Azure.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e1c6b305", 6 | "metadata": {}, 7 | "source": [ 8 | "# NVIDIA Clara Parabricks on Microsoft Azure " 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "b8d187d0", 14 | "metadata": {}, 15 | "source": [ 16 | "NVIDIA introduced the Clara Parabricks software suite for performing analysis of NGS DNA and RNA data. It delivers results at blazing fast speeds and low cost. Clara Parabricks can analyze 30x WGS data in under 25 minutes on a single 8-GPU server, instead of 30 hours for traditional CPU-based methods. Its output matches commonly used software, making it simple to verify the accuracy of the results.\n", 17 | "\n", 18 | "Clara Parabricks software provides at least an order of magnitude acceleration in compute time while generating identical outputs and reducing analysis costs. Clara Parabricks is available free on NVIDIA GPU Cloud (NGC) and can be easily deployed on Azure GPU based virtual machines (VM).\n", 19 | "\n", 20 | "Clara Parabricks provides optimal performance for multiple Microsoft Azure instance types and can be used out of the box for essential bioinformatics needs. Currently, the Clara Parabricks accelerated analysis tools start from FASTQ files and perform alignment through variant calling and expression analysis, including QC tools for both types of outputs. The suite of tools can be used to support end-to-end workflows for germline, somatic and RNA-Seq pipelines, providing the flexibility to meet the individual needs of most projects. The tools can also be used individually, as drop-in replacements for steps in existing workflows.\n", 21 | "\n", 22 | "You can learn more from this [link](https://www.nvidia.com/en-us/clara/genomics)\n", 23 | "\n", 24 | "\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "0a8d8c86-63a2-4d32-9d47-94ba5feca6a9", 30 | "metadata": {}, 31 | "source": [ 32 | "## The pre-requisites for running Parabricks 4.0 on Microsoft Azure\n", 33 | "\n", 34 | "- An Azure subscription with Compute-VM (cores-vCPUs) quota allowing to create GPU based VMs (preferably NCas_T4_v3 and ND96asr_A100_v4)\n", 35 | "- An NVIDIA driver greater than version 465.32.*\n", 36 | "- Any Linux Operating System that supports nvidia-docker2 Docker version 20.10 (or higher)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "925f3733-f1dc-4d97-b6b3-81657a7d187e", 42 | "metadata": {}, 43 | "source": [ 44 | "To make sure you have **nvidia-docker2** installed, run this command:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "292aabf6", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "!docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "6a2261d9-f066-4871-bb46-d5d77a569f68", 60 | "metadata": {}, 61 | "source": [ 62 | "When it finishes downloading the container, it will run the nvidia-smi command and show you the same output as above. 
The Clara Parabricks Docker image can be obtained from NGC by running the following command (please check https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara/containers/clara-parabricks for the latest version):" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "55cec0c5-8c65-464f-b9a5-68ad5f7e08b8", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "!docker pull nvcr.io/nvidia/clara/clara-parabricks:4.0.0-1" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "05d39fca", 78 | "metadata": {}, 79 | "source": [ 80 | "## Sample Run- 'fq2bam' pipeline with Clara Parabricks" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "b6944461-48a5-4a88-8c2d-f58ed7d39b0d", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "! docker run \\\n", 91 | "\t--gpus all \\\n", 92 | "\t--rm \\\n", 93 | "\t--volume /host/data:/input_data \\\n", 94 | "\t--volume /host/results:/outputdir \\\n", 95 | "\t--workdir /image/input_data \\\n", 96 | " nvcr.io/nvidia/clara/clara-parabricks:4.0.0-1 \\\n", 97 | "\tpbrun fq2bam \\\n", 98 | "\t--ref /input_data/Homo_sapiens_assembly38.fasta \\\n", 99 | "\t--in-fq /input_data/fastq1.gz /input_data/fastq2.gz \\\n", 100 | "\t--out-bam /image/outputdir/fq2bam_output.bam" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "2b35c098", 106 | "metadata": {}, 107 | "source": [ 108 | "### Download reference file" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "1656a223", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "! wget -O parabricks_sample.tar.gz https://datasettoaexample.blob.core.windows.net/publicsample/parabricks_sample.tar.gz" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "57112760-18e7-4e1e-8dbb-679bf79bc1ef", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "! tar xzvf parabricks_sample.tar.gz" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "87cd5849", 134 | "metadata": {}, 135 | "source": [ 136 | "### Download Sample fastq paired-end data" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "3b6637be", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "!wget https://datasettoaexample.blob.core.windows.net/publicsample/HG001.novaseq.pcr-free.30x.R1.fastq.gz" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "39a66009", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "!wget https://datasettoaexample.blob.core.windows.net/publicsample/HG001.novaseq.pcr-free.30x.R2.fastq.gz" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "id": "2d49f917", 162 | "metadata": {}, 163 | "source": [ 164 | "### `fq2bam` pipeline submission to Clara Parabricks " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "5910faeb", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "! 
sudo time -v docker run --gpus all -v /data:/parabricks nvcr.io/nvidia/clara/clara-parabricks:4.0.0-1 pbrun germline \\\n", 175 | "--ref /parabricks/parabricks_sample/Ref/Homo_sapiens_assembly38.fasta \\\n", 176 | "--in-fq /parabricks/HG002-NA24385-pFDA_S2_L002_R1_001-30x.fastq.gz /parabricks/HG002-NA24385-pFDA_S2_L002_R2_001-30x.fastq.gz \\\n", 177 | "--knownSites /parabricks/parabricks_sample/Ref/Homo_sapiens_assembly38.known_indels.vcf.gz --out-bam /parabricks/output.bam \\\n", 178 | "--out-variants /parabricks/output.vcf \\\n", 179 | "--out-recal-file /parabricks/report.txt \\\n", 180 | "--run-partition --no-alt-contigs |& tee germline_30x_4gpu.txt." 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "821e29b1", 186 | "metadata": {}, 187 | "source": [ 188 | "### Notices\n", 189 | "\n", 190 | "Third party software notices from [NVIDIA CLARA PARABRICKS](https://docs.nvidia.com/clara/parabricks/v3.5/text/software_notices.html)\n", 191 | "\n", 192 | "THE NOTEBOOK THIS PROJECT JUST PROVIDES A SAMPLE CODES FOR EDUCATIONAL PURPOSES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS NOTEBOOK AND SAMPLE USE OF NVIDIA Clara™ Parabricks® codes ON AN “AS IS” BASIS. DATA OR ANY MATERIAL ON THIS NOTEBOOK. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS NOTEBOOK. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS NOTEBOOK.\n", 193 | "\n", 194 | "### Support\n", 195 | "\n", 196 | "For questions about this notebook: Please send an e-mail to genomics@microsoft.com\n", 197 | "\n", 198 | "For other questions about NVIDIA Clara Parabricks [Developer forum of NVIDIA Clara Parabricks](https://forums.developer.nvidia.com/c/healthcare/parabricks/290)" 199 | ] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python [conda env:azureml_py38_PT_and_TF]", 205 | "language": "python", 206 | "name": "conda-env-azureml_py38_PT_and_TF-py" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.8.5" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 5 223 | } 224 | -------------------------------------------------------------------------------- /NVIDIA Clara Parabricks on Azure/README.md: -------------------------------------------------------------------------------- 1 | 2 | # NVIDIA Clara Parabricks on Microsoft Azure 3 | 4 | 5 | 6 | [NVIDIA_Clara_Parabricks_on_Azure.ipnyb](https://github.com/microsoft/genomicsnotebook/blob/main/NVIDIA%20Clara%20Parabricks%20on%20Azure/NVIDIA_Clara_Parabricks_on_Azure.ipynb) presents the **sample codes** of NVIDIA Clara Parabricks pipeline with [Data Science Virtual Machine for Linux (Ubuntu)](https://learn.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro) on [Microsoft Azure](https://azure.microsoft.com/en-us/) 7 | 8 | ### NVIDIA Clara™ Parabricks® 9 | NVIDIA introduced the Clara Parabricks software suite for performing analysis of NGS DNA and RNA data. It delivers results at blazing fast speeds and low cost. 
Clara Parabricks can analyze 30x WGS data in under 25 minutes on a single 8-GPU server, instead of 30 hours for traditional CPU-based methods. Its output matches commonly used software, making it simple to verify the accuracy of the results. 10 | 11 | Clara Parabricks software provides at least an order of magnitude acceleration in compute time while generating identical outputs and reducing analysis costs. Clara Parabricks is available free on NVIDIA GPU Cloud (NGC) and can be easily deployed on Azure GPU-based virtual machines (VMs). 12 | 13 | Clara Parabricks provides optimal performance for multiple Microsoft Azure instance types and can be used out of the box for essential bioinformatics needs. Currently, the Clara Parabricks accelerated analysis tools start from FASTQ files and perform alignment through variant calling and expression analysis, including QC tools for both types of outputs. The suite of tools can be used to support end-to-end workflows for germline, somatic and RNA-Seq pipelines, providing the flexibility to meet the individual needs of most projects. The tools can also be used individually, as drop-in replacements for steps in existing workflows. 14 | 15 | You can learn more from this [link](https://www.nvidia.com/en-us/clara/genomics/). 16 | 17 | 18 | ### Microsoft Azure Resources 19 | 20 | If you are new to Azure, see: 21 | - [Microsoft Genomics](https://www.microsoft.com/en-us/genomics/) 22 | - [Azure Virtual Machines](https://azure.microsoft.com/services/virtual-machines/) 23 | 24 | 25 | ### Support 26 | 27 | For questions about the notebook, please send an e-mail to genomics@microsoft.com. 28 | 29 | For other questions about NVIDIA Clara Parabricks, see the [Developer forum of NVIDIA Clara Parabricks](https://forums.developer.nvidia.com/c/healthcare/parabricks/290). 30 | 31 | 32 | ### Contributing 33 | 34 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 35 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 36 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 37 | 38 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 39 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 40 | provided by the bot. You will only need to do this once across all repos using our CLA. 41 | 42 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 43 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 44 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 45 | 46 | 47 | ### Notices 48 | 49 | Third party software notices from [NVIDIA CLARA PARABRICKS](https://docs.nvidia.com/clara/parabricks/v3.5/text/software_notices.html) 50 | 51 | THE NOTEBOOK IN THIS PROJECT PROVIDES SAMPLE CODE FOR EDUCATIONAL PURPOSES ONLY. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP OF THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS NOTEBOOK, THE SAMPLE USE OF NVIDIA Clara™ Parabricks® CODE, AND ANY DATA OR OTHER MATERIAL IN THIS NOTEBOOK ON AN “AS IS” BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS NOTEBOOK. 
TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS NOTEBOOK. 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Dear Community, 2 | 3 | After careful consideration, we have decided to shift our focus to new and innovative initiatives that will better serve our community and align with our long-term goals. 4 | 5 | **Effective Date: May 12th, 2025** 6 | 7 | Impact on Users: 8 | 9 | The project repository will be archived and set to read-only mode, ensuring that it remains accessible for reference. 10 | While no further updates, bug fixes, or support will be provided, we encourage you to explore the wealth of knowledge and resources available in the repository. 11 | 12 | Licensing: The project will remain under its current open-source license, allowing others to fork and continue development if they choose. 13 | 14 | We understand that this change may come as a surprise, but we are incredibly grateful for your support and contributions over the years. Your dedication has been instrumental in the success of this project, and we look forward to your continued involvement in our future endeavors. 15 | 16 | Thank you for your understanding and support. 17 | 18 | # Genomics Data Analysis with Jupyter Notebooks on Azure 19 | ![text](https://github.com/microsoft/genomicsnotebook/blob/main/docs/image328.png) 20 | 21 | Jupyter notebook is a great tool for data scientists who are working on genomics data analysis. In this repo, we demonstrate the use of [Azure Notebooks](https://docs.microsoft.com/en-us/azure/notebooks/) for genomics data analysis via GATK, Picard, Bioconductor and Python libraries. 22 | 23 | Here is the list of sample notebooks on this repo: 24 | 25 | 1. [`genomics.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics.ipynb): Analysis from 'uBAM' to 'structured data table' analysis. 26 | 2. [`genomicsML.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomicsML.ipynb): Train Machine Learning models with Genomics + Clinical Data 27 | 3. [`genomics-platinum-genomes.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-platinum-genomes.ipynb): Accessing Illumina Platinum Genomes data from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* and to make initial data analysis. 28 | 4. [`genomics-reference-genomes.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-reference-genomes.ipynb): Accessing reference genomes from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 29 | 5. [`genomics-clinvar.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-clinvar.ipynb): Accessing ClinVar data from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 30 | 6. [`genomics-giab.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/AzureNotebooks-azure-storage-genomics-giab.ipynb): Accessing Genome in a Bottle data from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 31 | 7. 
[`SnpEff.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/SnpEff.ipynb): Accessing SnpEff databases from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 32 | 8. [`1000 Genomes.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/1000-genomes_Azure_Genomics_Data_Lake.ipynb): Accessing 1000 Genomes dataset from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 33 | 9. [`GATKResourceBundle.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-gatk-resource-bundle.ipynb): Accessing GATK resource bundle from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 34 | 10. [`ENCODE.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-encode.ipynb): Accessing ENCODE dataset from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/)* 35 | 11. [`genomics-OpenCRAVAT.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/genomics-opencravat.ipynb): Accessing OpenCRAVAT dataset from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/) and deploy built-in Azure Data Science VM for OpenCRAVAT* 36 | 12. [`Bioconductor.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/Bioconductor.ipynb): Pulling Bioconductor Docker image from [Microsoft Container Registry](https://hub.docker.com/_/microsoft-bioconductor) 37 | 13. [`simtotable.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/simtotable.ipynb): Simulate NGS data, use Cromwell on Azure OR Microsoft Genomics service for secondary analysis and convert the gVCF data to a structured data table. 38 | 14. [`igv_jupyter_extension_sample.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/igv_jupyter_extension_sample.ipynb): Download sample VCF file from [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-data-lake/) and use igv-jupyter extension on Jupyter Lab environment. 39 | 15. [`radiogenomics.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/radiogenomics.ipynb): Combine DICOM, VCF and gene expression data for patient segmentation analysis. 40 | 16. [`fhir+PacBio.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/fhir_long_read.ipynb): Convert Synthetic FHIR and PacBio VCF Data to parquet and Explore with Azure Synapse Analytics 41 | 17. [`fhir-vcf-clustering.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/fhir-vcf-clustering.ipynb): Convert Synthetic FHIR and PacBio VCF Data to parquet and Explore with Azure Synapse Analytics 42 | 18. [`graphragforgenomics.ipynb`](https://github.com/microsoft/genomicsnotebook/blob/main/sample-notebooks/graphragforgenomics.ipynb): Use GraphRAG for genomics annotation. 43 | 44 | *Technical note: [Explore Azure Genomics Data Lake with Azure Storage Explorer](https://github.com/microsoft/genomicsnotebook/blob/main/docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf) 45 | 46 | # 1. 
Prerequisites 47 | 48 | # Create and manage Azure Machine Learning workspaces in the Azure portal 49 | 50 | ![text](https://github.com/microsoft/genomicsnotebook/blob/main/docs/create-workspace.gif) 51 | 52 | For further details on creation of Azure ML workspace please visit [this page.](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace) 53 | 54 | # Run the notebook in your workspace 55 | 56 | This chapter uses the cloud notebook server in your workspace for an install-free and pre-configured experience. Use your own environment if you prefer to have control over your environment, packages and dependencies. 57 | 58 | Follow along with this video or use the detailed steps below to clone and run the tutorial from your workspace. 59 | 60 | [![Watch the video](https://github.com/microsoft/genomicsnotebook/blob/main/docs/video.JPG)](https://www.microsoft.com/en-us/videoplayer/embed/RE4mTUr) 61 | 62 | 63 | # 2. Contributing 64 | 65 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 66 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 67 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 68 | 69 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 70 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 71 | provided by the bot. You will only need to do this once across all repos using our CLA. 72 | 73 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 74 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 75 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 76 | 77 | # 3. References 78 | 79 | 1. [Jupyter Notebook on Azure](https://docs.microsoft.com/en-us/azure/notebooks/tutorial-create-run-jupyter-notebook) 80 | 2. [Introduction to Azure Notebooks](https://notebooks.azure.com) 81 | 3. [GATK](https://gatk.broadinstitute.org/hc/en-us) 82 | 4. [Picard](http://broadinstitute.github.io/picard/index.html) 83 | 5. [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning/) 84 | 6. [Azure Open Datasets](https://azure.microsoft.com/en-us/services/open-datasets/) 85 | 7. [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) 86 | 8. [Bioconductor](https://www.bioconductor.org/) 87 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /customdsvmbioconductor/README.md: -------------------------------------------------------------------------------- 1 | ## Deploy Custom Data Science VM for Bioconductor in Azure with Linux Extension 2 | 3 | Azure Virtual machine (VM) extensions are small applications that provide post-deployment configuration and automation tasks on Azure VMs. For example, if a virtual machine requires software installation, antivirus protection, or the ability to run a script inside it, you can use a VM extension [1]. Users can run Azure VM extensions by using the Azure CLI, PowerShell, Azure Resource Manager templates (ARM templates), and the Azure portal. 4 | 5 | Users can bundle extensions with a new VM deployment or run them against any existing system [1]. In this document, we will provide an overview of Azure VM extensions with using Bioconductor’s system dependency shell file. 6 | 7 | ###### Figure 1. Overall architectural design of custom Linux extension implementation 8 | 9 | 10 | 11 | Once deployment is completed with the prepared code, users will have a Linux Data Science VM which has the following features: 12 | 13 | ###### Table 1. Data Science VM pre-built configurations 14 | 15 | 16 | 17 | These kinds of custom deployment options will provide flexibility to the researchers who would like to use cloud technologies. 
They do not need to keep VMs running; they can simply deploy new resources when needed. 18 | 19 | # Custom DSVM deployment command: 20 | 21 | Users need to fill in the mandatory fields (subscription ID, resource group, etc.) to submit the DSVM deployment command from their terminal. 22 | 23 | ``` 24 | az group deployment create --subscription <subscription-id> \ 25 | -g <resource-group-name> \ 26 | --template-uri <template.json-uri> \ 27 | --parameters <parameters.json-uri> \ 28 | --parameters location=<location> \ 29 | --parameters virtualNetworkId=/subscriptions/<subscription-id>/resourceGroups/<resource-group-name>/providers/Microsoft.Network/virtualNetworks/<virtual-network-name> \ 30 | --parameters adminPassword=<admin-password> \ 31 | --parameters virtualMachineSize=Standard_D8s_v3 \ 32 | --parameters networkInterfaceName1=<network-interface-name> \ 33 | --parameters networkSecurityGroupName=<network-security-group-name> \ 34 | --parameters virtualMachineName=<virtual-machine-name> \ 35 | --parameters adminUsername=<admin-username> 36 | ``` 37 | 38 | # Resources: 39 | 40 | 1. [Sample 'template.json' location for ARM Template](https://datasettoaexample.blob.core.windows.net/publicsample/template.json) 41 | 2. [Sample 'parameters.json' location for ARM Template](https://datasettoaexample.blob.core.windows.net/publicsample/parameters.json) 42 | 3. [Sample command to deploy Custom DSVM](https://datasettoaexample.blob.core.windows.net/publicsample/deployment_command.txt) 43 | 44 | _Note: Users need to log in to their Azure account (az login) before running this sample command._ 45 | 46 | 4. [Bioconductor's system dependency shell file](https://github.com/Bioconductor/bioconductor_docker/blob/master/bioc_scripts/install_bioc_sysdeps.sh) 47 | 48 | 5. [Demo video for BioC 2022 conference](https://datasettoaexample.blob.core.windows.net/publicsample/record_bioc_22_erdal_cosgun.mp4) 49 | 50 | # References: 51 | 1. [Azure VM extensions and features for Linux - Azure Virtual Machines | Microsoft Docs](https://docs.microsoft.com/en-us/azure/virtual-machines/extensions/features-linux) 52 | 53 | 2. [Bioconductor on Microsoft Azure - Microsoft Tech Community](https://techcommunity.microsoft.com/t5/healthcare-and-life-sciences/bioconductor-on-microsoft-azure/ba-p/3101837) 54 | 55 | 3. [What is the Azure Data Science Virtual Machine - Azure Data Science Virtual Machine | Microsoft Docs](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/overview) 56 | 57 | # Notices 58 | THIS DOCUMENT (README.md) PROVIDES SAMPLE CODE AND INSTRUCTIONS FOR EDUCATIONAL PURPOSES ONLY. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP OF THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THE SHELL FILE FROM THE BIOCONDUCTOR PROJECT, THE ARM TEMPLATE FOR THE CUSTOM DSVM, AND ANY OTHER MATERIAL ON AN “AS IS” BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS DOCUMENT. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS DOCUMENT. 59 | 60 | #### Thanks to the Bioconductor Project's core team for providing the system dependency shell file. 
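# Optional: run the extension on an existing VM

The ARM template referenced above wires Bioconductor's system dependency shell file into the VM through the Azure Custom Script Extension for Linux at deployment time. As a minimal, illustrative sketch (not part of the published template), the same shell file can also be applied to a VM that is already running by using `az vm extension set`; the resource group and VM names below are placeholders, and the file URL is simply the raw form of the shell file linked in the Resources section:

```bash
# Apply the Linux Custom Script Extension to an existing Ubuntu VM and run
# Bioconductor's system dependency script (the extension executes as root on the target VM).
az vm extension set \
  --resource-group <resource-group-name> \
  --vm-name <virtual-machine-name> \
  --name customScript \
  --publisher Microsoft.Azure.Extensions \
  --settings '{
    "fileUris": ["https://raw.githubusercontent.com/Bioconductor/bioconductor_docker/master/bioc_scripts/install_bioc_sysdeps.sh"],
    "commandToExecute": "bash install_bioc_sysdeps.sh"
  }'
```

Extension provisioning status can then be checked with `az vm extension list --resource-group <resource-group-name> --vm-name <virtual-machine-name> --output table`.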
61 | -------------------------------------------------------------------------------- /docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf -------------------------------------------------------------------------------- /docs/azurenotebooks.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/azurenotebooks.JPG -------------------------------------------------------------------------------- /docs/azurenotebooks_2_LI.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/azurenotebooks_2_LI.jpg -------------------------------------------------------------------------------- /docs/azurenotebooks_3_LI.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/azurenotebooks_3_LI.jpg -------------------------------------------------------------------------------- /docs/bioc_arch_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/bioc_arch_1.JPG -------------------------------------------------------------------------------- /docs/bioc_arch_2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/bioc_arch_2.JPG -------------------------------------------------------------------------------- /docs/clinvar_chr1_sample_mini.txt: -------------------------------------------------------------------------------- 1 | chr1:930187,rs375780070,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930187 reference_allele:C alternative_allele:T dbSNP_ID:rs375780070 Variation_ID:1144630 Allele_ID:1131738 canonical_SPDI:NC_000001.11:g.930187C>T molecular_consequence:SO:0001819|synonymous_variant germline_review:Likely_benign germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1144630/" 2 | chr1:930188,rs770001898,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930188 reference_allele:G alternative_allele:A dbSNP_ID:rs770001898 Variation_ID:846933 Allele_ID:824438 canonical_SPDI:NC_000001.11:g.930188G>A molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/846933/" 3 | chr1:930189,rs776005293,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930189 reference_allele:C alternative_allele:T dbSNP_ID:rs776005293 Variation_ID:1478180 Allele_ID:1351687 canonical_SPDI:NC_000001.11:g.930189C>T molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar 
clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1478180/" 4 | chr1:930195,rs143052291,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930195 reference_allele:C alternative_allele:G dbSNP_ID:rs143052291 Variation_ID:1380424 Allele_ID:1421246 canonical_SPDI:NC_000001.11:g.930195C>G molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1380424/" 5 | chr1:930195,rs143052291,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930195 reference_allele:C alternative_allele:T dbSNP_ID:rs143052291 Variation_ID:1354167 Allele_ID:1341285 canonical_SPDI:NC_000001.11:g.930195C>T molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1354167/" 6 | chr1:930199,rs764574771,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930199 reference_allele:C alternative_allele:T dbSNP_ID:rs764574771 Variation_ID:1095790 Allele_ID:1067609 canonical_SPDI:NC_000001.11:g.930199C>T molecular_consequence:SO:0001819|synonymous_variant germline_review:Likely_benign germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1095790/" 7 | chr1:930200,rs368250686,SAMD11,not_specified|not_provided,"GRCh38_chr:chr1 GRCh38_pos:930200 reference_allele:G alternative_allele:A dbSNP_ID:rs368250686 Variation_ID:1043045 Allele_ID:1023510 canonical_SPDI:NC_000001.11:g.930200G>A molecular_consequence:SO:0001583|missense_variant germline_review:Conflicting_classifications_of_pathogenicity germline_status:criteria_provided,_conflicting_classifications Gene:SAMD11 Condition:not_specified|not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1043045/" 8 | chr1:930201,na,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930201 reference_allele:C alternative_allele:T dbSNP_ID:na Variation_ID:2012623 Allele_ID:2074412 canonical_SPDI:NC_000001.11:g.930201C>T molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/2012623/" 9 | chr1:930203,rs767794127,SAMD11,not_specified|not_provided,"GRCh38_chr:chr1 GRCh38_pos:930203 reference_allele:C alternative_allele:T dbSNP_ID:rs767794127 Variation_ID:972363 Allele_ID:959431 canonical_SPDI:NC_000001.11:g.930203C>T molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_multiple_submitters,_no_conflicts Gene:SAMD11 Condition:not_specified|not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/972363/" 10 | chr1:930204,rs148711625,SAMD11,SAMD11-related_disorder|not_provided,"GRCh38_chr:chr1 GRCh38_pos:930204 reference_allele:G alternative_allele:A dbSNP_ID:rs148711625 Variation_ID:1170208 Allele_ID:1153702 canonical_SPDI:NC_000001.11:g.930204G>A molecular_consequence:SO:0001583|missense_variant germline_review:Benign germline_status:criteria_provided,_multiple_submitters,_no_conflicts Gene:SAMD11 Condition:SAMD11-related_disorder|not_provided source:clinvar 
clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1170208/" 11 | chr1:930209,rs2100306465,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930209 reference_allele:C alternative_allele:T dbSNP_ID:rs2100306465 Variation_ID:1653547 Allele_ID:1548120 canonical_SPDI:NC_000001.11:g.930209C>T molecular_consequence:SO:0001819|synonymous_variant germline_review:Likely_benign germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1653547/" 12 | chr1:930210,na,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930210 reference_allele:TGAA alternative_allele:T dbSNP_ID:na Variation_ID:2059344 Allele_ID:2117203 canonical_SPDI:NC_000001.11:g.930212AAG[1] molecular_consequence:SO:0001822|inframe_deletion germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/2059344/" 13 | chr1:930215,rs903331232,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930215 reference_allele:A alternative_allele:G dbSNP_ID:rs903331232 Variation_ID:1409578 Allele_ID:1340647 canonical_SPDI:NC_000001.11:g.930215A>G molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1409578/" 14 | chr1:930218,rs1157485345,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930218 reference_allele:G alternative_allele:A dbSNP_ID:rs1157485345 Variation_ID:1523336 Allele_ID:1350505 canonical_SPDI:NC_000001.11:g.930218G>A molecular_consequence:SO:0001583|missense_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1523336/" 15 | chr1:930220,rs2100306518,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930220 reference_allele:G alternative_allele:A dbSNP_ID:rs2100306518 Variation_ID:1667751 Allele_ID:1657034 canonical_SPDI:NC_000001.11:g.930220G>A molecular_consequence:SO:0001819|synonymous_variant germline_review:Likely_benign germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/1667751/" 16 | chr1:930221,na,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930221 reference_allele:C alternative_allele:T dbSNP_ID:na Variation_ID:2133131 Allele_ID:2191970 canonical_SPDI:NC_000001.11:g.930221C>T molecular_consequence:SO:0001587|nonsense germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/2133131/" 17 | chr1:930222,rs1641106338,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930222 reference_allele:GAACTC alternative_allele:TTCTTCTG dbSNP_ID:rs1641106338 Variation_ID:998906 Allele_ID:987768 canonical_SPDI:NC_000001.11:g.930222_930227delinsTTCTTCTG molecular_consequence:SO:0001589|frameshift_variant germline_review:Uncertain_significance germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/998906/" 18 | chr1:930226,na,SAMD11,not_provided,"GRCh38_chr:chr1 GRCh38_pos:930226 reference_allele:T 
alternative_allele:G dbSNP_ID:na Variation_ID:2081213 Allele_ID:2143012 canonical_SPDI:NC_000001.11:g.930226T>G molecular_consequence:SO:0001819|synonymous_variant germline_review:Likely_benign germline_status:criteria_provided,_single_submitter Gene:SAMD11 Condition:not_provided source:clinvar clinvar_URL:https://www.ncbi.nlm.nih.gov/clinvar/variation/2081213/" -------------------------------------------------------------------------------- /docs/create-workspace.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/create-workspace.gif -------------------------------------------------------------------------------- /docs/fhir_long_read_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/fhir_long_read_1.JPG -------------------------------------------------------------------------------- /docs/genomics_notebook_codespaces.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/genomics_notebook_codespaces.mp4 -------------------------------------------------------------------------------- /docs/image328.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/image328.png -------------------------------------------------------------------------------- /docs/initial.md: -------------------------------------------------------------------------------- 1 | initial.md 2 | -------------------------------------------------------------------------------- /docs/utils.py: -------------------------------------------------------------------------------- 1 | #src/utils.py 2 | 3 | import numpy as np 4 | import os 5 | import glob 6 | import matplotlib.pylab as plt 7 | #loading requried packages 8 | 9 | import sys 10 | import logging 11 | import six 12 | #for image 13 | import pydicom as dicom 14 | import pydicom as dicom 15 | import scipy 16 | from scipy.spatial import ConvexHull 17 | 18 | class dicom_sample: 19 | def __init__(self, path): 20 | self.path = path 21 | self.ROI, self.scans, self.segmentation,self.sample_id, self.N, self.spacing = self.extract_dicom(self.path) 22 | self.firstorderfeature =[] 23 | self.shapefeature = [] 24 | self.getfeature() 25 | 26 | def getfeature(self): 27 | if (self.ROI is not None): 28 | self.firstorderfeature = np.array(self.get_firstorderfeature()) 29 | self.shapefeature = np.array(self.get_shapefeature3D()) 30 | 31 | def load_scan(self,path): 32 | slices = [dicom.read_file(s) for s in glob.glob(path+"/*")] 33 | slices.sort(key = lambda x: int(x.InstanceNumber)) 34 | try: 35 | slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]) 36 | except: 37 | slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation) 38 | 39 | return slices,slice_thickness 40 | 41 | def _moment(self,a, moment=1): 42 | """ 43 | Calculate n-order moment of an array for a given axis 44 | """ 45 | if moment == 1: 46 | return np.float(0.0) 47 | else: 48 | mn = np.nanmean(a, 1, keepdims=True) 49 | s = np.power((a - mn), moment) 50 | return np.nanmean(s, 1) 51 | 52 | def get_firstorderfeature(self): 
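        # First-order (intensity histogram) statistics computed over the tumor ROI.
        # The 19 values appended below are, in order: energy, total energy, entropy,
        # minimum, 10th percentile, 90th percentile, maximum, mean, median,
        # interquartile range, range, mean absolute deviation, robust mean absolute
        # deviation, root mean squared, standard deviation, skewness, kurtosis,
        # variance and uniformity (the formulas broadly follow the usual
        # radiomics first-order definitions).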
53 | ROI = self.ROI 54 | features = [] 55 | Np = len(ROI) 56 | V_voxel = self.spacing[0]*self.spacing[1] * self.spacing[2] 57 | #1 58 | energy = np.sum(np.square(ROI)) 59 | features.append(energy) 60 | #2 61 | features.append(energy * V_voxel) 62 | _, p_i = np.unique(ROI, return_counts=True) 63 | p_i = p_i.reshape((1,-1)) 64 | #3 65 | features.append(-1.0*np.sum(p_i * np.log2(p_i+np.spacing(1)),1)[0]) 66 | #4 minimum 67 | features.append(np.nanmin(ROI,1)[0]) 68 | #5 10th percentile of ROI 69 | prcnt10 = np.nanpercentile(ROI,10,axis=1) 70 | features.append(prcnt10[0]) 71 | #6 90th percentile 72 | prcnt90 = np.nanpercentile(ROI,90, axis=1) 73 | features.append(prcnt90[0]) 74 | #7 max 75 | features.append(np.nanmax(ROI,1)[0]) 76 | #8 mean 77 | features.append(np.nanmean(ROI,1)[0]) 78 | #9 median 79 | features.append(np.nanmedian(ROI,1)[0]) 80 | #10 Interquartile range 81 | features.append(np.nanpercentile(ROI,75,1)[0] - np.nanpercentile(ROI,25,1)[0]) 82 | #11 Range of gray values 83 | features.append(np.nanmax(ROI,1)[0] - np.nanmin(ROI,1)[0]) 84 | #12 mean absolute deviation 85 | u_x =np.nanmean(ROI,1,keepdims=True) 86 | features.append(np.nanmean(np.absolute(ROI - u_x),1)[0]) 87 | #13 robust mean absolute deviation 88 | percentileArray = ROI.copy() 89 | msk = ~np.isnan(percentileArray) 90 | msk[msk] = ((percentileArray - prcnt10[:, None])[msk] < 0) | ((percentileArray - prcnt90[:, None])[msk] > 0) 91 | # Finally, exclude the invalid voxels by setting them to nan. 92 | percentileArray[msk] = np.nan 93 | features.append(np.nanmean(np.absolute(percentileArray - np.nanmean(percentileArray, 1, keepdims=True)), 1)[0]) 94 | #14RMS 95 | Nvox = np.sum(~np.isnan(ROI), 1).astype('float') 96 | features.append(np.sqrt(np.nansum(ROI ** 2 , 1)/Nvox)[0]) 97 | #15 98 | features.append(np.nanstd(ROI, axis=1)[0]) 99 | #16 skewness 100 | m2 = self._moment(ROI,2) 101 | m3 = self._moment(ROI, 3) 102 | features.append(m3[0]/m2[0] ** 1.5) 103 | #17 104 | m4 = self._moment(ROI,4) 105 | m2[m2==0] = 1 106 | m4[m2 ==0] = 0 107 | features.append(m4[0] / m2[0] ** 2.0 ) 108 | #18 109 | features.append(np.nanstd(ROI,1)[0] ** 2) 110 | #19 111 | features.append( np.nansum(p_i ** 2, 1)[0]) 112 | return features 113 | 114 | def get_shapefeature3D(self): 115 | pixel_spacing = self.spacing 116 | tumor_points = np.array(np.where(self.segmentation != 0 )).transpose() 117 | scaled_points = np.dot(tumor_points, np.diag(pixel_spacing)) 118 | tumor = ConvexHull(scaled_points) 119 | tumorSurfaceArea, tumorVolume = tumor.area,tumor.volume 120 | #feret diameter 121 | tumorRadius = np.sqrt(np.sum((tumor.max_bound - tumor.min_bound) ** 2)) 122 | Np = scaled_points.shape[0] 123 | physicalCoordinates = scaled_points - np.mean(scaled_points, axis=0) 124 | physicalCoordinates = physicalCoordinates / np.sqrt(Np) 125 | covariance = np.dot(physicalCoordinates.T.copy(), physicalCoordinates) 126 | eigenValues = np.linalg.eigvals(covariance) 127 | eigenValues.sort() 128 | features=[] 129 | features.append(tumorVolume) 130 | features.append( Np*pixel_spacing[0]*pixel_spacing[1]*pixel_spacing[2]) 131 | features.append(tumorSurfaceArea) 132 | features.append(tumorSurfaceArea/tumorVolume) 133 | features.append((36.0 * np.pi *tumorVolume **2) ** (1.0/3.0) / tumorSurfaceArea) 134 | features.append(tumorVolume / (tumorSurfaceArea ** (3.0 / 2.0) * np.sqrt(np.pi))) 135 | features.append( (36.0 * np.pi) * (tumorVolume ** 2.0) / (tumorSurfaceArea ** 3.0)) 136 | features.append(tumorSurfaceArea / (36.0 * np.pi * tumorVolume ** 2) ** (1.0/3.0)) 137 | 
features.append(tumorRadius * 2) 138 | #missing: 2d diameters on 3 axies 139 | if eigenValues[2] <0 : 140 | features.append(np.nan) 141 | else: 142 | features.append(np.sqrt(eigenValues[2]) *4) 143 | if eigenValues[0] <0 : 144 | features.append(np.nan) 145 | else: 146 | features.append(np.sqrt(eigenValues[0]) *4) 147 | if eigenValues[1] <0 or eigenValues[2] < 0 : 148 | features.append(np.nan) 149 | else: 150 | features.append(np.sqrt(eigenValues[1] / eigenValues[2])) 151 | if eigenValues[0] < 0 or eigenValues[2] < 0: 152 | features.append(np.nan) 153 | else: 154 | features.append(np.sqrt(eigenValues[0] / eigenValues[2])) 155 | return features 156 | 157 | 158 | def get_pixels_hu(self,scans): 159 | image = np.stack([s.pixel_array for s in scans])[::-1] 160 | # Convert to int16 (from sometimes int16), 161 | # should be possible as values should always be low enough (<32k) 162 | image = image.astype(np.int16) 163 | 164 | # Set outside-of-scan pixels to 1 165 | # The intercept is usually -1024, so air is approximately 0 166 | image[image == -2000] = 0 167 | 168 | # Convert to Hounsfield units (HU) 169 | intercept = scans[0].RescaleIntercept 170 | slope = scans[0].RescaleSlope 171 | 172 | if slope != 1: 173 | image = slope * image.astype(np.float64) 174 | image = image.astype(np.int16) 175 | 176 | image += np.int16(intercept) 177 | return np.array(image, dtype=np.int16) 178 | 179 | def extract_dicom(self, path): 180 | N=0 181 | ROI,sample_id,spacing = None,None,None 182 | scans,segmentation = None, None 183 | tumor_scans, tumor_segmentation = None ,None 184 | for film_path in glob.glob(path+"/*"): 185 | if "segmentation" not in film_path and len(glob.glob(film_path+"/*.dcm")) > 1: 186 | N = len(glob.glob(film_path + "/*.dcm")) 187 | patients, slice_thickness = self.load_scan(film_path) 188 | scans = self.get_pixels_hu(patients) 189 | spacing = np.array([float(slice_thickness),float(patients[0].PixelSpacing[0]), float(patients[0].PixelSpacing[1])], dtype='float32') 190 | #scans_resample ,spacing= self.resample(scans,patients, [1,1,1]) 191 | else: 192 | cur_file = glob.glob(film_path+"/*.dcm")[0] 193 | segmentation = dicom.read_file(cur_file).pixel_array 194 | sample_id = dicom.read_file(cur_file).PatientID 195 | if (segmentation is not None and scans is not None and (segmentation.shape[0] == scans.shape[0])): 196 | ROI = scans[segmentation==1].astype('float').reshape((1,-1)) 197 | tumor_slices = np.unique(np.where(segmentation==1)[0]) 198 | tumor_scans = scans[tumor_slices,:,:] 199 | tumor_segmentation = segmentation[tumor_slices, :,:] 200 | return ROI, tumor_scans, tumor_segmentation ,sample_id, N, spacing 201 | 202 | def resample(self,image, scan, new_spacing=[1,1,1]): 203 | # Determine current pixel spacing 204 | spacing = np.array([float(scan[0].SliceThickness),float(scan[0].PixelSpacing[0]), float(scan[0].PixelSpacing[1])], dtype='float32') 205 | 206 | resize_factor = spacing / new_spacing 207 | new_real_shape = image.shape * resize_factor 208 | new_shape = np.round(new_real_shape) 209 | real_resize_factor = new_shape / image.shape 210 | new_spacing = spacing / real_resize_factor 211 | 212 | image = scipy.ndimage.interpolation.zoom(image, real_resize_factor) 213 | 214 | return image, new_spacing 215 | def display(self): 216 | 217 | if self.segmentation is not None: 218 | plt.imshow(self.scans[0],cmap='gray') 219 | plt.imshow(self.segmentation[0], cmap = 'Reds', alpha=0.5) 220 | #tumor_slices = self.segmentation.shape[0] 221 | #rows=6 222 | # cols = int(tumor_slices/rows)+1 223 | #fig,ax = 
plt.subplots(rows, cols,figsize=[12,12]) 224 | #k=0 225 | #for i in range(tumor_slices): 226 | # ind = i 227 | # row_i = int(k/cols) 228 | # row_j = int(k % cols) 229 | # ax[row_i,row_j].set_title('slice %d' % ind) 230 | # ax[row_i,row_j].imshow(self.scans[ind],cmap='gray') 231 | # ax[row_i,row_j].imshow(self.segmentation[ind], cmap = 'Reds', alpha=0.5) 232 | # ax[row_i, row_j].axis('off') 233 | # k = k+1 234 | # while row_j < cols: 235 | # ax[rows-1, row_j].set_visible(False) 236 | # row_j += 1 237 | 238 | plt.show() 239 | else: 240 | print ("Sample has no tumor") 241 | -------------------------------------------------------------------------------- /docs/video.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/genomicsnotebook/e91b69768c74c7f9a277194e95b7c30631c142f5/docs/video.JPG -------------------------------------------------------------------------------- /fhirgenomics/3-simple-breast-cancer-module.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple_breast_cancer", 3 | "remarks": [ 4 | "Simplified module describing breast cancer.", 5 | "- all females have breast cancer", 6 | "- they either take Epirubicin or Doxorubicin" 7 | ], 8 | "states": { 9 | "Initial": { 10 | "type": "Initial", 11 | "conditional_transition": [ 12 | { 13 | "condition": { 14 | "condition_type": "Gender", 15 | "gender": "F" 16 | }, 17 | "transition": "Breast Cancer" 18 | }, 19 | { 20 | "transition": "Terminal" 21 | } 22 | ] 23 | }, 24 | "Terminal": { 25 | "type": "Terminal" 26 | }, 27 | "Breast Cancer": { 28 | "type": "ConditionOnset", 29 | "target_encounter": "", 30 | "codes": [ 31 | { 32 | "system": "SNOMED-CT", 33 | "code": 254837009, 34 | "display": "Malignant neoplasm of breast (disorder)" 35 | } 36 | ], 37 | "assign_to_attribute": "simple_breast_cancer", 38 | "direct_transition": "Breast Cancer Diagnosis" 39 | }, 40 | "Death": { 41 | "type": "Death", 42 | "direct_transition": "Terminal", 43 | "range": { 44 | "low": 1, 45 | "high": 100, 46 | "unit": "days" 47 | } 48 | }, 49 | "Breast Cancer Diagnosis": { 50 | "type": "Encounter", 51 | "encounter_class": "emergency", 52 | "reason": "", 53 | "telemedicine_possibility": "none", 54 | "codes": [ 55 | { 56 | "system": "SNOMED-CT", 57 | "code": "254837009-1", 58 | "display": "Malignant neoplasm of breast (disorder)" 59 | } 60 | ], 61 | "distributed_transition": [ 62 | { 63 | "transition": "Epirubicin Prescribed", 64 | "distribution": 0.5 65 | }, 66 | { 67 | "transition": "Doxorubicin Prescribed", 68 | "distribution": 0.5 69 | } 70 | ] 71 | }, 72 | "Patient Will Die": { 73 | "type": "EncounterEnd", 74 | "direct_transition": "Death" 75 | }, 76 | "Patient Will Live": { 77 | "type": "EncounterEnd", 78 | "direct_transition": "Terminal" 79 | }, 80 | "Epirubicin Prescribed": { 81 | "type": "MedicationOrder", 82 | "codes": [ 83 | { 84 | "system": "RxNorm", 85 | "code": 1732186, 86 | "display": "100 ML Epirubicin Hydrochloride 2 MG/ML Injection" 87 | } 88 | ], 89 | "reason": "Breast Cancer", 90 | "assign_to_attribute": "epirubicin", 91 | "distributed_transition": [ 92 | { 93 | "transition": "Epirubicin with Variant", 94 | "distribution": 0.5 95 | }, 96 | { 97 | "transition": "Epirubicin without Variant", 98 | "distribution": 0.5 99 | } 100 | ] 101 | }, 102 | "Doxorubicin Prescribed": { 103 | "type": "MedicationOrder", 104 | "codes": [ 105 | { 106 | "system": "RxNorm", 107 | "code": 1790099, 108 | "display": "10 ML Doxorubicin 
Hydrochloride 2 MG/ML Injection" 109 | } 110 | ], 111 | "distributed_transition": [ 112 | { 113 | "transition": "Doxorubicin without Variant", 114 | "distribution": 0.5 115 | }, 116 | { 117 | "transition": "Doxorubicin with Variant", 118 | "distribution": 0.5 119 | } 120 | ], 121 | "reason": "Breast Cancer", 122 | "assign_to_attribute": "doxorubicin" 123 | }, 124 | "Epirubicin without Variant": { 125 | "type": "MedicationOrder", 126 | "codes": [ 127 | { 128 | "system": "RxNorm", 129 | "code": "REF", 130 | "display": "variant not present" 131 | } 132 | ], 133 | "distributed_transition": [ 134 | { 135 | "transition": "Patient Will Live", 136 | "distribution": 0.5 137 | }, 138 | { 139 | "transition": "Patient Will Die", 140 | "distribution": 0.5 141 | } 142 | ] 143 | }, 144 | "Epirubicin with Variant": { 145 | "type": "MedicationOrder", 146 | "codes": [ 147 | { 148 | "system": "RxNorm", 149 | "code": "ALT", 150 | "display": "variant present" 151 | } 152 | ], 153 | "distributed_transition": [ 154 | { 155 | "transition": "Patient Will Live", 156 | "distribution": 0.9 157 | }, 158 | { 159 | "transition": "Patient Will Die", 160 | "distribution": 0.1 161 | } 162 | ] 163 | }, 164 | "Doxorubicin with Variant": { 165 | "type": "MedicationOrder", 166 | "codes": [ 167 | { 168 | "system": "RxNorm", 169 | "code": "ALT", 170 | "display": "variant present" 171 | } 172 | ], 173 | "distributed_transition": [ 174 | { 175 | "transition": "Patient Will Live", 176 | "distribution": 0.5 177 | }, 178 | { 179 | "transition": "Patient Will Die", 180 | "distribution": 0.5 181 | } 182 | ] 183 | }, 184 | "Doxorubicin without Variant": { 185 | "type": "MedicationOrder", 186 | "codes": [ 187 | { 188 | "system": "RxNorm", 189 | "code": "REF", 190 | "display": "variant not present" 191 | } 192 | ], 193 | "distributed_transition": [ 194 | { 195 | "transition": "Patient Will Die", 196 | "distribution": 0.1 197 | }, 198 | { 199 | "transition": "Patient Will Live", 200 | "distribution": 0.9 201 | } 202 | ] 203 | } 204 | }, 205 | "gmf_version": 2 206 | } -------------------------------------------------------------------------------- /fhirgenomics/intro.md: -------------------------------------------------------------------------------- 1 | Sample resources for: 'A cloud-based pipeline for analysis of FHIR and long-read data' 2 | -------------------------------------------------------------------------------- /genomics-data-science-vm/README.md: -------------------------------------------------------------------------------- 1 | # Custom Genomics Data Science Virtual Machine on Microsoft Azure 2 | 3 | 4 | 5 | 6 | Virtual Machine (VM) templates in this page deploys a **Data Science Virtual Machine- Ubuntu 20.04**. 
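The breast-cancer module defined in `fhirgenomics/3-simple-breast-cancer-module.json` above branches on `distributed_transition` weights, and hand-edited weights can easily stop summing to 1.0. A minimal sanity check, sketched here on the assumption that the module file has been saved locally under its repository name, is:

```python
import json
from pathlib import Path

# Assumed local copy of the module shown above; adjust the path if it lives elsewhere.
module_path = Path("3-simple-breast-cancer-module.json")
module = json.loads(module_path.read_text())

for name, state in module["states"].items():
    transitions = state.get("distributed_transition")
    if not transitions:
        continue
    total = sum(t["distribution"] for t in transitions)
    flag = "ok" if abs(total - 1.0) < 1e-9 else "does NOT sum to 1"
    print(f"{name}: distributed_transition total = {total} ({flag})")
```

Each branching state should report a total of 1.0; anything else points to an editing mistake before the module is handed to a synthetic-patient generator such as Synthea.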
7 | 8 | If you are new to Azure virtual machines, see: 9 | - [Azure Virtual Machines](https://azure.microsoft.com/services/virtual-machines/) 10 | - [Azure Linux Virtual Machines documentation](https://docs.microsoft.com/azure/virtual-machines/linux/) 11 | 12 | Please review the following articles before deploying the VMs on Microsoft Azure: 13 | 14 | - [Security recommendations for virtual machines in Azure - Azure Virtual Machines | Microsoft Learn](https://learn.microsoft.com/en-us/azure/virtual-machines/security-recommendations) 15 | 16 | - [Security features used with Azure VMs - Azure security | Microsoft Learn](https://learn.microsoft.com/en-us/azure/security/fundamentals/virtual-machines-overview?source=recommendations) 17 | 18 | ## Custom Genomics Data Science VM- Ubuntu 20.04 19 | 20 | **[Guideline: Deploy custom Genomics Data Science VM](https://datasettoaexample.blob.core.windows.net/publicsample/deploycommunitygenomics.mp4)** 21 | 22 | 23 | 24 | The 'Data Science Virtual Machine (DSVM)' is a 'Ubuntu 20.04' VM that has several popular tools for data exploration, analysis, modeling & development pre installed. 25 | 26 | ## Highlights: 27 | 28 | ### Custom Genomics features 29 | 30 | * [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) 31 | * [NextFlow on Azure](https://www.nextflow.io/blog/2021/introducing-nextflow-for-azure-batch.html) 32 | * [Microsoft Genomics service](https://azure.microsoft.com/en-us/products/genomics/) 33 | * [Microsoft Genomics Jupyter Notebooks](https://github.com/microsoft/genomicsnotebook) 34 | * [Bioconductor common workflows from Official Bioconductor Site](https://www.bioconductor.org/packages/release/BiocViews.html#___Workflow) 35 | 36 | ### Operating System, Drivers and other base components 37 | 38 | * Nvidia drivers, CUDA Toolkit, cuDNN (when GPU machines are used) 39 | * Docker 40 | * Anaconda ("conda") 41 | * Git 42 | 43 | ### Authoring Tools 44 | 45 | * Visual Studio Code 46 | * PyCharm Community Edition 47 | * Jupyter, Jupyter Lab 48 | 49 | 50 | ### ML Framework 51 | 52 | * PyTorch, TensorFlow, scikit-learn 53 | * pyspark 54 | * dask 55 | * Vowpal Wabbit 56 | 57 | ### Other Notable Components 58 | 59 | * Azure CLI 60 | * Azure ML SDK for Python 61 | * Azure Storage Explorer 62 | 63 | Users can access the DSVM via Remote Desktop, SSH or browser (eg: Jupyter Hub). Optionally, the VM can be placed in a corporate network. 64 | 65 | Users have full access to the DSVM. If needed, configurations can be adjusted, and additional frameworks can be installed like with any other virtual machine. The image provided here is a static VM image. Maintenance and protection against vulnerabilities of provisioned DSVMs is in the customer's responsibility. 66 | 67 | 68 | ## Contributing 69 | 70 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 71 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 72 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 73 | 74 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 75 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 76 | provided by the bot. You will only need to do this once across all repos using our CLA. 77 | 78 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
79 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 80 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 81 | 82 | ## Licensing 83 | GATK script is released under the WDL source code license (BSD-3) (see LICENSE in https://github.com/broadinstitute/wdl). Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script. [Bioconductor common workflows](https://www.bioconductor.org/packages/release/BiocViews.html#___Workflow) are the public content that users can download seperately. Users are responsible for checking of these public scripts' licensing rules. 84 | 85 | ## Support 86 | 87 | For questions: Please send an e-mail to genomics@microsoft.com 88 | 89 | ## NOTICES 90 | THIS VM IMAGE JUST PROVIDE A SAMPLE SCHEMA FOR EDUCATIONAL PURPOSES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS VM IMAGE AND SAMPLE USE OF LIBRARIES ON AN “AS IS” BASIS.MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS VM IMAGE. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS VM IMAGE. 91 | 92 | 93 | -------------------------------------------------------------------------------- /sample-notebooks/1000-genomes_Azure_Genomics_Data_Lake.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1000 Genomes Project's data on Azure Genomics Data Lake" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebook is a great tool for data scientists who is working on Genomics data analysis. We will demonstrate Azure Jupyter notebook usage via GATK and Picard with Azure Open Dataset. \n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Download the specific data from Azure Genomics Data Lake\n", 19 | "2. BuildBamIndex (Picard)\n", 20 | "\n", 21 | "**Dependencies:**\n", 22 | "\n", 23 | "This notebook requires the following libraries:\n", 24 | "\n", 25 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 26 | "\n", 27 | "\n", 28 | "- Genome Analysis Toolkit (GATK) (*Users need to download GATK from Broad Institute's webpage into the same compute environment with this notebook: https://github.com/broadinstitute/gatk/releases*)\n", 29 | "\n", 30 | "- Technical note: [Explore Azure Genomics Data Lake with Azure Storage Explorer](https://github.com/microsoft/genomicsnotebook/blob/main/docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf)\n", 31 | "\n", 32 | "**Important information: This notebook is using Python 3.6 kernel**\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "# 1. Getting the 1000 Genomes Project's data from Azure Open Dataset\n", 40 | "\n", 41 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). 
We create a blob service linked to this open datasets. You can find example of data calling procedure from Azure Open Dataset for `1000 Genomes Project` datasets in below:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "**1.a.Install Azure Blob Storage SDK**" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "pip install azure-storage-blob==2.1.0" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "**1.b.Download the targeted file**" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import os\n", 74 | "import uuid\n", 75 | "import sys\n", 76 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 77 | "\n", 78 | "blob_service_client = BlockBlobService(account_name='dataset1000genomes', sas_token='sv=2019-10-10&si=prod&sr=c&sig=9nzcxaQn0NprMPlSh4RhFQHcXedLQIcFgbERiooHEqM%3D') \n", 79 | "blob_service_client.get_blob_to_path('dataset/phase3/data/HG00096/alignment', 'HG00096.chrom11.ILLUMINA.bwa.GBR.low_coverage.20120522.bam', './HG00096.chrom11.ILLUMINA.bwa.GBR.low_coverage.20120522.bam')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# 2. BuildBamIndex (Picard)\n", 87 | "Generates a BAM index \".bai\" file. This tool creates an index file for the input BAM that allows fast look-up of data in a BAM file, lke an index on a database. Note that this tool cannot be run on SAM files, and that the input BAM file must be sorted in coordinate order[1].\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "!./gatk BuildBamIndex I=HG00096.chrom11.ILLUMINA.bwa.GBR.low_coverage.20120522.bam O=HG00096.chrom11.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.bai" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# References\n", 104 | "\n", 105 | "1. BuildBamIndex: http://broadinstitute.github.io/picard/command-line-overview.html#BuildBamIndex\n", 106 | "2. 1000 Genomes Project: https://www.internationalgenome.org/\n", 107 | "\n" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "jupytext": { 113 | "formats": "ipynb,md" 114 | }, 115 | "kernelspec": { 116 | "display_name": "Python 3.6 - AzureML", 117 | "language": "python", 118 | "name": "python3-azureml" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.6.9" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /sample-notebooks/AzureNotebooks-azure-storage-genomics-giab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Genomics Data Analysis with Azure Jupyter Notebooks- Genome in a Bottle (GIAB)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebook is a great tool for data scientists who is working on Genomics data analysis. 
We will demonstrate Azure Jupyter notebook usage via GATK and Picard with Azure Open Dataset. \n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Create index file for VCF file\n", 19 | "2. Convert the VCF file to a table \n", 20 | "\n", 21 | "**Dependencies:**\n", 22 | "\n", 23 | "This notebook requires the following libraries:\n", 24 | "\n", 25 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 26 | "\n", 27 | "\n", 28 | "- Genome Analysis Toolkit (GATK) (*Users need to download GATK from Broad Institute's webpage into the same compute environment with this notebook: https://github.com/broadinstitute/gatk/releases*)\n", 29 | "\n", 30 | "**Important information: This notebook is using Python 3.6 kernel**\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "# 1. Getting the GIAB Genomics data from Azure Open Dataset\n", 38 | "\n", 39 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. You can find example of data calling procedure from Azure Open Dataset for `Genome In a Bottle- GIAB` datasets in below:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "**1.a.Install Azure Blob Storage SDK**" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "pip install azure-storage-blob==2.1.0" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "**1.b.Download the targeted file**" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import os\n", 72 | "import uuid\n", 73 | "import sys\n", 74 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 75 | "\n", 76 | "blob_service_client = BlockBlobService(account_name='datasetgiab', sas_token='sv=2019-02-02&se=2050-01-01T08%3A00%3A00Z&si=prod&sr=c&sig=7qp%2BxGLGc%2BO2MIVzzDZY7GSqEwthyGnhXJ566KoH7As%3D') \n", 77 | "blob_service_client.get_blob_to_path('dataset/data/NA12878/analysis/GIAB_integration', 'NIST_RTG_PlatGen_merged_highconfidence_v0.2_Allannotate.vcf.gz', './NIST_RTG_PlatGen_merged_highconfidence_v0.2_Allannotate.vcf.gz')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# 2. Creates an index for a feature file, e.g. VCF or BED file\n", 85 | "\n", 86 | "This tool creates an index file for the various kinds of feature-containing files supported by GATK (such as VCF and BED files). An index allows querying features by a genomic interval.\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "!./gatk IndexFeatureFile -I NIST_RTG_PlatGen_merged_highconfidence_v0.2_Allannotate.vcf.gz " 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "# 3. Extract fields from a VCF file to a tab-delimited table " 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "This tool creates an index file for the various kinds of feature-containing files supported by GATK (such as VCF and BED files). 
An index allows querying features by a genomic interval.\n", 110 | "\n", 111 | "\n", 112 | "**INFO/site-level fields**\n", 113 | "\n", 114 | "Use the `-F` argument to extract INFO fields; each field will occupy a single column in the output file. The field can be any standard VCF column (e.g. CHROM, ID, QUAL) or any annotation name in the INFO field (e.g. AC, AF). The tool also supports the following additional fields:\n", 115 | "\n", 116 | "EVENTLENGTH (length of the event)\n", 117 | "TRANSITION (1 for a bi-allelic transition (SNP), 0 for bi-allelic transversion (SNP), -1 for INDELs and multi-allelics)\n", 118 | "HET (count of het genotypes)\n", 119 | "HOM-REF (count of homozygous reference genotypes)\n", 120 | "HOM-VAR (count of homozygous variant genotypes)\n", 121 | "NO-CALL (count of no-call genotypes)\n", 122 | "TYPE (type of variant, possible values are NO_VARIATION, SNP, MNP, INDEL, SYMBOLIC, and MIXED\n", 123 | "VAR (count of non-reference genotypes)\n", 124 | "NSAMPLES (number of samples)\n", 125 | "NCALLED (number of called samples)\n", 126 | "MULTI-ALLELIC (is this variant multi-allelic? true/false)\n", 127 | "\n", 128 | "\n", 129 | "**FORMAT/sample-level fields**\n", 130 | "\n", 131 | "Use the `-GF` argument to extract FORMAT/sample-level fields. The tool will create a new column per sample with the name \"SAMPLE_NAME.FORMAT_FIELD_NAME\" e.g. NA12877.GQ, NA12878.GQ.\n", 132 | "\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "**Input**\n", 140 | "\n", 141 | "A VCF file to convert to a table\n", 142 | "\n", 143 | "**Output**\n", 144 | "\n", 145 | "A tab-delimited file containing the values of the requested fields in the VCF file.\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "!./gatk VariantsToTable -V NIST_RTG_PlatGen_merged_highconfidence_v0.2_Allannotate.vcf.gz -F CHROM -F POS -F TYPE -O outputtable.table" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "# References\n", 162 | "\n", 163 | "1. IndexFeatureFile: https://gatk.broadinstitute.org/hc/en-us/articles/360037069832-IndexFeatureFile\n", 164 | "2. Variants to table: https://gatk.broadinstitute.org/hc/en-us/articles/360036882811-VariantsToTable \n", 165 | "3. 
Genome in a Bottle: https://www.nist.gov/programs-projects/genome-bottle \n", 166 | "\n" 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "jupytext": { 172 | "formats": "ipynb,md" 173 | }, 174 | "kernelspec": { 175 | "display_name": "Python 3.6 - AzureML", 176 | "language": "python", 177 | "name": "python3-azureml" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.6.9" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 4 194 | } 195 | -------------------------------------------------------------------------------- /sample-notebooks/Bioconductor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Bioconductor images on Azure \n", 8 | "\n", 9 | "## Featured Tags\n", 10 | " \n", 11 | " - latest `docker pull mcr.microsoft.com/bioconductor/bioconductor_docker:latest`\n", 12 | "\n", 13 | "## About bioconductor\n", 14 | "Bioconductor provides tools for the analysis and comprehension of high-throughput genomic data. Bioconductor uses the R statistical programming language, and is open source and open development. It has two releases each year, and an active user community. We're now offering a mirror of the official `Bioconductor` docker image on Microsoft Container Registry. This image can also be used as a base for your own custom genomics related docker images.\n", 15 | "\n", 16 | "**Useful links for Bioconductor**\n", 17 | "\n", 18 | "1. [Bioconductor webpage](https://www.bioconductor.org)\n", 19 | "\n", 20 | "2. [Official Bioconductor docker image](https://hub.docker.com/u/bioconductor)\n", 21 | "\n", 22 | "3. [For more information about Bioconductor and docker images](https://bioconductor.org/help/docker)\n", 23 | "\n", 24 | "## Related Repos\n", 25 | " - [Official Bioconductor docker image](https://hub.docker.com/r/bioconductor/bioconductor_docker)\n", 26 | " \n", 27 | " - [Bioconductor docker image on Microsoft Container Registry](https://hub.docker.com/_/microsoft-bioconductor)\n", 28 | "\n", 29 | "## How to Use this Image\n", 30 | "\n", 31 | "**Official Bioconductor docker image mirrored on Microsoft Container Registry** \n", 32 | "\n", 33 | "*Pull the 'bioconductor_docker' image from Microsoft Container Registry*\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "!docker pull mcr.microsoft.com/bioconductor/bioconductor_docker:latest" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "\n", 51 | "**Sample: Run RStudio interactively from your docker container**\n", 52 | "\n", 53 | "To run RStudio in a web browser session, run the following and access it from `127.0.0.1:8787`. 
The default user name is \"rstudio\" and you can specify your password as the example below (here, it is set to 'bioc'):\n", 54 | "```\n", 55 | "docker run --name bioconductor_docker_rstudio \\\n", 56 | " -v ~/host-site-library:/usr/local/lib/R/host-site-library \\\n", 57 | " -e PASSWORD='bioc' \\\n", 58 | " -p 8787:8787 \\\n", 59 | " mcr.microsoft.com/bioconductor/bioconductor_docker:latest\n", 60 | "```\n", 61 | "\n", 62 | "To run RStudio on your terminal:\n", 63 | "```\n", 64 | "docker run --name bioconductor_docker_rstudio \\\n", 65 | " -it \\\n", 66 | " -v ~/host-site-library:/usr/local/lib/R/host-site-library \\\n", 67 | " -e PASSWORD='bioc' \\\n", 68 | " -p 8787:8787 \\\n", 69 | " mcr.microsoft.com/bioconductor/bioconductor_docker:latest R \n", 70 | "```\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Support and feedback\n", 78 | "If you have any problems with or questions about this image OR to provide feedback, please contact us through an [e-mail](mailto:genomics@microsoft.com).\n", 79 | "\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## License\n", 87 | "Microsoft is mirroring the official Bioconductor docker image for Azure users. You can find the licensing details from [Official Bioconductor Package Guidelines](https://www.bioconductor.org/developers/package-guidelines/)" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "jupytext": { 93 | "cell_metadata_filter": "-all", 94 | "notebook_metadata_filter": "-all", 95 | "text_representation": { 96 | "extension": ".md", 97 | "format_name": "markdown" 98 | } 99 | }, 100 | "kernelspec": { 101 | "display_name": "Python 3.6 - AzureML", 102 | "language": "python", 103 | "name": "python3-azureml" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.6.9" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 4 120 | } 121 | -------------------------------------------------------------------------------- /sample-notebooks/SnpEff.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Genomics Data Analysis with Azure Jupyter Notebooks- SnpEff" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebooks are a great tool for data scientists who is working on Genomics data analysis. We will demonstrate how to donwload specific SnpEff dataset/database from Azure Open Datasets. \n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Download specific database from SnpEff datasets\n", 19 | "\n", 20 | "\n", 21 | "**Dependencies:**\n", 22 | "\n", 23 | "This notebook requires the following libraries:\n", 24 | "\n", 25 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 26 | "\n", 27 | "**Important information: This notebook is using Python 3.6 kernel**\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# 1. 
Getting the SnpEff data from Azure Open Datasets\n", 35 | "\n", 36 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. You can find examples of data calling procedure from Azure Open Dataset for `SnpEff` datasets in below:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "**1.a.Install Azure Blob Storage SDK**" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "pip install azure-storage-blob==2.1.0" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "**1.b.Download the targeted file**" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import os\n", 69 | "import uuid\n", 70 | "import sys\n", 71 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 72 | "\n", 73 | "blob_service_client = BlockBlobService(account_name='datasetsnpeff', sas_token='sv=2019-10-10&st=2020-09-01T00%3A00%3A00Z&se=2050-09-01T00%3A00%3A00Z&si=prod&sr=c&sig=isafOa9tGnYBAvsXFUMDGMTbsG2z%2FShaihzp7JE5dHw%3D') \n", 74 | "blob_service_client.get_blob_to_path('dataset/v5_0/hg19', 'NR_039688.1.txt', './NR_039688.1.txt')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# References\n", 82 | "\n", 83 | "1. SnpEff: https://pcingola.github.io/SnpEff/\n" 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "jupytext": { 89 | "formats": "ipynb,md" 90 | }, 91 | "kernelspec": { 92 | "display_name": "Python 3.6 - AzureML", 93 | "language": "python", 94 | "name": "python3-azureml" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.6.9" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 4 111 | } 112 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-clinvar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting the ClinVar data from Azure Open Dataset\n", 8 | "\n", 9 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. You can find examples of data calling procedure from Azure Open Dataset for `ClinVar` dataset in below:\n", 10 | "\n", 11 | "Users can call and download the following path with this notebook: https://datasetclinvar.blob.core.windows.net/dataset/ClinVarFullRelease_00-latest.xml.gz.md5\n", 12 | "\n", 13 | "**Important note:** Users needs to log-in their Azure Account via Azure CLI for viewing the data with Azure ML SDK. 
On the other hand, they do not need do any actions for downloading the data.\n", 14 | "\n", 15 | "Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Calling the data from 'ClinVar Data Set'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import azureml.core\n", 32 | "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from azureml.core import Dataset\n", 42 | "reference_dataset = Dataset.File.from_files('https://datasetclinvar.blob.core.windows.net/dataset')\n", 43 | "mount = reference_dataset.mount()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import os\n", 53 | "\n", 54 | "REF_DIR = '/dataset'\n", 55 | "path = mount.mount_point + REF_DIR\n", 56 | "\n", 57 | "with mount:\n", 58 | " print(os.listdir(path))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import pandas as pd\n", 68 | "\n", 69 | "# create mount context\n", 70 | "mount.start()\n", 71 | "\n", 72 | "# specify path to README file\n", 73 | "REF_DIR = '/dataset'\n", 74 | "metadata_filename = '{}/{}/{}'.format(mount.mount_point, REF_DIR, '_README')\n", 75 | "\n", 76 | "# read README file\n", 77 | "metadata = pd.read_table(metadata_filename)\n", 78 | "metadata" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Download the specific file" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import os\n", 95 | "import uuid\n", 96 | "import sys\n", 97 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 98 | "\n", 99 | "blob_service_client = BlockBlobService(account_name='datasetclinvar', sas_token='sv=2019-02-02&se=2050-01-01T08%3A00%3A00Z&si=prod&sr=c&sig=qFPPwPba1RmBvaffkzkLuzabYU5dZstSTgMwxuLNME8%3D') \n", 100 | "blob_service_client.get_blob_to_path('dataset', 'ClinVarFullRelease_00-latest.xml.gz.md5', './ClinVarFullRelease_00-latest.xml.gz.md5')" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "**END OF NOTEBOOK**" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "jupytext": { 113 | "formats": "ipynb,md" 114 | }, 115 | "kernelspec": { 116 | "display_name": "Python 3.6 - AzureML", 117 | "language": "python", 118 | "name": "python3-azureml" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.6.9" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-encode.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ENCODE on Azure Genomics Data Lake" 8 | ] 9 | }, 
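The ClinVar cell above downloads only the `.md5` checksum file. A common follow-up is to verify a separately downloaded copy of the full release against it; a minimal sketch, assuming `ClinVarFullRelease_00-latest.xml.gz` has also been fetched into the working directory by the same blob-download approach, is:

```python
import hashlib
import re

archive = "ClinVarFullRelease_00-latest.xml.gz"            # assumed to be downloaded separately
checksum_file = "ClinVarFullRelease_00-latest.xml.gz.md5"  # downloaded by the cell above

# Pull the 32-character hex digest out of the checksum file, whatever its exact layout.
expected = re.search(r"[0-9a-fA-F]{32}", open(checksum_file).read()).group(0).lower()

md5 = hashlib.md5()
with open(archive, "rb") as fh:
    for chunk in iter(lambda: fh.read(8 * 1024 * 1024), b""):
        md5.update(chunk)

print("checksum OK" if md5.hexdigest() == expected else "checksum MISMATCH")
```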
10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebook is a great tool for data scientists who is working on Genomics data analysis. We will demonstrate usage of Encyclopedia of DNA Elements (ENCODE) data from Azure Open Datasets.\n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Getting the ENCODE data from Azure Open Dataset\n", 19 | "2. Import the 'encode_file_manifest.tsv' to a table\n", 20 | "3. Checking the count of specific files\n", 21 | "\n", 22 | "**Dependencies:**\n", 23 | "\n", 24 | "This notebook requires the following libraries:\n", 25 | "\n", 26 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 27 | "\n", 28 | "\n", 29 | "- Technical note: [Explore Azure Genomics Data Lake with Azure Storage Explorer](https://github.com/microsoft/genomicsnotebook/blob/main/docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf)\n", 30 | "\n", 31 | "**Important information: This notebook is using Python 3.6 kernel**\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "# 1. Getting the ENCODE data from Azure Open Dataset\n", 39 | "\n", 40 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. You can find example of data calling procedure from Azure Open Dataset for `ENCODE` datasets in below:" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "**1.a.Install Azure Blob Storage SDK**" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "pip install azure-storage-blob==2.1.0" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "**1.b.Download the targeted file**" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import os\n", 73 | "import uuid\n", 74 | "import sys\n", 75 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 76 | "\n", 77 | "blob_service_client = BlockBlobService(account_name='datasetencode', sas_token='?sv=2019-10-10&si=prod&sr=c&sig=9qSQZo4ggrCNpybBExU8SypuUZV33igI11xw0P7rB3c%3D') \n", 78 | "blob_service_client.get_blob_to_path('dataset', 'encode_file_manifest.tsv', './encode_file_manifest.tsv')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# 2. Import the 'encode_file_manifest.tsv' to a table" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 88, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/html": [ 96 | "
\n", 97 | "\n", 110 | "\n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
statusfile_formatfile_typeassemblyaward.rfaoutput_typeoutput_categoryfile_size
0releasedbigWigbigWigGRCh38ENCODEsignal p-valuesignal6.206849e+08
1releasedbigWigbigWigGRCh38ENCODEplus strand signal of all readssignal6.236199e+08
2releasedbigWigbigWigGRCh38ENCODEsignal p-valuesignal6.222111e+08
3releasedbigWigbigWigGRCh38ENCODEsignal p-valuesignal6.442427e+08
4releasedbigWigbigWigGRCh38ENCODEsignal p-valuesignal6.222841e+08
...........................
641541releasedbigWigbigWighg19ENCODE2summed densities signalsignal1.309956e+07
641542releasedbigWigbigWighg19ENCODE2wavelet-smoothed signalsignal1.015879e+07
641543releasedbigWigbigWighg19ENCODE2wavelet-smoothed signalsignal1.021096e+07
641544releasedbigWigbigWighg19ENCODE2signalsignal1.781798e+10
641545NaNNaNNaNNaNNaNNaNNaNNaN
\n", 248 | "

641546 rows × 8 columns

\n", 249 | "
" 250 | ], 251 | "text/plain": [ 252 | " status file_format file_type assembly award.rfa \\\n", 253 | "0 released bigWig bigWig GRCh38 ENCODE \n", 254 | "1 released bigWig bigWig GRCh38 ENCODE \n", 255 | "2 released bigWig bigWig GRCh38 ENCODE \n", 256 | "3 released bigWig bigWig GRCh38 ENCODE \n", 257 | "4 released bigWig bigWig GRCh38 ENCODE \n", 258 | "... ... ... ... ... ... \n", 259 | "641541 released bigWig bigWig hg19 ENCODE2 \n", 260 | "641542 released bigWig bigWig hg19 ENCODE2 \n", 261 | "641543 released bigWig bigWig hg19 ENCODE2 \n", 262 | "641544 released bigWig bigWig hg19 ENCODE2 \n", 263 | "641545 NaN NaN NaN NaN NaN \n", 264 | "\n", 265 | " output_type output_category file_size \n", 266 | "0 signal p-value signal 6.206849e+08 \n", 267 | "1 plus strand signal of all reads signal 6.236199e+08 \n", 268 | "2 signal p-value signal 6.222111e+08 \n", 269 | "3 signal p-value signal 6.442427e+08 \n", 270 | "4 signal p-value signal 6.222841e+08 \n", 271 | "... ... ... ... \n", 272 | "641541 summed densities signal signal 1.309956e+07 \n", 273 | "641542 wavelet-smoothed signal signal 1.015879e+07 \n", 274 | "641543 wavelet-smoothed signal signal 1.021096e+07 \n", 275 | "641544 signal signal 1.781798e+10 \n", 276 | "641545 NaN NaN NaN \n", 277 | "\n", 278 | "[641546 rows x 8 columns]" 279 | ] 280 | }, 281 | "execution_count": 88, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "import pandas as pd\n", 288 | "\n", 289 | "# read encode_file_manifest.tsv into a dataframe\n", 290 | "\n", 291 | "metadata = pd.read_table('encode_file_manifest.tsv',sep='\\t')\n", 292 | "\n", 293 | "metadata.iloc[:,[1,2,3,4,5,10,11,12]]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "# 3. 
Checking the count of specific files" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 94, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "There are 641546 files in this dataset\n", 313 | "There are 41423 ENCODE award.rfa files in this dataset\n", 314 | "There are 22487 ENCODE2 award.rfa files in this dataset:\n", 315 | "There are 163495 hg.19 assembled files in this dataset\n", 316 | "There are 281223 GRCh38 assembled files in this dataset\n", 317 | "There are 74579 signal p-value output type in this dataset\n", 318 | "There are 142 wavelet-smoothed signal output type in this dataset\n", 319 | "There are 16 summed densities signal output type in this dataset\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "# let's take a quick look around\n", 325 | "\n", 326 | "num_entries = len(metadata)\n", 327 | "\n", 328 | "print(\"There are {} files in this dataset\".format(num_entries))\n", 329 | "\n", 330 | "num_ENCODE=metadata['award.rfa'].eq('ENCODE').sum()\n", 331 | "\n", 332 | "print(\"There are {} ENCODE award.rfa files in this dataset\".format(num_ENCODE))\n", 333 | "\n", 334 | "num_ENCODE2=metadata['award.rfa'].eq('ENCODE2').sum()\n", 335 | "\n", 336 | "print(\"There are {} ENCODE2 award.rfa files in this dataset:\".format(num_ENCODE2))\n", 337 | "\n", 338 | "num_hg19=metadata['assembly'].eq('hg19').sum()\n", 339 | "\n", 340 | "print(\"There are {} hg.19 assembled files in this dataset\".format(num_hg19))\n", 341 | "\n", 342 | "\n", 343 | "num_GRCh38=metadata['assembly'].eq('GRCh38').sum()\n", 344 | "\n", 345 | "print(\"There are {} GRCh38 assembled files in this dataset\".format(num_GRCh38))\n", 346 | "\n", 347 | "\n", 348 | "num_signal=metadata['output_type'].eq('signal p-value').sum()\n", 349 | "\n", 350 | "print(\"There are {} signal p-value output type in this dataset\".format(num_signal))\n", 351 | "\n", 352 | "\n", 353 | "num_wavelet=metadata['output_type'].eq('wavelet-smoothed signal').sum()\n", 354 | "\n", 355 | "print(\"There are {} wavelet-smoothed signal output type in this dataset\".format(num_wavelet))\n", 356 | "\n", 357 | "\n", 358 | "num_density=metadata['output_type'].eq('summed densities signal').sum()\n", 359 | "\n", 360 | "print(\"There are {} summed densities signal output type in this dataset\".format(num_density))\n", 361 | "\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Reference\n", 369 | "\n", 370 | "1. 
[ENCODE: Encyclopedia of DNA Elements](https://www.encodeproject.org)\n", 371 | "\n" 372 | ] 373 | } 374 | ], 375 | "metadata": { 376 | "jupytext": { 377 | "formats": "ipynb,md" 378 | }, 379 | "kernelspec": { 380 | "display_name": "Python 3.6 - AzureML", 381 | "language": "python", 382 | "name": "python3-azureml" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": "python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.6.9" 395 | } 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 4 399 | } 400 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-opencravat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Download Datasets from Azure Genomics Data Lake with Jupyter Notebooks-OpenCRAVAT" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebooks are a great tool for data scientists who is working on Genomics data analysis. We will demonstrate how to donwload specific OpenCRAVAT dataset/database from Azure Open Datasets. \n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Download specific database from OpenCRAVAT datasets\n", 19 | "2. Deploy Data Science VM on Azure for OpenCRAVAT\n", 20 | "\n", 21 | "\n", 22 | "**Dependencies:**\n", 23 | "\n", 24 | "This notebook requires the following libraries:\n", 25 | "\n", 26 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 27 | "\n", 28 | "**Important information: This notebook is using Python 3.6 kernel**\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# 1. Getting the OpenCRAVAT data from Azure Open Datasets\n", 36 | "\n", 37 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. 
You can find examples of data calling procedure from Azure Open Dataset for `OpenCRAVAT` datasets in below:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "**1.a.Install Azure Blob Storage SDK**" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "pip install azure-storage-blob==2.1.0" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "**1.b.Download the targeted file**" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import os\n", 70 | "import uuid\n", 71 | "import sys\n", 72 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 73 | "\n", 74 | "blob_service_client = BlockBlobService(account_name='datasetopencravat', sas_token='sv=2020-04-08&st=2021-03-11T23%3A50%3A01Z&se=2025-07-26T22%3A50%3A00Z&sr=c&sp=rl&sig=J9J9wnJOXsmEy7TFMq9wjcxjXDE%2B7KhGpCUL4elsC14%3D') \n", 75 | "blob_service_client.get_blob_to_path('dataset/modules/gnomad/2.2.0', 'gnomad.yml', './gnomad.yml')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# 2. Deploy Data Science VM on Microsoft Azure for OpenCRAVAT" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Virtual Machine (VM) image in this page deploys a **Custom Data Science Virtual Machine for Genomics Analysis- Ubuntu 20.04**.\n", 90 | "\n", 91 | "If you are new to Azure virtual machines, see:\n", 92 | "- [Azure Virtual Machines](https://azure.microsoft.com/services/virtual-machines/)\n", 93 | "- [Azure Data Science VM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/)\n", 94 | "- [Azure Windows Virtual Machines documentation](https://docs.microsoft.com/azure/virtual-machines/windows/)\n", 95 | "\n", 96 | "\n", 97 | "This VM image allows you to deploy a recommended **Data Science Virtual Machine for using OpenCRAVAT**.\n", 98 | "\n", 99 | "BEFORE USE THIS VM Image- PLEASE READ: [Security recommendations for virtual machines in Azure](https://learn.microsoft.com/en-us/azure/virtual-machines/security-recommendations).\n", 100 | "\n", 101 | "ATTENTION: THIS VM Image JUST PROVIDES A SAMPLE SCHEMA FOR EDUCATIONAL PURPOSES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS VM TEMPLATE AND SAMPLE USE OF LIBRARIES ON AN “AS IS” BASIS.MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS VM TEMPLATE. 
TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS VM TEMPLATE.\n", 102 | "\n", 103 | "[Guideline: Deploy custom Genomics Data Science VM](https://datasettoaexample.blob.core.windows.net/publicsample/deploycommunitygenomics.mp4)\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "# References\n", 111 | "\n", 112 | "1.[OpenCRAVAT offical webpage](https://opencravat.org/)\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "jupytext": { 119 | "formats": "ipynb,md" 120 | }, 121 | "kernelspec": { 122 | "display_name": "Python 3 (ipykernel)", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.15" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-platinum-genomes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting the Illumina Platinum Genomes from Azure Open Datasets and Doing Initial Analysis " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebooks are a great tool for data scientists who is working on Genomics data analysis. We will demonstrate Azure Jupyter notebook usage via GATK and Picard with Azure Open Datasets. \n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "1. Annotate genotypes using VariantFiltration\n", 19 | "2. Select Specific Variants\n", 20 | "3. Filter the relevant variants- no calls OR specific regions\n", 21 | "4. Perform concordance analysis\n", 22 | "5. Convert the final VCF files to a table \n", 23 | "\n", 24 | "**Dependencies:**\n", 25 | "\n", 26 | "This notebook requires the following libraries:\n", 27 | "\n", 28 | "- Azure storage `pip install azure-storage-blob`\n", 29 | "\n", 30 | "- numpy `pip install numpy`\n", 31 | "\n", 32 | "- Genome Analysis Toolkit (GATK) (*Users need to download GATK from Broad Institute's webpage into the same compute environment with this notebook: https://github.com/broadinstitute/gatk/releases*)\n", 33 | "\n", 34 | "**Important information: This notebook is using Python 3.6 kernel**\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# Getting the Genomics data from Azure Open Datasets\n", 42 | "\n", 43 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. 
You can find examples of data calling procedure from Azure Open Dataset for `Illumina Platinum Genomes` datasets in below:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Downloading the specific 'Illumina Platinum Genomes'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import os\n", 60 | "import uuid\n", 61 | "import sys\n", 62 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 63 | "\n", 64 | "blob_service_client = BlockBlobService(account_name='datasetplatinumgenomes', sas_token='sv=2019-02-02&se=2050-01-01T08%3A00%3A00Z&si=prod&sr=c&sig=FFfZ0QaDcnEPQmWsshtpoYOjbzd4jtwIWeK%2Fc4i9MqM%3D') \n", 65 | "blob_service_client.get_blob_to_path('dataset/2017-1.0/hg38/small_variants/NA12877', 'NA12877.vcf.gz', './NA12877.vcf.gz')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# 1. Annotate genotypes using VariantFiltration\n", 73 | "\n", 74 | "**Important note: Please check your GATK is running on your system.**\n", 75 | "\n", 76 | "If we want to filter heterozygous genotypes, we use VariantFiltration's `--genotype-filter-expression isHet == 1` option. We can specify the annotation value for the tool to label the heterozygous genotypes with with the `--genotype-filter-name` option. Here, this parameter's value is set to `isHetFilter`. In our first example, we used `NA12877.vcf.gz` from Illimina Platinum Genomes but users can use any vcf files from other datasets:`Platinum Genomes`" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "run gatk VariantFiltration -V NA12877.vcf.gz -O outputannot.vcf --genotype-filter-expression \"isHet == 1\" --genotype-filter-name \"isHetFilter\"" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# 2. Select Specific Variants" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Select a subset of variants from a VCF file\n", 102 | "This tool makes it possible to select a subset of variants based on various criteria in order to facilitate certain analyses. Examples of such analyses include comparing and contrasting cases vs. controls, extracting variant or non-variant loci that meet certain requirements, or troubleshooting some unexpected results, to name a few.\n", 103 | "\n", 104 | "There are many different options for selecting subsets of variants from a larger call set:\n", 105 | "\n", 106 | "Extract one or more samples from a callset based on either a complete sample name or a pattern match.\n", 107 | "Specify criteria for inclusion that place thresholds on annotation values, **e.g. \"DP > 1000\" (depth of coverage greater than 1000x), \"AF < 0.25\" (sites with allele frequency less than 0.25)**.These criteria are written as \"JEXL expressions\", which are documented in the article about using JEXL expressions.\n", 108 | "Provide concordance or discordance tracks in order to include or exclude variants that are also present in other given callsets.\n", 109 | "Select variants based on criteria like their type (e.g. 
INDELs only), evidence of mendelian violation, filtering status, allelicity, etc.\n", 110 | "There are also several options for recording the original values of certain annotations which are recalculated when one subsets the new callset, trims alleles, etc.\n", 111 | "\n", 112 | "**Input**\n", 113 | "\n", 114 | "A variant call set in VCF format from which a subset can be selected.\n", 115 | "\n", 116 | "**Output**\n", 117 | "\n", 118 | "A new VCF file containing the selected subset of variants." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "scrolled": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "run gatk SelectVariants -R Homo_sapiens_assembly38.fasta -V outputannot.vcf --select-type-to-include SNP --select-type-to-include INDEL -O selective.vcf" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "# 3. Transform filtered genotypes to no call \n", 137 | "\n", 138 | "Running SelectVariants with --set-filtered-gt-to-nocall will further transform the flagged genotypes with a null genotype call. \n", 139 | "\n", 140 | "This conversion is necessary because downstream tools do not parse the FORMAT-level filter field." 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "How can we filter the variants with with **'No call'**\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "run gatk SelectVariants -V outputannot.vcf --set-filtered-gt-to-nocall -O outputnocall.vcf" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "# 4. Check the Concordance of VCF file with Ground Truth" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Evaluate site-level concordance of an input VCF against a truth VCF.\n", 171 | "This tool evaluates two variant callsets against each other and produces a six-column summary metrics table. \n", 172 | "\n", 173 | "**This function will :**\n", 174 | "\n", 175 | "1. Stratifies SNP and INDEL calls\n", 176 | "2. Report true-positive,False-positive and false-negative calls\n", 177 | "3. Calculates sensitivity and precision\n", 178 | "\n", 179 | "The tool assumes all records in the --truth VCF are passing truth variants. For the -eval VCF, the tool uses only unfiltered passing calls.\n", 180 | "\n", 181 | "Optionally, the tool can be set to produce VCFs of the following variant records, annotated with each variant's concordance status:\n", 182 | "\n", 183 | "True positives and false negatives (i.e. all variants in the truth VCF): useful for calculating sensitivity\n", 184 | "\n", 185 | "True positives and false positives (i.e. all variants in the eval VCF): useful for obtaining a training data set for machine learning classifiers of artifacts\n", 186 | "\n", 187 | "**These output VCFs can be passed to VariantsToTable to produce a TSV file for statistical analysis in R or Python.**" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "scrolled": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | " run gatk Concordance -R Homo_sapiens_assembly38.fasta -eval outputannot.vcf --truth outputnocall.vcf --summary summary.tsv " 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "# 5. 
VariantsToTable" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Extract fields from a VCF file to a tab-delimited table\n", 213 | "This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and FORMAT (i.e. sample-level) fields.\n", 214 | "\n", 215 | "\n", 216 | "**INFO/site-level fields**\n", 217 | "\n", 218 | "Use the `-F` argument to extract INFO fields; each field will occupy a single column in the output file. The field can be any standard VCF column (e.g. CHROM, ID, QUAL) or any annotation name in the INFO field (e.g. AC, AF). The tool also supports the following additional fields:\n", 219 | "\n", 220 | "EVENTLENGTH (length of the event)\n", 221 | "TRANSITION (1 for a bi-allelic transition (SNP), 0 for bi-allelic transversion (SNP), -1 for INDELs and multi-allelics)\n", 222 | "HET (count of het genotypes)\n", 223 | "HOM-REF (count of homozygous reference genotypes)\n", 224 | "HOM-VAR (count of homozygous variant genotypes)\n", 225 | "NO-CALL (count of no-call genotypes)\n", 226 | "TYPE (type of variant, possible values are NO_VARIATION, SNP, MNP, INDEL, SYMBOLIC, and MIXED\n", 227 | "VAR (count of non-reference genotypes)\n", 228 | "NSAMPLES (number of samples)\n", 229 | "NCALLED (number of called samples)\n", 230 | "MULTI-ALLELIC (is this variant multi-allelic? true/false)\n", 231 | "\n", 232 | "\n", 233 | "**FORMAT/sample-level fields**\n", 234 | "\n", 235 | "Use the `-GF` argument to extract FORMAT/sample-level fields. The tool will create a new column per sample with the name \"SAMPLE_NAME.FORMAT_FIELD_NAME\" e.g. NA12877.GQ, NA12878.GQ.\n", 236 | "\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "**Input**\n", 244 | "\n", 245 | "A VCF file to convert to a table\n", 246 | "\n", 247 | "**Output**\n", 248 | "\n", 249 | "A tab-delimited file containing the values of the requested fields in the VCF file.\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "run gatk VariantsToTable -V NA12877.vcf.gz -F CHROM -F POS -F TYPE -F AC -F AD -F AF -GF DP -GF AD -O outputtable.table" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# References\n", 266 | "\n", 267 | "1. VariantFiltration: https://gatk.broadinstitute.org/hc/en-us/articles/360036827111-VariantFiltration \n", 268 | "2. Select Variants:https://gatk.broadinstitute.org/hc/en-us/articles/360037052272-SelectVariants\n", 269 | "3. Concordance: https://gatk.broadinstitute.org/hc/en-us/articles/360041851651-Concordance\n", 270 | "4. Variants to table: https://gatk.broadinstitute.org/hc/en-us/articles/360036882811-VariantsToTable \n", 271 | "5. 
Illumina Platinum Genomes:https://www.illumina.com/platinumgenomes.html \n", 272 | "\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "**END OF NOTEBOOK**" 280 | ] 281 | } 282 | ], 283 | "metadata": { 284 | "jupytext": { 285 | "formats": "ipynb,md" 286 | }, 287 | "kernelspec": { 288 | "display_name": "Python 3.6 - AzureML", 289 | "language": "python", 290 | "name": "python3-azureml" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.6.9" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 4 307 | } 308 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-prereqs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# **Installing Cromwell on Azure Prerequisites**\n", 9 | "This guide goes through the steps to deploy the prerequisites listed on the Cromwell on Azure page [here](genomics.ipynb).\n", 10 | "\n", 11 | "\n", 12 | "**Here is the coverage of this notebook:**\n", 13 | "1. Install Azure CLI \n", 14 | "2. Download & Extract Azure AzCopy \n", 15 | "3. Download Picard Jar file \n", 16 | "4. Test Picard \n", 17 | "5. Install Helm \n", 18 | "6. Download extract GATK \n", 19 | "7. Download Cromwell \n", 20 | "8. Deploy Cromwell \n", 21 | "\n", 22 | "\n", 23 | "---\n", 24 | "\n", 25 | "### **1. Install Azure CLI**\n", 26 | "\n", 27 | "Reference link: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%%bash\n", 37 | "curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash" 38 | ] 39 | }, 40 | { 41 | "attachments": {}, 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### **2. Download & extract Azure AzCopy**\n", 46 | "\n", 47 | "Reference link: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%%bash\n", 57 | "wget https://aka.ms/downloadazcopy-v10-linux\n", 58 | "\n", 59 | "tar -xvzf downloadazcopy-v10-linux" 60 | ] 61 | }, 62 | { 63 | "attachments": {}, 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### **3. Download Picard Jar file**\n", 68 | "\n", 69 | "Reference Link: https://broadinstitute.github.io/picard/\n", 70 | "\n", 71 | "This one is currently set to use Release 2.27.5." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "%%bash\n", 81 | "wget https://github.com/broadinstitute/picard/releases/download/2.27.5/picard.jar" 82 | ] 83 | }, 84 | { 85 | "attachments": {}, 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### **4. Test Picard**\n", 90 | "\n", 91 | "Use these steps to verify the Picard.jar file is compatible with the Java version on the system." 
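Before running the Picard help command below, it can save time to confirm which Java runtime the notebook kernel actually sees, since an incompatible Java version is the most common reason `picard.jar` fails to launch. A minimal sketch, assuming `java` is already on the kernel's PATH (note that `java -version` reports on stderr, not stdout):

```python
# Minimal sketch: show the Java runtime visible to this kernel before invoking picard.jar.
import subprocess

result = subprocess.run(["java", "-version"], capture_output=True, text=True)
# `java -version` writes to stderr, so fall back to stdout only if stderr is empty.
print(result.stderr or result.stdout)
```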
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%%bash\n", 101 | "java -jar picard.jar -h" 102 | ] 103 | }, 104 | { 105 | "attachments": {}, 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "### **5. Install Helm**\n", 110 | "\n", 111 | "Reference Link: https://helm.sh/docs/intro/install/" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%%bash\n", 121 | "wget https://get.helm.sh/helm-v3.12.1-linux-amd64.tar.gz\n", 122 | "\n", 123 | "tar -xvzf helm-v3.12.1-linux-amd64.tar.gz\n" 124 | ] 125 | }, 126 | { 127 | "attachments": {}, 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "nteract": { 131 | "transient": { 132 | "deleting": false 133 | } 134 | } 135 | }, 136 | "source": [ 137 | "Next Copy the Helm File to /usr/local/bin/helm per the Helm install instructions.\n", 138 | "\n", 139 | "Reference Link: https://helm.sh/docs/intro/install/" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 37, 145 | "metadata": { 146 | "jupyter": { 147 | "outputs_hidden": false, 148 | "source_hidden": false 149 | }, 150 | "nteract": { 151 | "transient": { 152 | "deleting": false 153 | } 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "%%bash\n", 159 | "sudo cp linux-amd64/helm /usr/local/bin/helm" 160 | ] 161 | }, 162 | { 163 | "attachments": {}, 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### **6. Download extract GATK**\n", 168 | "\n", 169 | "Reference Link: https://github.com/broadinstitute/gatk/releases\n", 170 | "\n", 171 | "**Download the package:**" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "%%bash\n", 181 | "wget https://github.com/broadinstitute/gatk/archive/refs/tags/4.4.0.0.tar.gz" 182 | ] 183 | }, 184 | { 185 | "attachments": {}, 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "**Extract the package:**" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "gather": { 197 | "logged": 1687380091970 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "%%bash\n", 203 | "tar -xvzf 4.4.0.0.tar.gz" 204 | ] 205 | }, 206 | { 207 | "attachments": {}, 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "### **7. Download Cromwell package**\n", 212 | "\n", 213 | "Reference Link: https://github.com/microsoft/CromwellOnAzure/releases\n", 214 | "\n", 215 | "Download the package:" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "%%bash\n", 225 | "wget https://github.com/microsoft/CromwellOnAzure/releases/download/4.3.0/deploy-cromwell-on-azure-linux.tar.gz\n", 226 | "\n", 227 | "tar -xvzf deploy-cromwell-on-azure-linux.tar.gz" 228 | ] 229 | }, 230 | { 231 | "attachments": {}, 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### **8. Deploy Cromwell Package**\n", 236 | "\n", 237 | "First Login, this will display a URL and login code. \n", 238 | "Click the URL and enter the code." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "! 
az login" 248 | ] 249 | }, 250 | { 251 | "attachments": {}, 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "Next , run the deployment script.\n", 256 | "\n", 257 | "This takes a while to run, deploys all the resources in the named prefix resource group in the script parameters.\n", 258 | "\n", 259 | "* ENTERYOURSUBSCRIPTIONGUID = You can get the subscription ID from the base resource page. Copy it and use it here.\n", 260 | "* ENTERYOURPREFIX = Resource group prefix\n", 261 | "* ENTERYOURREGION = Not available in all regions." 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "gather": { 269 | "logged": 1687385591817 270 | } 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "# example:\n", 275 | "# ! ./deploy-cromwell-on-azure-linux --SubscriptionId 2a20e080-4248-468d-a088-5b157921afbf --RegionName eastus2 --MainIdentifierPrefix coaprefix\n", 276 | "\n", 277 | "! ./deploy-cromwell-on-azure-linux --SubscriptionId --RegionName --MainIdentifierPrefix \n" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernel_info": { 283 | "name": "python310-sdkv2" 284 | }, 285 | "kernelspec": { 286 | "display_name": "Python 3.10 - SDK v2", 287 | "language": "python", 288 | "name": "python310-sdkv2" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.10.11" 301 | }, 302 | "microsoft": { 303 | "ms_spell_check": { 304 | "ms_spell_check_language": "en" 305 | } 306 | }, 307 | "nteract": { 308 | "version": "nteract-front-end@1.0.0" 309 | }, 310 | "orig_nbformat": 4 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /sample-notebooks/genomics-reference-genomes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting the Reference Genomes from Azure Open Datasets\n", 8 | "\n", 9 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. You can find examples of data calling procedure from Azure Open Datasets for `Reference Genomes` dataset in below:\n", 10 | "\n", 11 | "Users can call and download the following path with this notebook: https://datasetreferencegenomes.blob.core.windows.net/dataset/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/genomic_regions_definitions.txt\n", 12 | "\n", 13 | "**Important note:** Users needs to log-in their Azure Account via Azure CLI for viewing the data with Azure ML SDK. 
On the other hand, they do not need do any actions for downloading the data.\n", 14 | "\n", 15 | "Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Calling the data from 'Reference Genome Datasets'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import azureml.core\n", 32 | "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from azureml.core import Dataset\n", 42 | "reference_dataset = Dataset.File.from_files('https://datasetreferencegenomes.blob.core.windows.net/dataset')\n", 43 | "mount = reference_dataset.mount()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import os\n", 53 | "\n", 54 | "REF_DIR = '/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure'\n", 55 | "path = mount.mount_point + REF_DIR\n", 56 | "\n", 57 | "with mount:\n", 58 | " print(os.listdir(path))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import pandas as pd\n", 68 | "\n", 69 | "# create mount context\n", 70 | "mount.start()\n", 71 | "\n", 72 | "# specify path to genomic_regions_definitions.txt file\n", 73 | "REF_DIR = 'vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure'\n", 74 | "metadata_filename = '{}/{}/{}'.format(mount.mount_point, REF_DIR, 'genomic_regions_definitions.txt')\n", 75 | "\n", 76 | "# read genomic_regions_definitions.txt file\n", 77 | "metadata = pd.read_table(metadata_filename)\n", 78 | "metadata" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Download the specific file" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "import os\n", 95 | "import uuid\n", 96 | "import sys\n", 97 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 98 | "\n", 99 | "blob_service_client = BlockBlobService(account_name='datasetreferencegenomes',sas_token='sv=2019-02-02&se=2050-01-01T08%3A00%3A00Z&si=prod&sr=c&sig=JtQoPFqiC24GiEB7v9zHLi4RrA2Kd1r%2F3iFt2l9%2FlV8%3D') \n", 100 | "blob_service_client.get_blob_to_path('dataset/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure', 'genomic_regions_definitions.txt', './genomic_regions_definitions.txt')" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "**END OF NOTEBOOK**" 108 | ] 109 | } 110 | ], 111 | "metadata": { 112 | "jupytext": { 113 | "formats": "ipynb,md" 114 | }, 115 | "kernelspec": { 116 | "display_name": "Python 3.6 - AzureML", 117 | "language": "python", 118 | "name": "python3-azureml" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | 
"pygments_lexer": "ipython3", 130 | "version": "3.6.9" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /sample-notebooks/graphragforgenomics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4bde8fc-2b61-430e-a181-7c6e2b799fe3", 6 | "metadata": {}, 7 | "source": [ 8 | "## Enhancing Genomics Annotation with GraphRAG\n", 9 | "\n", 10 | "The intersection of generative AI and genomics is rapidly reshaping how researchers understand and annotate complex biological data. Among the emerging techniques, GraphRAG (Graph Retrieval-Augmented Generation) stands out by integrating structured knowledge graphs with large language models to enhance contextual reasoning and data retrieval. In genomics annotation, where relationships between genes, proteins, and phenotypes are intricate and deeply interlinked, GraphRAG offers a novel approach to navigate this complexity with greater precision and interpretability. This notebook is a follow up study of our previous [RAG research paper](https://doi.org/10.1093/bioadv/vbaf019) and explores how GraphRAG can be leveraged to accelerate and improve the annotation of genomic sequences, bringing AI one step closer to becoming a trusted tool in molecular biology.\n", 11 | "\n", 12 | "### Requirements\n", 13 | "1. Python 3.10-3.12\n", 14 | "2. Sample annotation data: chr1_sample.txt\n", 15 | "\n", 16 | "### Dependencies: \n", 17 | "\n", 18 | "graphrag, networkx, matplotlib \n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "6980640e-d24c-44f2-8bc4-84f4ef7515c9", 25 | "metadata": {}, 26 | "source": [ 27 | "### 1. Install GraphRAG" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "fbca9d9a-4acb-49f0-84b9-d20d934d06b7", 34 | "metadata": { 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "!pip install graphrag" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "0afaa495-a2ab-41cc-9401-e9ecc458d454", 45 | "metadata": { 46 | "tags": [] 47 | }, 48 | "source": [ 49 | "### 2. Run genomics annotation indexer" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "5a9e3976-bf8c-41ac-89a1-f77bc85ec248", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "mkdir -p ./genomicsragtest/input" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "e5e5c59d-ef30-4ac4-ba5c-52588161eae4", 67 | "metadata": {}, 68 | "source": [ 69 | "Please download the sample data from: " 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "c646642c-9aed-4215-89c1-ce0a768c85ca", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "!curl https://raw.githubusercontent.com/microsoft/genomicsnotebook/refs/heads/main/docs/clinvar_chr1_sample_mini.txt -o ./genomicsragtest/input/clinvar_chr1_sample_mini.txt" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "f17a975f-ea5a-4bdb-a437-188817876364", 87 | "metadata": {}, 88 | "source": [ 89 | "### 3. 
Set Up Your Workspace Variables" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "id": "6d9bd6a7-0994-4f2d-bd6b-6851fd0e0699", 96 | "metadata": { 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "!graphrag init --root ./genomicsragtest" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "41c5f527-3185-4ffc-8660-83c08a1244e2", 107 | "metadata": {}, 108 | "source": [ 109 | "### 4. Running the Indexing pipeline" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "8a786c3f-f78c-4e3a-ab08-30f0c871d269", 116 | "metadata": { 117 | "scrolled": true, 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "!graphrag index --root .\"/genomicsragtest\"" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "6a0935e8-e12f-4f28-b03e-09cd0fedeed4", 128 | "metadata": {}, 129 | "source": [ 130 | "### 5. Running the Genomics Annotation Query Engine" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "6219b030-0e47-433b-9f29-0457b6053258", 137 | "metadata": { 138 | "tags": [] 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "!graphrag query \\\n", 143 | "--root ./genomicsragtest \\\n", 144 | "--method local \\\n", 145 | "--query \"Annotate chr1:930220\"\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "447ca492-28e4-4b8a-b31e-c4dd107fe3a8", 151 | "metadata": {}, 152 | "source": [ 153 | "### 6. Visualize and save the graph\n", 154 | "\n", 155 | "Attention: Please be sure that you installed the relevant libraries." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "6ca64c8f-dc6f-4031-89ae-00abad1233e2", 162 | "metadata": { 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "import networkx as nx\n", 168 | "import matplotlib.pyplot as plt\n", 169 | "# Load GraphML file\n", 170 | "G = nx.read_graphml(\"./genomicsragtest/output/graph.graphml\")\n", 171 | "pos = nx.spring_layout(G)\n", 172 | "# Draw the graph\n", 173 | "plt.figure(figsize=(50, 30))\n", 174 | "nx.draw(G, pos, with_labels=True, node_color='skyblue', edge_color='gray', node_size=5, font_size=8)\n", 175 | "plt.title(\"GraphML Visualization\")\n", 176 | "plt.savefig(\"graph_rag_output.png\", format=\"png\", dpi=500)\n", 177 | "plt.show()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "dbb987fc-cbec-412e-907f-6e991bdad08e", 183 | "metadata": {}, 184 | "source": [ 185 | "### References\n", 186 | "\n", 187 | "1. https://microsoft.github.io/graphrag/get_started/\n", 188 | "2. Learn more about [GraphRAG](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/)\n", 189 | "3. Access the GraphRAG GitHub repository - - https://github.com/microsoft/graphrag/\n", 190 | "4. Use the GraphRAG Solution accelerator - https://github.com/Azure-Samples/graphrag-accelerator\n", 191 | "5. For sample data source-ClinVAR: https://www.ncbi.nlm.nih.gov/clinvar/\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "f51cf22b-9951-4cc7-af15-4690975f38b4", 197 | "metadata": {}, 198 | "source": [ 199 | "### NOTICES" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "id": "535acead-70d9-4376-a50c-cf83591698aa", 205 | "metadata": {}, 206 | "source": [ 207 | "THIS NOTEBOOK JUST PROVIDE A SAMPLE CODES FOR EDUCATIONAL PURPOSES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. 
MICROSOFT PROVIDES THIS NOTEBOOK AND SAMPLE USE OF LIBRARIES ON AN “AS IS” BASIS. DATA OR ANY MATERIAL ON THIS NOTEBOOK. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS NOTEBOOK. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS NOTEBOOK." 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "c36d310f-118d-43dc-abcc-3864b9e4efe6", 213 | "metadata": {}, 214 | "source": [ 215 | "### END OF NOTEBOOK" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 3.10 - SDK v2", 222 | "language": "python", 223 | "name": "python310-sdkv2" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 3 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython3", 235 | "version": "3.10.16" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 5 240 | } 241 | -------------------------------------------------------------------------------- /sample-notebooks/igv_jupyter_extension_sample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0e50ac44", 6 | "metadata": {}, 7 | "source": [ 8 | "# Use Microsoft Azure Genomics Data Lake with IGV-Jupyter extension" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "33f2a8d4", 14 | "metadata": {}, 15 | "source": [ 16 | "Jupyter notebook is a great tool for data scientists who is working on Genomics data analysis. We will demonstrate Azure Jupyter notebook usage via 'Integrative Genomics Viewer Jupyter Extension (igv-jupyter) with Microsoft Azure Genomics Data Lake files.\n", 17 | "\n", 18 | "**Here is the coverage of this notebook:**\n", 19 | "\n", 20 | "1. Download the data from Azure Genomics Data Lake\n", 21 | "2. Cloning the igv-jupyter extension repo\n", 22 | "3. igv-jupyter extension installation and sample submissions\n", 23 | "\n", 24 | "**Dependencies:**\n", 25 | "\n", 26 | "This notebook requires the following libraries:\n", 27 | "\n", 28 | "- Azure storage `pip install azure-storage-blob==2.1.0`. Please visit [this page](https://github.com/Azure/azure-storage-python/wiki) for frequently encountered problem for this SDK.\n", 29 | "\n", 30 | "- IGV: Integrative Genomics Viewer Jupyter Extension (*We have used the sample codes from igv-jupyter sample notebooks: https://github.com/igvteam/igv-jupyter, https://pypi.org/project/igv-jupyter/*)\n", 31 | "\n", 32 | "- Technical note: [Explore Azure Genomics Data Lake with Azure Storage Explorer](https://github.com/microsoft/genomicsnotebook/blob/main/docs/Genomics_Data_Lake_Azure_Storage_Explorer.pdf)\n", 33 | "\n", 34 | "- Requirements:\n", 35 | "\n", 36 | " `python >= 3.6.4`\n", 37 | "\n", 38 | " `jupyterlab >= 3.0`\n", 39 | "\n", 40 | "**Important information: This notebook should be executed on Jupyter Lab Version 3.0 or higher. Users can install Jupyter Lab with `pip install jupyterlab==3.0` command.**\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "d1c2fed4", 46 | "metadata": {}, 47 | "source": [ 48 | "# 1. 
Download sample VCF data from Broad Institute's GATK Test Data on Azure Genomics Data Lake\n", 49 | "\n", 50 | "Several public genomics data has been uploaded as an Azure Open Dataset [here](https://azure.microsoft.com/services/open-datasets/catalog/). We create a blob service linked to this open datasets. Than, users can use IGV browser from Jupyter environment. We recommend to use Azure Machine Learning Studio for Jupyter Lab environment." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "70a9bcf2", 56 | "metadata": {}, 57 | "source": [ 58 | "**1.a.Install Azure Blob Storage SDK**" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "5a501e0d", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "pip install azure-storage-blob==2.1.0" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "734a8474", 74 | "metadata": {}, 75 | "source": [ 76 | "**1.b.Download the sample VCF and .tbi file from Microsoft Genomics Data Lake**" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "b4222db9", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "import os\n", 87 | "import uuid\n", 88 | "import sys\n", 89 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 90 | "\n", 91 | "blob_service_client = BlockBlobService(account_name='datasetgatktestdata', sas_token='sv=2020-04-08&si=prod&sr=c&sig=fzLts1Q2vKjuvR7g50vE4HteEHBxTcJbNvf%2FZCeDMO4%3D') \n", 92 | "blob_service_client.get_blob_to_path('dataset/1kgp/downsampled_vcf_hg38', '1kgp-50-exomes.vcf.gz', './1kgp-50-exomes.vcf.gz')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "2ce69441-b116-4c91-8ca9-a9fd768a3670", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import os\n", 103 | "import uuid\n", 104 | "import sys\n", 105 | "from azure.storage.blob import BlockBlobService, PublicAccess\n", 106 | "\n", 107 | "blob_service_client = BlockBlobService(account_name='datasetgatktestdata', sas_token='sv=2020-04-08&si=prod&sr=c&sig=fzLts1Q2vKjuvR7g50vE4HteEHBxTcJbNvf%2FZCeDMO4%3D') \n", 108 | "blob_service_client.get_blob_to_path('dataset/1kgp/downsampled_vcf_hg38', '1kgp-50-exomes.vcf.gz.tbi', './1kgp-50-exomes.vcf.gz.tbi')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "id": "00838f0b", 114 | "metadata": {}, 115 | "source": [ 116 | "## 2.igv-jupyter extension: sample submissions" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "cb31d253", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "pip install igv-jupyter" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "e46b3e3f-d4bb-4248-9c8f-df3b30696a35", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import igv_notebook\n", 137 | "\n", 138 | "igv_notebook.init()\n", 139 | "\n", 140 | "b = igv_notebook.Browser(\n", 141 | " {\n", 142 | " \"genome\": \"hg38\",\n", 143 | " \"locus\": \"chr22\",\n", 144 | " \"tracks\": [\n", 145 | " {\n", 146 | " \"url\": \"1kgp-50-exomes.vcf.gz\",\n", 147 | " \"indexURL\": \"1kgp-50-exomes.vcf.gz.tbi\",\n", 148 | " \"name\": \"Color by table, SVTYPE\",\n", 149 | " \"visibilityWindow\": -1,\n", 150 | " \"colorBy\": \"SVTYPE\",\n", 151 | " \"colorTable\": {\n", 152 | " \"DEL\": \"#ff2101\",\n", 153 | " \"INS\": \"#001888\",\n", 154 | " \"DUP\": \"#028401\",\n", 155 | " \"INV\": \"#008688\",\n", 156 | " \"CNV\": \"#8931ff\",\n", 157 | " \"BND\": 
\"#891100\",\n", 158 | " \"*\": \"#002eff\"\n", 159 | " }\n", 160 | " }]\n", 161 | " })" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "b1ab36fa", 167 | "metadata": {}, 168 | "source": [ 169 | "# References\n", 170 | "\n", 171 | "1. IGV-Jupyter: https://github.com/igvteam/igv-jupyter\n", 172 | "2. IGV-Jupyter project: https://pypi.org/project/igv-jupyter/\n", 173 | "3. 1000 Genomes Project: https://www.internationalgenome.org/" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "c23f41bd", 179 | "metadata": { 180 | "incorrectly_encoded_metadata": "tags=[] jp-MarkdownHeadingCollapsed=true" 181 | }, 182 | "source": [ 183 | "## Notices" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "189005b2", 189 | "metadata": {}, 190 | "source": [ 191 | "THIS NOTEBOOK HAS JUST A SAMPLE CODES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS NOTEBOOK AND SAMPLE USE OF igv-jupyter LIBRARIES ON AN “AS IS” BASIS. DATA OR ANY MATERIAL ON THIS NOTEBOOK. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS NOTEBOOK. TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS NOTEBOOK." 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "jupytext": { 197 | "formats": "ipynb,md" 198 | }, 199 | "kernelspec": { 200 | "display_name": "Python 3 (ipykernel)", 201 | "language": "python", 202 | "name": "python3" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.8.5" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /sample-notebooks/initial-notebook.md: -------------------------------------------------------------------------------- 1 | initial.md 2 | -------------------------------------------------------------------------------- /sample-notebooks/simtotable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Genomics Data Simulation to Machine Learning ready table " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Jupyter notebook is a great tool for data scientists who are working on Genomics data analysis. 
We will demonstrate the process of simulation of paired-end fastq files to downstream analysis ready table format with `ART, Cromwell on Azure, GATK and Picard` on Jupyter notebook.\n", 15 | "\n", 16 | "**Here is the coverage of this notebook:**\n", 17 | "\n", 18 | "**1.** Simulate Next Generation Sequencing Data with ART\n", 19 | "\n", 20 | "**2.** Convert fastq paired-end data to uBAM with Cromwell on Azure \n", 21 | "\n", 22 | "**3.** uBAM to VCF with Cromwell on Azure\n", 23 | " \n", 24 | " 3.1.Alignment and Variant Calling with Microsoft Genomics service\n", 25 | "\n", 26 | "**4.** Convert the gVCF file to a table format\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "**Dependencies:**\n", 31 | "\n", 32 | "This notebook requires the following libraries:\n", 33 | "\n", 34 | "- Azure CLI \n", 35 | "\n", 36 | "- AzCopy: Please install latest release of the `AzCopy`: https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10\n", 37 | "\n", 38 | "- Cromwell on Azure: Please download the latest release of `CoA` from: https://github.com/microsoft/CromwellOnAzure/releases\n", 39 | "\n", 40 | "- ART: ART is a set of simulation tools to generate synthetic next-generation sequencing reads. Please download the latest version of this tool from: \n", 41 | "https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm\n", 42 | "\n", 43 | "- Picard: Please download the latest release of the tool from https://broadinstitute.github.io/picard/\n", 44 | "\n", 45 | "- Genome Analysis Toolkit (GATK) (*Users need to download `GATK` from Broad Institute's webpage into the same compute environment with this notebook: https://github.com/broadinstitute/gatk/releases*)\n", 46 | "\n", 47 | "- Users need reference genome for using this notebook on their environment: [hg19.fasta](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-reference-genomes/)\n", 48 | "\n", 49 | "**Important information: This notebook is using Python 3.6 kernel**\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "# 1. Simulate Next Generation Sequencing Data with ART - **Sample Code**\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "lines_to_next_cell": 0 64 | }, 65 | "source": [ 66 | "We recommend to use ART ([quote from the ART's website](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm): \"_ART is a set of simulation tools to generate synthetic next-generation sequencing reads. ART simulates sequencing reads by mimicking real sequencing process with empirical error models or quality profiles summarized from large recalibrated sequencing data\"_) for NGS data simulation.\n", 67 | "\n", 68 | "This is a great tool to simulate a NGS data for different sequencing platforms. Simulated data sets are very close to the real genomics datasets. Users can test their own downstream analysis with the simulated data sets.\n", 69 | "\n", 70 | "In this notebook, we will demonstrate the 'paired sample fastq' simulation with sample codes. Please visit tool's website for further sample codes. 
\n", 71 | "\n", 72 | "Please download the ART binary files from this [link](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm) than just call the code in below.\n", 73 | "\n", 74 | "Based on the information on the manual of [ART](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm), parameters of the simulation are defined as follows: \n", 75 | "\n", 76 | "\n", 77 | " -ss --seqSys The name of Illumina sequencing system of the built-in profile used for simulation\n", 78 | " \n", 79 | " -i --in the filename of input DNA/RNA reference\n", 80 | " \n", 81 | " -p --paired indicate a paired-end read simulation or to generate reads from both ends of amplicons\n", 82 | " NOTE: art will automatically switch to a mate-pair simulation if the given mean fragment size >= 2000\n", 83 | " \n", 84 | " -l --len the length of reads to be simulated\n", 85 | " \n", 86 | " -f --fcov the fold of read coverage to be simulated or number of reads/read pairs generated for each amplicon\n", 87 | " \n", 88 | " -m --mflen the mean size of DNA/RNA fragments for paired-end simulations\n", 89 | " \n", 90 | " -s --sdev the standard deviation of DNA/RNA fragment size for paired-end simulations.\n", 91 | " \n", 92 | " -o --out the prefix of output filename\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "!./art_illumina -ss HS25 -sam -i hg19.fasta -p -l 150 -f 20 -m 200 -s 10 -o paired_dat" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Output report of this function will be: \n", 110 | "\n", 111 | " Paired-end sequencing simulation\n", 112 | "\n", 113 | " ** Parameters used during run **\n", 114 | " Read Length: 150\n", 115 | " Genome masking 'N' cutoff frequency: 1 in 150\n", 116 | " Fold Coverage: 20X\n", 117 | " Mean Fragment Length: 200\n", 118 | " Standard Deviation: 10\n", 119 | " Profile Type: Combined\n", 120 | " ID Tag: \n", 121 | "\n", 122 | " ** Quality Profile(s) **\n", 123 | " First Read: HiSeq 2500 Length 150 R1 (built-in profile) \n", 124 | " First Read: HiSeq 2500 Length 150 R2 (built-in profile) \n", 125 | "\n", 126 | " ** Output files **\n", 127 | "\n", 128 | " FASTQ Sequence Files: ~ 57.7 GB\n", 129 | " the 1st reads: paired_dat1.fq\n", 130 | " the 2nd reads: paired_dat2.fq \n", 131 | "\n", 132 | " ALN Alignment Files: ~ 60.4 GB\n", 133 | " the 1st reads: paired_dat1.aln\n", 134 | " the 2nd reads: paired_dat2.aln\n", 135 | "\n", 136 | " SAM Alignment File: ~ 129.2 GB\n", 137 | " paired_dat.sam \n", 138 | "\n", 139 | "Reference: [ART](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm)\n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "# 2. Convert fastq paired-end data to uBAM with Cromwell on Azure " 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Users needs to use the [\"Sequence data format conversion pipelines on Azure\"](https://github.com/microsoft/CromwellOnAzure/blob/master/docs/example-fastq-to-ubam.md/#Example-workflow-to-convert-FASTQ-files-to-uBAM-files) for converting the simulated fastq files to uBAM files. 
Here is the brief information about this pipeline.\n", 155 | "\n", 156 | "### paired-fastq-to-unmapped-bam :\n", 157 | "This WDL converts paired FASTQ to uBAM and adds read group information \n", 158 | "\n", 159 | "#### Requirements/expectations \n", 160 | "- Pair-end sequencing data in FASTQ format (one file per orientation)\n", 161 | "- The following metada descriptors per sample: \n", 162 | " - readgroup \n", 163 | " - sample_name\n", 164 | " - library_name\n", 165 | " - platform_unit\n", 166 | " - run_date\n", 167 | " - platform_name\n", 168 | " - sequecing_center\n", 169 | " \n", 170 | "#### Outputs \n", 171 | "- Unmapped BAM " 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# 3. uBAM to gVCF with Cromwell on Azure\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "The next step on this notebooks is 'variant calling' analysis for further downstream analysis. We recommend to use 'gatk4 genome processing pipeline' for this phase. Users can use the existed pipelines from [\"gatk4-genome-processing-pipeline-azure\"](https://github.com/microsoft/gatk4-genome-processing-pipeline-azure/blob/master-azure/README.md). Here is the inputs and outputs of this pipeline.\n", 186 | "\n", 187 | "## gatk4-genome-processing-pipeline\n", 188 | "Workflows used for germline processing in whole genome sequence data.\n", 189 | "\n", 190 | "### WholeGenomeGermlineSingleSample :\n", 191 | "This WDL pipeline implements data pre-processing and initial variant calling (GVCF\n", 192 | "generation) according to the GATK Best Practices (June 2016) for germline SNP and\n", 193 | "Indel discovery in human whole-genome sequencing data.\n", 194 | "\n", 195 | "#### Requirements/expectations\n", 196 | "- Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format\n", 197 | "- One or more read groups, one per uBAM file, all belonging to a single sample (SM)\n", 198 | "- Input uBAM files must additionally comply with the following requirements:\n", 199 | "- - filenames all have the same suffix (we use \".unmapped.bam\")\n", 200 | "- - files must pass validation by ValidateSamFile\n", 201 | "- - reads are provided in query-sorted order\n", 202 | "- - all reads must have an RG tag\n", 203 | "- Reference genome must be Hg38 with ALT contigs\n", 204 | "\n", 205 | "#### Outputs \n", 206 | "- Cram, cram index, and cram md5 \n", 207 | "- GVCF and its gvcf index \n", 208 | "- BQSR Report\n", 209 | "- Several Summary Metrics " 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## 3.1. Alignment and Variant Calling with Microsoft Genomics service- Optional\n", 217 | "\n", 218 | "Users can also use the [Microsoft Genomics service](https://azure.microsoft.com/en-us/services/genomics/) for alligment and variant calling process. The Microsoft Genomics client (msgen) is a Python front-end to the web service. It can be\n", 219 | "installed like a standard Python package, on Windows or Linux using the Python pip package manager (“pip install msgen”). 
For each genome sample that you want to process, you create a configuration file containing all the parameters for downloading the data, running the Microsoft Genomics pipeline, and uploading the results:\n", 220 | "\n", 221 | "• Your subscription key to Microsoft\n", 222 | "Genomics\n", 223 | "\n", 224 | "• The process to run and its parameters\n", 225 | "\n", 226 | "• Path information and storage account\n", 227 | "keys for the input files in either paired\n", 228 | "FASTQ, paired compressed FASTQ, or\n", 229 | "BAM format, in Azure Storage\n", 230 | "\n", 231 | "• Path information and storage account\n", 232 | "key for the location to place the output files in Azure Storage\n", 233 | "\n", 234 | "You can then invoke the msgen client to initiate processing, and monitor progress until the job is complete. The final aligned reads in BAM format, and variant calls in VCF.GZ format will be placed in your designated output container in Azure Storage. The client can easily be incorporated into existing workflows. Here is the sample code for calling Microsoft Genomics service from Python client.\n", 235 | "\n", 236 | "Please visit [quick start run](https://docs.microsoft.com/en-us/azure/genomics/quickstart-run-genomics-workflow-portal) page for sample job submission to the service.\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "! ./msgen submit -f ./config.txt -b1 paired_dat1.fq -b2 paired_dat2.fq" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "# 4. Convert the final gVCF file to a table format -VariantsToTable" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "The optional final step before downstream analysis is converting gvcf file to a table format for specific parameters. \n", 260 | "\n", 261 | "Extract fields from a VCF file to a tab-delimited table\n", 262 | "This tool extracts specified fields for each variant in a VCF file to a tab-delimited table, which may be easier to work with than a VCF. By default, the tool only extracts PASS or . (unfiltered) variants in the VCF file. Filtered variants may be included in the output by adding the --show-filtered flag. The tool can extract both INFO (i.e. site-level) fields and FORMAT (i.e. sample-level) fields. \n", 263 | "\n", 264 | "Reference: [Variants to table](https://gatk.broadinstitute.org/hc/en-us/articles/360036882811-VariantsToTable)\n", 265 | "\n", 266 | "\n", 267 | "**INFO/site-level fields**\n", 268 | "\n", 269 | "Use the `-F` argument to extract INFO fields; each field will occupy a single column in the output file. The field can be any standard VCF column (e.g. CHROM, ID, QUAL) or any annotation name in the INFO field (e.g. AC, AF). 
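Because the extracted table is plain tab-delimited text, it drops straight into pandas for the downstream analysis mentioned above. A minimal sketch, assuming the `outputtable.table` output name used in the VariantsToTable command later in this notebook:

```python
# Minimal sketch: load the tab-delimited VariantsToTable output for analysis in Python.
import pandas as pd

variants = pd.read_csv("outputtable.table", sep="\t")
print(variants.shape)
variants.head()
```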
The tool also supports the following additional fields:\n", 270 | "\n", 271 | "EVENTLENGTH (length of the event)\n", 272 | "TRANSITION (1 for a bi-allelic transition (SNP), 0 for bi-allelic transversion (SNP), -1 for INDELs and multi-allelics)\n", 273 | "HET (count of het genotypes)\n", 274 | "HOM-REF (count of homozygous reference genotypes)\n", 275 | "HOM-VAR (count of homozygous variant genotypes)\n", 276 | "NO-CALL (count of no-call genotypes)\n", 277 | "TYPE (type of variant, possible values are NO_VARIATION, SNP, MNP, INDEL, SYMBOLIC, and MIXED\n", 278 | "VAR (count of non-reference genotypes)\n", 279 | "NSAMPLES (number of samples)\n", 280 | "NCALLED (number of called samples)\n", 281 | "MULTI-ALLELIC (is this variant multi-allelic? true/false)\n", 282 | "\n", 283 | "\n", 284 | "**FORMAT/sample-level fields**\n", 285 | "\n", 286 | "Use the `-GF` argument to extract FORMAT/sample-level fields. The tool will create a new column per sample with the name \"SAMPLE_NAME.FORMAT_FIELD_NAME\" e.g. NA12877.GQ, NA12878.GQ.\n", 287 | "\n" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "**Input**\n", 295 | "\n", 296 | "A VCF file to convert to a table\n", 297 | "\n", 298 | "**Output**\n", 299 | "\n", 300 | "A tab-delimited file containing the values of the requested fields in the VCF file.\n" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "!./gatk VariantsToTable -V simoutput.g.vcf.gz -F CHROM -F POS -F TYPE -F AC -F AD -F AF -GF DP -GF AD -O outputtable.table" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "# References\n", 317 | "\n", 318 | "1. Cromwell on Azure: https://github.com/microsoft/CromwellOnAzure/releases\n", 319 | "\n", 320 | "2. ART: https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm\n", 321 | "\n", 322 | "3. Variants to table: https://gatk.broadinstitute.org/hc/en-us/articles/360036882811-VariantsToTable \n", 323 | "\n", 324 | "4. Picard: https://broadinstitute.github.io/picard/ \n", 325 | "\n", 326 | "5. AzCopy: https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10\n", 327 | " \n" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "# NOTICES\n", 335 | "\n", 336 | "THIS NOTEBOOK HAS JUST A SAMPLE CODES. MICROSOFT DOES NOT CLAIM ANY OWNERSHIP ON THESE CODES AND LIBRARIES. MICROSOFT PROVIDES THIS NOTEBOOK AND SAMPLE USE OF ART'S SIMULATION LIBRARIES ON AN “AS IS” BASIS. MICROSOFT MAKES NO WARRANTIES, EXPRESS OR IMPLIED, GUARANTEES OR CONDITIONS WITH RESPECT TO YOUR USE OF THIS NOTEBOOK. 
TO THE EXTENT PERMITTED UNDER YOUR LOCAL LAW, MICROSOFT DISCLAIMS ALL LIABILITY FOR ANY DAMAGES OR LOSSES, INCLUDING DIRECT, CONSEQUENTIAL, SPECIAL, INDIRECT, INCIDENTAL OR PUNITIVE, RESULTING FROM YOUR USE OF THIS NOTEBOOK.\n", 337 | "\n", 338 | "**END OF NOTEBOOK**" 339 | ] 340 | } 341 | ], 342 | "metadata": { 343 | "jupytext": { 344 | "formats": "ipynb,md" 345 | }, 346 | "kernelspec": { 347 | "display_name": "Python 3.6 - AzureML", 348 | "language": "python", 349 | "name": "python3-azureml" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.6.9" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 4 366 | } 367 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/1000genomes/README.md: -------------------------------------------------------------------------------- 1 | 2 | # .vcf to .parquet conversion for 1000 Genomes dataset 3 | 4 | `vcf2parquet-1000genomes.ipynb` is a notebook used to convert variant call information from .vcf to .parquet for [1000 Genomes datatset](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-1000-genomes/) using [Glow](https://projectglow.io/). 5 | 6 | Notebook was tested on Azure Databricks Runtime 8.1 and Glow v1.0.1, cluster worker type Standard_DS13_v2 (min - 7, max - 15). 7 | 8 | We also provide SQL scripts to create external tables on curated 1000 Genomes dataset and to query it with [Azure Synapse Analytics](https://azure.microsoft.com/en-us/services/synapse-analytics/). 9 | 10 | 11 | ### Curated 1000 Genomes data access 12 | 13 | East US: https://curated1000genomes.blob.core.windows.net/dataset 14 | 15 | [SAS Token](https://docs.microsoft.com/en-us/azure/storage/common/storage-sas-overview): sv=2018-03-28&si=prod&sr=c&sig=BgIomQanB355O4FhxqBL9xUgKzwpcVlRZdBewO5%2FM4E%3D 16 | 17 | ### Data schema 18 | 19 | Notebook creates 2 copies of dataset with different schema: 20 | * [Default Glow schema](https://glow.readthedocs.io/en/latest/etl/variant-data.html#vcf) that mirrors a single row of a VCF + hashId. 21 | 22 | Information that applies to an entire variant, such as the contig name, start and end positions, and INFO attributes, is contained in columns. The genotypes, which correspond to the GT FORMAT fields in a VCF, are contained in an array with one entry per sample. Each entry is a struct with fields that are described in the VCF header. Column hashId is constructed as the hash code of all columns except genotypes using the 64-bit variant of the xxHash algorithm, hashId might be used as unique variant id. 23 | 24 | * Flattened schema 25 | 26 | Information that applies to an entire variant (hashId included) stays the same. The genotypes get replaced by set of columns, one per each struct field - to achieve that we have to copy variant information for each entry of genotypes array. Schema is consistent across all chromosomes in the dataset. 27 | 28 | 29 | Format conversion is done for each chromosome separately, so default data schema might differ between chromosomes. In case of flattened schema we add empty columns when needed to match schema across all chromosomes. 
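For exploring the curated output directly from Spark (for example on an Azure Databricks cluster) instead of through Synapse serverless SQL, the same public SAS token can be used. A minimal sketch, assuming a ready `spark` session and the `flattened/chr22` folder layout written by the conversion notebook:

```python
# Minimal sketch: read one chromosome of the curated (flattened) parquet output with Spark,
# using the public SAS token listed above for the curated1000genomes container.
storage_account = "curated1000genomes"
container = "dataset"
sas_token = "sv=2018-03-28&si=prod&sr=c&sig=BgIomQanB355O4FhxqBL9xUgKzwpcVlRZdBewO5%2FM4E%3D"

spark.conf.set(
    f"fs.azure.sas.{container}.{storage_account}.blob.core.windows.net", sas_token)

# Folder layout assumed from the conversion notebook: <schema>/<chromosome>/*.snappy.parquet
chr22_flat = spark.read.parquet(
    f"wasbs://{container}@{storage_account}.blob.core.windows.net/flattened/chr22")
chr22_flat.printSchema()
print(chr22_flat.count())
```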
30 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/1000genomes/sql/sampleQueriesFlattened.sql: -------------------------------------------------------------------------------- 1 | ---switch to database created by setup.sql 2 | USE [1000Genomes] 3 | GO 4 | 5 | 6 | --- create external table for flattened data schema 7 | CREATE EXTERNAL TABLE [chr22Flattened] ( 8 | [contigName] varchar(2), 9 | [start] bigint, 10 | [end] bigint, 11 | [names] varchar(150), 12 | [referenceAllele] varchar(150), 13 | [alternateAlleles] varchar(700), 14 | [qual] float, 15 | [filters] varchar(10), 16 | [splitFromMultiAllelic] bit, 17 | [INFO_MEND] int, 18 | [INFO_AC] varchar(60), 19 | [INFO_CIEND] varchar(15), 20 | [INFO_NS] int, 21 | [INFO_AFR_AF] varchar(100), 22 | [INFO_VT] varchar(30), 23 | [INFO_AN] int, 24 | [INFO_MULTI_ALLELIC] bit, 25 | [INFO_SAS_AF] varchar(100), 26 | [INFO_CIPOS] varchar(15), 27 | [INFO_AA] varchar(150), 28 | [INFO_AF] varchar(150), 29 | [INFO_EAS_AF] varchar(100), 30 | [INFO_AMR_AF] varchar(100), 31 | [INFO_DP] int, 32 | [INFO_SVLEN] varchar(10), 33 | [INFO_MLEN] int, 34 | [INFO_MEINFO] varchar(30), 35 | [INFO_IMPRECISE] bit, 36 | [INFO_CS] varchar(15), 37 | [INFO_MC] varchar(450), 38 | [INFO_END] int, 39 | [INFO_MSTART] int, 40 | [INFO_EUR_AF] varchar(100), 41 | [INFO_EX_TARGET] bit, 42 | [INFO_TSD] varchar(35), 43 | [INFO_SVTYPE] varchar(10), 44 | [INFO_OLD_VARIANT] varchar(200), 45 | [hashId] bigint, 46 | [genotypes_sampleId] varchar(10), 47 | [genotypes_phased] bit, 48 | [genotypes_calls] varchar(10), 49 | [genotypes_CNL] varchar(60), 50 | [genotypes_filters] varchar(1), 51 | [genotypes_CN] int, 52 | [genotypes_CNP] varchar(60), 53 | [genotypes_CNQ] float, 54 | [genotypes_phredLikelihoods] varchar(30), 55 | [genotypes_conditionalQuality] int, 56 | [genotypes_posteriorProbabilities] varchar(60) 57 | ) 58 | WITH ( 59 | --- we create table for chromosome 22 60 | --- change chromosome name to create table for chromosome of interest 61 | LOCATION = 'flattened/chr22/*.snappy.parquet', 62 | DATA_SOURCE = [1000GenomesVariants], 63 | FILE_FORMAT = [SynapseParquetFormat] 64 | ) 65 | GO 66 | 67 | 68 | --- Coiunt number of calls 69 | SELECT COUNT_BIG(*) 70 | FROM [dbo].[chr22Flattened] 71 | GO 72 | 73 | 74 | --- Filter true calls (by genotype) by sample name 75 | SELECT TOP(100) [names], 76 | [contigName], 77 | [start], 78 | [referenceAllele], 79 | [alternateAlleles], 80 | JSON_VALUE(genotypes_calls, '$[0]') as call1, 81 | JSON_VALUE(genotypes_calls, '$[1]') as call2 82 | FROM [dbo].[chr22Flattened] 83 | WHERE genotypes_sampleId = 'NA12878' AND (JSON_VALUE(genotypes_calls, '$[0]') > 0 OR JSON_VALUE(genotypes_calls, '$[1]') > 0); 84 | GO 85 | 86 | 87 | --- Count true calls (by genotype) per sample 88 | SELECT [genotypes_sampleId], 89 | COUNT (genotypes_sampleId) as true_calls_count 90 | FROM [dbo].[chr22Flattened] 91 | WHERE (JSON_VALUE(genotypes_calls, '$[0]') > 0 OR JSON_VALUE(genotypes_calls, '$[1]') > 0) 92 | GROUP BY genotypes_sampleId 93 | ORDER BY true_calls_count; 94 | GO 95 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/1000genomes/sql/sampleQueriesNested.sql: -------------------------------------------------------------------------------- 1 | ---switch to database created by setup.sql 2 | USE [1000Genomes] 3 | GO 4 | 5 | --- create external table for nested data schema 6 | CREATE EXTERNAL TABLE [nested] ( 7 | [contigName] varchar(2), 8 | [start] bigint, 9 | [end] bigint, 
10 | [names] varchar(150), 11 | [referenceAllele] varchar(150), 12 | [alternateAlleles] varchar(700), 13 | [qual] float, 14 | [filters] varchar(10), 15 | [splitFromMultiAllelic] bit, 16 | [INFO_MEND] int, 17 | [INFO_AC] varchar(60), 18 | [INFO_CIEND] varchar(15), 19 | [INFO_NS] int, 20 | [INFO_AFR_AF] varchar(100), 21 | [INFO_VT] varchar(30), 22 | [INFO_AN] int, 23 | [INFO_MULTI_ALLELIC] bit, 24 | [INFO_SAS_AF] varchar(100), 25 | [INFO_CIPOS] varchar(15), 26 | [INFO_AA] varchar(150), 27 | [INFO_AF] varchar(150), 28 | [INFO_EAS_AF] varchar(100), 29 | [INFO_AMR_AF] varchar(100), 30 | [INFO_DP] int, 31 | [INFO_SVLEN] varchar(10), 32 | [INFO_MLEN] int, 33 | [INFO_MEINFO] varchar(30), 34 | [INFO_IMPRECISE] bit, 35 | [INFO_CS] varchar(15), 36 | [INFO_MC] varchar(450), 37 | [INFO_END] int, 38 | [INFO_MSTART] int, 39 | [INFO_EUR_AF] varchar(100), 40 | [INFO_EX_TARGET] bit, 41 | [INFO_TSD] varchar(35), 42 | [INFO_SVTYPE] varchar(10), 43 | [INFO_OLD_VARIANT] varchar(200), 44 | [genotypes] varchar(MAX), 45 | [hashId] bigint 46 | ) 47 | WITH ( 48 | --- we create table for full dataset 49 | --- specify chromosome name to create table for chromosome of interest, for example 'nested/chr22/*.snappy.parquet' for chromosome 22 50 | LOCATION = 'nested/*/*.snappy.parquet', 51 | DATA_SOURCE = [1000GenomesVariants], 52 | FILE_FORMAT = [SynapseParquetFormat] 53 | ) 54 | GO 55 | 56 | --- Count number of variants 57 | SELECT COUNT_BIG(*) 58 | FROM [dbo].[nested] 59 | GO 60 | 61 | 62 | --- Count variants per chromosome 63 | SELECT [contigName], 64 | COUNT_BIG (contigName) as variantCount 65 | FROM [dbo].[nested] 66 | GROUP BY contigName 67 | ORDER BY variantCount; 68 | GO 69 | 70 | 71 | --- Filter variants on multiple fields 72 | SELECT TOP (10) [contigName], 73 | [referenceAllele], 74 | [alternateAlleles], 75 | [INFO_MULTI_ALLELIC], 76 | [start], 77 | [names], 78 | [INFO_AFR_AF], 79 | [INFO_VT], 80 | [AFR_AF] 81 | FROM [dbo].[nested] 82 | CROSS APPLY OPENJSON (INFO_AFR_AF) WITH (AFR_AF float '$') 83 | WHERE ([referenceAllele] = 'A') AND ([AFR_AF] > 0.3); 84 | GO 85 | 86 | 87 | SELECT TOP (10) [contigName], 88 | [start], 89 | [names], 90 | [referenceAllele], 91 | [alternateAlleles], 92 | [filters], 93 | [INFO_AC], 94 | [INFO_AA], 95 | [INFO_AF], 96 | [INFO_DP] 97 | FROM [dbo].[nested] 98 | CROSS APPLY OPENJSON (alternateAlleles) WITH (alternateAllele VARCHAR(700) '$') 99 | WHERE ([alternateAllele]= 'C') AND ([INFO_DP]> 2); 100 | GO 101 | 102 | 103 | --- Filter true calls (by genotype) on start position 104 | SELECT TOP (10) [contigName], 105 | [sampleId], 106 | [start], 107 | [names], 108 | [referenceAllele], 109 | [alternateAlleles], 110 | [INFO_AC], 111 | [INFO_AA], 112 | [INFO_AF], 113 | [INFO_DP], 114 | [call1], 115 | [call2] 116 | FROM [dbo].[nested] 117 | CROSS APPLY openjson (genotypes) WITH (sampleId VARCHAR(10), call1 INT '$.calls[0]', call2 INT '$.calls[1]') 118 | WHERE ([call1] > 0 OR [call2] > 0) AND ( [start] >= 23000000); 119 | GO 120 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/1000genomes/sql/setup.sql: -------------------------------------------------------------------------------- 1 | --- create database 2 | CREATE DATABASE [1000Genomes] 3 | COLLATE SQL_Latin1_General_CP1_CI_AS; 4 | GO 5 | 6 | USE [1000Genomes] 7 | GO 8 | 9 | --- create external file format 10 | IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'SynapseParquetFormat') 11 | CREATE EXTERNAL FILE FORMAT [SynapseParquetFormat] 12 | WITH ( FORMAT_TYPE = 
PARQUET) 13 | GO 14 | 15 | --- create credential 16 | CREATE MASTER KEY 17 | GO 18 | CREATE DATABASE SCOPED CREDENTIAL [PublicCredential] 19 | WITH IDENTITY='SHARED ACCESS SIGNATURE', 20 | SECRET = 'sv=2018-03-28&si=prod&sr=c&sig=BgIomQanB355O4FhxqBL9xUgKzwpcVlRZdBewO5%2FM4E%3D' 21 | GO 22 | 23 | 24 | --- create external data source 25 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = '1000GenomesVariants') 26 | CREATE EXTERNAL DATA SOURCE [1000GenomesVariants] 27 | WITH ( 28 | LOCATION = 'https://curated1000genomes.blob.core.windows.net/dataset', 29 | CREDENTIAL = PublicCredential 30 | ) 31 | GO 32 | 33 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/1000genomes/vcf2parquet-1000genomes.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["from pyspark.sql.functions import explode, col, lit, xxhash64\n","from math import ceil\n","\n","# Import glow.py and register Glow package\n","import glow\n","glow.register(spark)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"42f3599e-f63c-479f-92d7-846e09cd39ac"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# Provide your storage account, container and SAS token\n","outputStorageAccount = 'Your account name' # replace with your account name\n","outputContainer = 'Your container name' # replace with your container name\n","outputSAS = 'Your SAS token' # replace with your SAS token\n","outputDir = 'Your path' # replace with your relative path"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5b1aced3-8185-4272-950c-ac28826997f3"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# Configure session credentials\n","# Set up a SAS for a container with public data - no changes needed here (public SAS)\n","spark.conf.set(\n"," \"fs.azure.sas.dataset.dataset1000genomes.blob.core.windows.net\",\n"," \"sv=2019-10-10&si=prod&sr=c&sig=9nzcxaQn0NprMPlSh4RhFQHcXedLQIcFgbERiooHEqM%3D\")\n","\n","# Set up a SAS for a container to store .parquet files\n","spark.conf.set(\n"," \"fs.azure.sas.\"+outputContainer+\".\"+outputStorageAccount+\".blob.core.windows.net\", outputSAS)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"32cd967d-57af-4c71-8249-6bb2a44c65f2"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# DropDuplicates() partitions into 200 pieces (default value)\n","# To change default number of partitions change config - sqlContext.setConf(\"spark.sql.shuffle.partitions\", )\n","partitionMax = 1500\n","sqlContext.setConf(\"spark.sql.shuffle.partitions\", partitionMax)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"67bf49c0-865a-4b3b-8657-16f228047d47"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# Flatten struct columns\n","def flattenStructFields(df):\n"," flat_cols = [c[0] for c in df.dtypes if c[1][:6] != 'struct']\n"," nested_cols = [c[0] for c in df.dtypes if c[1][:6] =='struct']\n"," flat_df = df.select(flat_cols + \n"," [col(nc+'.'+c).alias(nc+'_'+c)\n"," for nc in nested_cols\n"," for c in df.select(nc+'.*').columns])\n"," return flat_df\n","\n","# Add empty columns to match schema\n","def completeSchema(df, diffSet):\n"," full_df = df\n"," for column in diffSet:\n"," full_df = full_df.withColumn(column.name, lit(None).cast(column.dataType.simpleString()))\n"," return full_df\n","\n","# Transform dataframe with original vcf schema\n","def transformVcf(df, toFlatten, toHash,
fullSchemaFields):\n"," # Drop duplicates\n"," dataDedup = df.dropDuplicates()\n"," \n"," # Add hashId column to identify variants\n"," if toHash:\n"," hashCols = list(set(data.columns) - {'genotypes'})\n"," dataHashed = dataDedup.withColumn('hashId', xxhash64(*hashCols))\n"," else:\n"," dataHashed = dataDedup\n"," \n"," # Flatten data - explode on genotypes, create separate column for each genotypes field, add empty columns to match schema to full dataset\n"," if not toFlatten:\n"," dataFinal = dataHashed\n"," else:\n"," # Explode and flatten data\n"," dataExploded = dataHashed.withColumn('genotypes', explode('genotypes'))\n"," dataExplodedFlatten = flattenStructFields(dataExploded)\n"," # Find schema for contig dataset and add columns to match full schema\n"," contigSet = set(dataExplodedFlatten.schema.fields)\n"," diffSet =(fullSchemaFields - contigSet)\n"," dataFinal = completeSchema(dataExplodedFlatten, diffSet)\n"," \n"," return dataFinal"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"902378ed-6612-4baf-bc7b-0f98e3beacd0"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# Create widgets for toFlatten and contigs\n","flatOptions = [False, True]\n","dbutils.widgets.dropdown(\"flatten\", \"False\", [str(x) for x in flatOptions])\n","\n","contigOptions = list(map(str, range(1, 23)))\n","contigLiterals = ['X','Y','MT', 'All']\n","contigOptions.extend(contigLiterals)\n","dbutils.widgets.multiselect(\"contigsToProcess\", \"22\", contigOptions)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"66eb007d-ee1e-40e9-be3e-f438bd23c470"}},"outputs":[],"execution_count":null},{"cell_type":"code","source":["# Define parameters\n","toFlatten = eval(getArgument(\"flatten\"))\n","toHash = True\n","repartitionCoef = 45 / 1000000 # gives ~20MB .parquet files\n","\n","# Define contig list\n","contigs = getArgument(\"contigsToProcess\").split(\",\")\n","if \"All\" in contigs:\n"," contigs = contigOptions\n"," contigs.remove('All')\n","\n","# Find schema for full dataset\n","sourceAll = \"wasbs://dataset@dataset1000genomes.blob.core.windows.net/release/20130502/ALL.chr*.vcf.gz\"\n","dataAll = spark.read\\\n"," .format(\"vcf\")\\\n"," .option(\"includeSampleIds\", True)\\\n"," .option(\"flattenInfoFields\", True)\\\n"," .load(sourceAll)\n","\n","dataAllExploded = dataAll.withColumn('genotypes', explode('genotypes'))\n","dataAllExplodedFlatten = flattenStructFields(dataAllExploded)\n","fullSet = set(dataAllExplodedFlatten.schema.fields)\n"," \n","for contig in contigs:\n"," source = \"wasbs://dataset@dataset1000genomes.blob.core.windows.net/release/20130502/ALL.chr\"+contig+\".*.vcf.gz\"\n","\n","# Load data\n"," data = spark.read\\\n"," .format(\"vcf\")\\\n"," .option(\"includeSampleIds\", True)\\\n"," .option(\"flattenInfoFields\", True)\\\n"," .load(source)\n"," \n"," # Define number of partitions, will be used for coalesce later\n"," rowCount = data.count()\n"," partCount = ceil (repartitionCoef * rowCount) \n"," if partCount > partitionMax:\n"," partCount = partitionMax\n","\n"," dataFinal = transformVcf(data, toFlatten, toHash, fullSet)\n"," if not toFlatten:\n"," sink = \"wasbs://\"+outputContainer + \"@\" + outputStorageAccount + \".blob.core.windows.net\"+ outputDir + \"/original/chr\"+contig\n"," else:\n"," sink = \"wasbs://\"+outputContainer + \"@\" + outputStorageAccount + \".blob.core.windows.net\"+ outputDir + \"/flattened/chr\"+contig\n"," \n"," 
dataFinal.coalesce(partCount). \\\n"," write. \\\n"," mode(\"overwrite\"). \\\n"," format(\"parquet\"). \\\n"," save(sink)\n"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"70c077c0-10be-455b-839d-f7368bc4db29"}},"outputs":[],"execution_count":null}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"vcf2parquet-1000genomes","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{"flatten":{"nuid":"8bf1ecc7-2a55-447a-80b7-aa1250b6c022","currentValue":"False","widgetInfo":{"widgetType":"dropdown","name":"flatten","defaultValue":"False","label":null,"options":{"widgetType":"dropdown","choices":["False","True"]}}},"contigsToProcess":{"nuid":"b716a858-a855-4c5a-8d3c-af0b05773029","currentValue":"22","widgetInfo":{"widgetType":"multiselect","name":"contigsToProcess","defaultValue":"22","label":null,"options":{"widgetType":"dropdown","choices":["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y","MT","All"]}}}},"notebookOrigID":4372562996517358}},"nbformat":4,"nbformat_minor":0} -------------------------------------------------------------------------------- /vcf2parquet-conversion/README.md: -------------------------------------------------------------------------------- 1 | 2 | # .vcf to .parquet conversion with Glow on Microsoft Azure 3 | 4 | Notebooks to convert variant call information from .vcf to .parquet for 1000 Genomes and gnomAD datasets using [Glow](https://projectglow.io/). 5 | 6 | Notebook `vcf2parquet-walkthrough.ipynb` is **sample code** for format conversion, run on chromosome 22 from the [1000 Genomes dataset](https://azure.microsoft.com/en-us/services/open-datasets/catalog/genomics-1000-genomes/). It was tested on Azure Databricks Runtime 9.1, Azure Synapse Spark 3.1 Runtime and Glow v1.1.1. 7 | 8 | 9 | 10 | 11 | ### Requirements 12 | 13 | The glow[]().py and Glow Maven packages should be installed on your Spark cluster: 14 | 15 | * For Azure Databricks - see details in [Getting started with Glow](https://glow.readthedocs.io/en/latest/getting-started.html) and the [Azure Databricks documentation](https://docs.microsoft.com/en-us/azure/databricks/libraries/workspace-libraries). 16 | * For Azure Synapse Analytics - see details in [Manage Python libraries](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-python-packages) and [Manage Scala and Java packages](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-scala-packages). We recommend installing glow[]().py by [providing an environment specification file](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-python-packages#environment-specification-formats). Since Maven coordinates are not yet supported in Azure Synapse, there are two options to install Glow: a. build the Glow jar and upload it to your Synapse workspace, or b.
download the Glow jar and its dependencies from the Maven repository and upload them to your Synapse workspace ([Glow Spark3 v1.1.1](https://mvnrepository.com/artifact/io.projectglow/glow-spark3_2.12/1.1.1), [HTSJDK v2.21.2](https://mvnrepository.com/artifact/com.github.samtools/htsjdk/2.21.2) and [Hadoop BAM v7.9.2](https://mvnrepository.com/artifact/org.seqdoop/hadoop-bam/7.9.2) are required for all notebooks in this repo) 17 | 18 | 19 | ### Data schema 20 | 21 | Notebook `vcf2parquet-walkthrough.ipynb` creates two copies of the dataset with different schemas: 22 | * [Default Glow schema](https://glow.readthedocs.io/en/latest/etl/variant-data.html#vcf) that mirrors a single row of a VCF + hashId. 23 | 24 | Information that applies to an entire variant, such as the contig name, start and end positions, and INFO attributes, is contained in columns. The genotypes, which correspond to the GT FORMAT fields in a VCF, are contained in an array with one entry per sample. Each entry is a struct with fields that are described in the VCF header. The hashId column is constructed as the hash code of all columns except genotypes using the 64-bit variant of the xxHash algorithm; hashId can be used as a unique variant id. 25 | 26 | * Flattened schema 27 | 28 | Information that applies to an entire variant (hashId included) stays the same. The genotypes are replaced by a set of columns, one per struct field; to achieve this, the variant information is copied for each entry of the genotypes array. 29 | 30 | If you want to learn more about the VCF format, check [the spec page](https://samtools.github.io/hts-specs/). 31 | 32 | ### Microsoft Azure Resources 33 | If you are new to Azure, see: 34 | - [Azure Synapse Analytics](https://azure.microsoft.com/en-us/services/synapse-analytics/) 35 | - [Azure Databricks](https://azure.microsoft.com/en-us/services/databricks/) 36 | 37 | ### Contributing 38 | 39 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 40 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 41 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 42 | 43 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 44 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 45 | provided by the bot. You will only need to do this once across all repos using our CLA. 46 | 47 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 48 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 49 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 50 | 51 | ### Support 52 | 53 | For questions about the notebooks, please send an e-mail to genomics@microsoft.com -------------------------------------------------------------------------------- /vcf2parquet-conversion/gnomad/README.md: -------------------------------------------------------------------------------- 1 | 2 | # .vcf to .parquet conversion for gnomAD dataset 3 | 4 | `vcf2parquet-gnomad.ipynb` is a notebook used to convert variant call information from .vcf to .parquet for the [gnomAD dataset](https://docs.microsoft.com/en-us/azure/open-datasets/dataset-gnomad) v2.1.1 using [Glow](https://projectglow.io/).
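At its core the conversion is a short Glow/PySpark job: read a gnomAD VCF with Glow's `vcf` reader, drop the empty `genotypes` column, add a `hashId`, and write Parquet. The sketch below is only a minimal illustration of that flow, not the notebook itself; it assumes a notebook session where `spark` is already available and the Glow package is installed, and the input file name and output path are placeholders you would replace. The full logic (per-file iteration and array-column handling) is in `vcf2parquet-gnomad.ipynb`.

```python
# Minimal sketch, assuming a Spark notebook session with Glow installed.
# <file>, <your container>, <your account> and <your path> are placeholders.
from pyspark.sql.functions import xxhash64
import glow

spark = glow.register(spark)

# One gnomAD sites file from the Azure Open Dataset (see the notebook for the real file listing)
source = 'wasbs://gnomad@azureopendatastorage.blob.core.windows.net/release/2.1.1/vcf/exomes/<file>.vcf.bgz'
sink = 'abfss://<your container>@<your account>.dfs.core.windows.net/<your path>/exomes/<file>'

df = spark.read.format('vcf').load(source)            # Glow's VCF reader
df = df.drop('genotypes')                              # sites-only files carry no genotype information
df = df.withColumn('hashId', xxhash64(*df.columns))    # 64-bit xxHash over the remaining columns as a variant id
df.write.mode('overwrite').format('parquet').save(sink)
```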
5 | 6 | The notebook was tested on Azure Synapse Spark 3.1 Runtime and Glow v1.1.1. 7 | 8 | We also provide SQL scripts to create external tables on the curated gnomAD dataset and to query it with [Azure Synapse Analytics](https://azure.microsoft.com/en-us/services/synapse-analytics/). 9 | 10 | ### Curated gnomAD data access 11 | 12 | The data is available publicly without restrictions. Storage Account: https://datasetgnomadparquet.blob.core.windows.net/dataset 13 | 14 | 15 | ### Data schema 16 | 17 | The notebook creates a dataset with the following schema: 18 | 19 | * [Default Glow schema](https://glow.readthedocs.io/en/latest/etl/variant-data.html#vcf) with genotypes removed and hashId added. 20 | 21 | Information that applies to an entire variant, such as the contig name, start and end positions, and INFO attributes, is contained in columns. The genotypes, which correspond to the GT FORMAT fields in a VCF, are removed since they do not contain any information in the gnomAD dataset. The hashId column is constructed as the hash code of all columns except genotypes using the 64-bit variant of the xxHash algorithm; hashId can be used as a unique variant id. 22 | 23 | A detailed description is available at https://datasetgnomadparquet.blob.core.windows.net/dataset/v2.1.1/dataSchema_exomes.txt and https://datasetgnomadparquet.blob.core.windows.net/dataset/v2.1.1/dataSchema_genomes.txt -------------------------------------------------------------------------------- /vcf2parquet-conversion/gnomad/sql/sampleQueries.sql: -------------------------------------------------------------------------------- 1 | --- create database 2 | CREATE DATABASE [gnomAD] 3 | COLLATE Latin1_General_100_BIN2_UTF8; 4 | GO 5 | 6 | USE [gnomAD] 7 | GO 8 | 9 | --- create external file format 10 | IF NOT EXISTS (SELECT * FROM sys.external_file_formats WHERE name = 'SynapseParquetFormat') 11 | CREATE EXTERNAL FILE FORMAT [SynapseParquetFormat] 12 | WITH ( FORMAT_TYPE = PARQUET) 13 | GO 14 | 15 | --- create external data source 16 | IF NOT EXISTS (SELECT * FROM sys.external_data_sources WHERE name = 'gnomADVariants') 17 | CREATE EXTERNAL DATA SOURCE [gnomADVariants] 18 | WITH ( 19 | LOCATION = 'https://datasetgnomadparquet.blob.core.windows.net/dataset' 20 | ) 21 | GO 22 | 23 | --- create external table for exome data 24 | --- we use a subset of columns for this table 25 | --- full schemas are available at https://datasetgnomadparquet.blob.core.windows.net/dataset/v2.1.1/dataSchema_exomes.txt and https://datasetgnomadparquet.blob.core.windows.net/dataset/v2.1.1/dataSchema_genomes.txt 26 | CREATE EXTERNAL TABLE [exomes] ( 27 | [contigName] varchar(2), 28 | [referenceAllele] varchar(150), 29 | [alternateAlleles] varchar(150), 30 | [start] bigint, 31 | [end] bigint, 32 | [names] varchar(150), 33 | [qual] float, 34 | [INFO_VQSR_POSITIVE_TRAIN_SITE] bit 35 | ) 36 | WITH ( 37 | LOCATION = 'v2.1.1/exomes/*/*.snappy.parquet', 38 | DATA_SOURCE = [gnomADVariants], 39 | FILE_FORMAT = [SynapseParquetFormat] 40 | ) 41 | GO 42 | 43 | --- count number of variants 44 | SELECT COUNT_BIG(*) as totalVariantCount 45 | FROM [dbo].[exomes] 46 | GO 47 | 48 | --- count number of variants by chromosome 49 | SELECT [contigName], COUNT_BIG(*) as variantCount 50 | FROM [dbo].[exomes] GROUP BY contigName ORDER BY variantCount DESC 51 | GO 52 | 53 | --- count number of variants that are VQSR positive training sites, per chromosome 54 | SELECT [contigName], COUNT_BIG(*) as VQSRPositiveSitesCount 55 | FROM [dbo].[exomes] 56 | WHERE INFO_VQSR_POSITIVE_TRAIN_SITE = 1 57 | GROUP
BY contigName ORDER BY VQSRPositiveSitesCount DESC 58 | GO 59 | 60 | --- take a look at VQSR positive training sites for chromosome Y 61 | SELECT * 62 | FROM [dbo].[exomes] 63 | WHERE contigName = 'Y' and INFO_VQSR_POSITIVE_TRAIN_SITE = 1 64 | GO 65 | -------------------------------------------------------------------------------- /vcf2parquet-conversion/gnomad/vcf2parquet-gnomad.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "kernelspec": { 6 | "name": "synapse_pyspark", 7 | "display_name": "Synapse PySpark" 8 | }, 9 | "language_info": { 10 | "name": "python" 11 | }, 12 | "save_output": false, 13 | "synapse_widget": { 14 | "version": "0.1", 15 | "state": {} 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "execution_count": 19, 22 | "outputs": [], 23 | "metadata": { 24 | "application/vnd.databricks.v1+cell": { 25 | "title": "", 26 | "showTitle": false, 27 | "inputWidgets": {}, 28 | "nuid": "42f3599e-f63c-479f-92d7-846e09cd39ac" 29 | } 30 | }, 31 | "source": [ 32 | "from pyspark.sql.functions import col, xxhash64\n", 33 | "from notebookutils import mssparkutils\n", 34 | "import re\n", 35 | "\n", 36 | "# Import glow.py and register Glow package\n", 37 | "import glow\n", 38 | "spark = glow.register(spark)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 20, 44 | "outputs": [], 45 | "metadata": { 46 | "application/vnd.databricks.v1+cell": { 47 | "title": "", 48 | "showTitle": false, 49 | "inputWidgets": {}, 50 | "nuid": "454af720-8d0f-4bed-86a3-5189471ef9b4" 51 | } 52 | }, 53 | "source": [ 54 | "# Provide names for output storage account, container and relative path\n", 55 | "\n", 56 | "outputStorageAccount = 'Your account name' # replace with your account name\n", 57 | "outputContainer = 'Your container name' # replace with your container name\n", 58 | "outputDir = 'Your path' # replace with your relative path\n", 59 | "\n", 60 | "\n", 61 | "# Here we assume that Azure Synapse Analytics is used and outputStorageAccount is a primary storage account in the workspace - no auth needed in this case \n", 62 | "# For other Synapse scenarios check https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/microsoft-spark-utilities?pivots=programming-language-python\n", 63 | "# For Azure Databricks check https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/ and https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/azure-storage\n", 64 | "" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 21, 70 | "outputs": [], 71 | "metadata": { 72 | "application/vnd.databricks.v1+cell": { 73 | "title": "", 74 | "showTitle": false, 75 | "inputWidgets": {}, 76 | "nuid": "8b89be1f-bfa3-4abb-99dd-995a13535761" 77 | } 78 | }, 79 | "source": [ 80 | "# Define source public data\n", 81 | "\n", 82 | "inputStorageAccount = 'azureopendatastorage'\n", 83 | "inputContainer = 'gnomad'\n", 84 | "inputDir = 'release/2.1.1/vcf'" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 22, 90 | "outputs": [], 91 | "metadata": { 92 | "application/vnd.databricks.v1+cell": { 93 | "title": "", 94 | "showTitle": false, 95 | "inputWidgets": {}, 96 | "nuid": "f332ea7c-2a59-4c9d-b2a8-5969e443714e" 97 | } 98 | }, 99 | "source": [ 100 | "# Read, transform and write data\n", 101 | "\n", 102 | "def TransformData(source, sink, colsToDrop, colsToKeepAsArray):\n", 103 | "# Read data\n", 104 | " data = 
spark.read. \\\n", 105 | " format('vcf'). \\\n", 106 | " load(source)\n", 107 | "# Drop columns\n", 108 | " dataReduced = data\n", 109 | " for column in colsToDrop:\n", 110 | " dataReduced = dataReduced.drop(column)\n", 111 | "# Add hashId column\n", 112 | " hashCols = dataReduced.columns\n", 113 | " dataHashed = dataReduced.withColumn('hashId', xxhash64(*hashCols))\n", 114 | "# Replace arrays by first element \n", 115 | " colsToReplaceByFirstElement = []\n", 116 | " for x, t in dataHashed.dtypes:\n", 117 | " if t.startswith('array'):\n", 118 | " colsToReplaceByFirstElement.append(x)\n", 119 | " colsToReplaceByFirstElement = list(set(colsToReplaceByFirstElement) - set(colsToKeepAsArray))\n", 120 | " dataTransformed = dataHashed\n", 121 | " for column in colsToReplaceByFirstElement:\n", 122 | " dataTransformed = dataTransformed.withColumn(column, col(column)[0])\n", 123 | "# Write data \n", 124 | " dataTransformed.write. \\\n", 125 | " mode('overwrite'). \\\n", 126 | " format('parquet'). \\\n", 127 | " save(sink)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 23, 133 | "outputs": [], 134 | "metadata": { 135 | "application/vnd.databricks.v1+cell": { 136 | "title": "", 137 | "showTitle": false, 138 | "inputWidgets": {}, 139 | "nuid": "ebfd6d88-5604-468c-8c8f-cb67ba620e1e" 140 | } 141 | }, 142 | "source": [ 143 | "# Input files end with suffix 'chromosome.vcf.bgz', where chromosome might be a number from 1 to 22 or X or Y\n", 144 | "sourceSuffix = '[XY|0-9][.]vcf[.]bgz$'\n", 145 | "\n", 146 | "# Columns to drop and to keep as array\n", 147 | "colsToDrop = ['genotypes']\n", 148 | "colsToKeepAsArray = ['INFO_vep']\n", 149 | "\n", 150 | "# Datasets to process\n", 151 | "datasets = ['genomes']\n", 152 | "\n", 153 | "for dataset in datasets:\n", 154 | " sourcePath = 'wasbs://%s@%s.blob.core.windows.net/%s/%s' % (inputContainer, inputStorageAccount, inputDir, dataset) \n", 155 | " files = mssparkutils.fs.ls(sourcePath)\n", 156 | " for file in files:\n", 157 | " if re.search(sourceSuffix, file.name):\n", 158 | " source = file.path\n", 159 | " sink = 'abfss://%s@%s.dfs.core.windows.net/%s/%s/%s' % (outputContainer, outputStorageAccount, outputDir, dataset, file.name.rstrip('.vcf.bgz')) \n", 160 | " TransformData(source, sink, colsToDrop, colsToKeepAsArray)" 161 | ] 162 | } 163 | ] 164 | } -------------------------------------------------------------------------------- /vcf2parquet-conversion/vcf2parquet-walkthrough.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "nteract": { 7 | "transient": { 8 | "deleting": false 9 | } 10 | } 11 | }, 12 | "source": [ 13 | "Import packages, register Glow, configure credentials to access 1000 genomes dataset from Azure Genomics Data Lake and read data for chromosome 22." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "application/vnd.databricks.v1+cell": { 21 | "inputWidgets": {}, 22 | "nuid": "32cd967d-57af-4c71-8249-6bb2a44c65f2", 23 | "showTitle": false, 24 | "title": "" 25 | } 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql.functions import explode, col, lit, xxhash64\n", 30 | "\n", 31 | "import glow\n", 32 | "spark = glow.register(spark)\n", 33 | "\n", 34 | "spark.conf.set(\n", 35 | " 'fs.azure.sas.dataset.dataset1000genomes.blob.core.windows.net',\n", 36 | " 'sv=2019-10-10&si=prod&sr=c&sig=9nzcxaQn0NprMPlSh4RhFQHcXedLQIcFgbERiooHEqM%3D')\n", 37 | "\n", 38 | "source = 'wasbs://dataset@dataset1000genomes.blob.core.windows.net/release/20130502/ALL.chr22*.vcf.gz'\n", 39 | "\n", 40 | "df = spark.read.\\\n", 41 | " format('vcf').\\\n", 42 | " option('includeSampleIds', True).\\\n", 43 | " option('flattenInfoFields', True).\\\n", 44 | " load(source)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "nteract": { 51 | "transient": { 52 | "deleting": false 53 | } 54 | } 55 | }, 56 | "source": [ 57 | "We will use a subset of chromosome 22 to keep time and cost low. If you want to convert the whole chromosome 22 dataset, you can still use this notebook, but you might need a bigger cluster." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "jupyter": { 65 | "outputs_hidden": false, 66 | "source_hidden": false 67 | }, 68 | "nteract": { 69 | "transient": { 70 | "deleting": false 71 | } 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "data=df.limit(10000)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "nteract": { 83 | "transient": { 84 | "deleting": false 85 | } 86 | } 87 | }, 88 | "source": [ 89 | "Take a look at regular sites" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": { 96 | "jupyter": { 97 | "outputs_hidden": false, 98 | "source_hidden": false 99 | }, 100 | "nteract": { 101 | "transient": { 102 | "deleting": false 103 | } 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "data.where('INFO_MULTI_ALLELIC = FALSE').show(2)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "nteract": { 115 | "transient": { 116 | "deleting": false 117 | } 118 | } 119 | }, 120 | "source": [ 121 | "and multiallelic sites." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": { 128 | "jupyter": { 129 | "outputs_hidden": false, 130 | "source_hidden": false 131 | }, 132 | "nteract": { 133 | "transient": { 134 | "deleting": false 135 | } 136 | } 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "data.where('INFO_MULTI_ALLELIC = TRUE').show(2)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "nteract": { 147 | "transient": { 148 | "deleting": false 149 | } 150 | } 151 | }, 152 | "source": [ 153 | "Transform the data: add a _hashId_ column (the hash is built on all columns except genotypes and can be used as a unique id for variants), explode on genotypes and flatten the genotypes column."
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 4, 159 | "metadata": { 160 | "jupyter": { 161 | "outputs_hidden": false, 162 | "source_hidden": false 163 | }, 164 | "nteract": { 165 | "transient": { 166 | "deleting": false 167 | } 168 | } 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "hashCols = list(set(data.columns) - {'genotypes'})\n", 173 | "dataHashed = data.withColumn('hashId', xxhash64(*hashCols))\n", 174 | "\n", 175 | "dataExploded = dataHashed.withColumn('genotypes', explode('genotypes'))\n", 176 | "\n", 177 | "def flattenStructFields(df):\n", 178 | " flat_cols = [c[0] for c in df.dtypes if c[1][:6] != 'struct']\n", 179 | " nested_cols = [c[0] for c in df.dtypes if c[1][:6] =='struct']\n", 180 | " flat_df = df.select(flat_cols + \n", 181 | " [col(nc+'.'+c).alias(nc+'_'+c)\n", 182 | " for nc in nested_cols\n", 183 | " for c in df.select(nc+'.*').columns])\n", 184 | " return flat_df\n", 185 | "\n", 186 | "dataExplodedFlatten = flattenStructFields(dataExploded)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "nteract": { 193 | "transient": { 194 | "deleting": false 195 | } 196 | } 197 | }, 198 | "source": [ 199 | "Take a look at transformed dataset: regular sites\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 5, 205 | "metadata": { 206 | "jupyter": { 207 | "outputs_hidden": false, 208 | "source_hidden": false 209 | }, 210 | "nteract": { 211 | "transient": { 212 | "deleting": false 213 | } 214 | } 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "dataExplodedFlatten.where('INFO_MULTI_ALLELIC = FALSE').show(2)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "nteract": { 225 | "transient": { 226 | "deleting": false 227 | } 228 | } 229 | }, 230 | "source": [ 231 | "and multiallelic sites." 
232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 32, 237 | "metadata": { 238 | "jupyter": { 239 | "outputs_hidden": false, 240 | "source_hidden": false 241 | }, 242 | "nteract": { 243 | "transient": { 244 | "deleting": false 245 | } 246 | } 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "dataExplodedFlatten.where('INFO_MULTI_ALLELIC = TRUE').show(2)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": { 256 | "nteract": { 257 | "transient": { 258 | "deleting": false 259 | } 260 | } 261 | }, 262 | "source": [ 263 | "To save the data you need to provide names for output storage account, container and relative path, and set _outputPath_" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 6, 269 | "metadata": { 270 | "jupyter": { 271 | "outputs_hidden": false, 272 | "source_hidden": false 273 | }, 274 | "nteract": { 275 | "transient": { 276 | "deleting": false 277 | } 278 | } 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "outputStorageAccount = 'Your account name' # replace with your account name\n", 283 | "outputContainer = 'Your container name' # replace with your container name\n", 284 | "outputDir = 'Your path' # replace with your relative path" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "nteract": { 291 | "transient": { 292 | "deleting": false 293 | } 294 | } 295 | }, 296 | "source": [ 297 | "**Option 1:** If you use Azure Synapse Analytics and _outputStorageAccount_ is a primary (default) storage account in your Synapse workspace, run cell below\n", 298 | "\n", 299 | "Primary storage account is ADLS Gen2, so we use \n", 300 | "```\n", 301 | "adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)\n", 302 | "```" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 34, 308 | "metadata": { 309 | "jupyter": { 310 | "outputs_hidden": false, 311 | "source_hidden": false 312 | }, 313 | "nteract": { 314 | "transient": { 315 | "deleting": false 316 | } 317 | } 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "outputPath = 'abfss://%s@%s.dfs.core.windows.net/%s' % (outputContainer, outputStorageAccount, outputDir) " 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": { 327 | "nteract": { 328 | "transient": { 329 | "deleting": false 330 | } 331 | } 332 | }, 333 | "source": [ 334 | "**Option 2:** If you want to save data to Azure Blob Storage, provide SAS token for output container and run cell below. It works for Azure Databricks and Azure Synapse Analytics. \n", 335 | "\n", 336 | "For Azure Blob Storage we use \n", 337 | "```\n", 338 | "wasb_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)\n", 339 | "```" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 7, 345 | "metadata": { 346 | "jupyter": { 347 | "outputs_hidden": false, 348 | "source_hidden": false 349 | }, 350 | "nteract": { 351 | "transient": { 352 | "deleting": false 353 | } 354 | } 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "outputSAS = 'Your SAS token' # replace with your SAS token\n", 359 | "spark.conf.set(\n", 360 | " 'fs.azure.sas.' + outputContainer + '.' 
+ outputStorageAccount + '.blob.core.windows.net', outputSAS)\n", 361 | "\n", 362 | "outputPath = 'wasbs://%s@%s.blob.core.windows.net/%s' % (outputContainer, outputStorageAccount, outputDir)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "nteract": { 369 | "transient": { 370 | "deleting": false 371 | } 372 | } 373 | }, 374 | "source": [ 375 | " To learn more about Azure Blob Storage and Azure Data Lake Storage (ADLS) Gen2 and different ways to access them check documentation: \n", 376 | "\n", 377 | " - Azure Databricks - https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/adls-gen2/ and https://docs.microsoft.com/en-us/azure/databricks/data/data-sources/azure/azure-storage\n", 378 | " - Azure Synapse Analytics - https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/microsoft-spark-utilities?pivots=programming-language-python" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "nteract": { 385 | "transient": { 386 | "deleting": false 387 | } 388 | } 389 | }, 390 | "source": [ 391 | "Write data in .parquet format to your storage account: original data" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 8, 397 | "metadata": { 398 | "application/vnd.databricks.v1+cell": { 399 | "inputWidgets": {}, 400 | "nuid": "7cb2db54-fcab-43c1-94f7-de6d64f20742", 401 | "showTitle": false, 402 | "title": "" 403 | } 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "sink = outputPath + '/original/chr22'\n", 408 | "dataHashed.write. \\\n", 409 | " mode('overwrite'). \\\n", 410 | " format('parquet'). \\\n", 411 | " save(sink)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "nteract": { 418 | "transient": { 419 | "deleting": false 420 | } 421 | } 422 | }, 423 | "source": [ 424 | "and transformed data" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 9, 430 | "metadata": { 431 | "application/vnd.databricks.v1+cell": { 432 | "inputWidgets": {}, 433 | "nuid": "091f7803-4fd9-4826-a8de-a9e6e423812a", 434 | "showTitle": false, 435 | "title": "" 436 | } 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "sink = outputPath + '/flattened/chr22' \n", 441 | "\n", 442 | "dataExplodedFlatten.write. \\\n", 443 | " mode('overwrite'). \\\n", 444 | " format('parquet'). \\\n", 445 | " save(sink)" 446 | ] 447 | } 448 | ], 449 | "metadata": { 450 | "description": null, 451 | "kernelspec": { 452 | "display_name": "Synapse PySpark", 453 | "name": "synapse_pyspark" 454 | }, 455 | "language_info": { 456 | "name": "python" 457 | }, 458 | "save_output": false, 459 | "synapse_widget": { 460 | "state": {}, 461 | "version": "0.1" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 2 466 | } 467 | --------------------------------------------------------------------------------