├── README.md
├── images
    └── enable-gpu-on-anvil.jpg
├── Bioconductor-GPU-ML
    ├── launch-gcloud.sh
    └── on-gcloud-gpu-machine.md
├── Bioconductor-on-AnVIL-GPU
    └── README.md
└── scripts
    ├── deeppincs.R
    └── vaexprs.R


/README.md:
--------------------------------------------------------------------------------
1 | # Using Bioconductor on GPU based compute
2 | 


--------------------------------------------------------------------------------
/images/enable-gpu-on-anvil.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/BiocGPU/devel/images/enable-gpu-on-anvil.jpg


--------------------------------------------------------------------------------
/Bioconductor-GPU-ML/launch-gcloud.sh:
--------------------------------------------------------------------------------
1 | gcloud compute instances create test-gpu-scratch \
2 |     --machine-type n1-standard-2 \
3 |     --zone us-central1-a \
4 |     --boot-disk-size 50GB \
5 |     --accelerator type=nvidia-tesla-k80,count=1 \
6 |     --image-project deeplearning-platform-release \
7 |     --image-family common-cu113-ubuntu-2004 \
8 |     --maintenance-policy TERMINATE --restart-on-failure
9 | 


--------------------------------------------------------------------------------
/Bioconductor-on-AnVIL-GPU/README.md:
--------------------------------------------------------------------------------
 1 | # Bioconductor on AnVIL GPU
 2 | 
 3 | ## Authors
 4 | 
 5 | Nitesh Turaga - nitesh@ds.dfci.harvard.edu
 6 | 
 7 | ## Introduction
 8 | 
 9 | Bioconductor has been receiving R packages that use an interface to run Python libraries. Some of these packages use commonly used machine learning and deep learning libraries such as `tensorflow` and `keras`.
10 | 
11 | - 'TensorFlow' is an end-to-end open source platform for machine learning.
12 | 
13 | - 'Keras' is an open-source software library that provides a Python interface for artificial neural networks
14 | 
15 | Some of the Bioconductor packages that use "reticulate" to interface with Python are listed below:
16 | 
17 | - VAExprs
18 | 
19 | - DeepPINCS
20 | 
21 | These machine learning packages run faster when using GPU-based cloud environments. This workspace demonstrates how to use the AnVIL environment to run a Bioconductor package on a GPU-enabled cloud environment.
22 | 
23 | 
24 | ## Steps 
25 | 
26 | 1. Start a GPU enabled cloud environment from the Cloud environment launcher. Select **Customize** in the menu. 
27 |  The cloud environment should be an **R / Bioconductor** based Jupyter environment. It will have an option to **Enable GPUs**.
28 |  
29 | 2. Choose a GPU type, and the number of GPUs. The default setting will work for this workspace. 
30 | 
31 | ![env](https://raw.githubusercontent.com/nturaga/BiocGPU/master/images/enable-gpu-on-anvil.jpg)
32 | 
33 | 3. Go to the Notebooks tab, and run the **VAExprs_Bioconductor_on_terra_GPU** notebook with the GPU-enabled cloud environment just launched. 
34 | 
35 | 
36 | ## More information on GPU
37 | 
38 | [GPU Support page](https://support.terra.bio/hc/en-us/articles/4403006001947)
39 | 


--------------------------------------------------------------------------------
/Bioconductor-GPU-ML/on-gcloud-gpu-machine.md:
--------------------------------------------------------------------------------
 1 | # Shell commands within the Gcloud GPU VM
 2 |  
 3 | - Login
 4 | 
 5 | ```
 6 | gcloud compute ssh test-gpu-scratch --zone=us-central1-a --project=ldmbio
 7 | ```
 8 | 
 9 | - The VM requires the NVIDIA driver to be installed; answer `y` when prompted:
10 | 
11 | ```
12 | This VM requires Nvidia drivers to function correctly.   Installation takes ~1 minute.
13 | Would you like to install the Nvidia driver? [y/n] y
14 | ```
15 | 
16 | - Install docker (required) and NVidia Docker (comes with it)
17 | 
18 | ```
19 | wget -O - -q 'https://gist.githubusercontent.com/allenday/c875eaf21a2b416f6478c0a48e428f6a/raw/f7feca1acc1a992afa84f347394fd7e4bfac2599/install-docker-ce.sh' | sudo bash
20 | ```
21 | 
22 | - Verify GPU is visible from Docker container
23 | 
24 | ```
25 | nvidia-docker-plugin &
26 | ```
27 | 
28 | You should see output like this:
29 | 
30 | ```
31 | nvidia-docker-plugin | 2022/04/01 13:29:27 Loading NVIDIA unified memory
32 | nvidia-docker-plugin | 2022/04/01 13:29:28 Loading NVIDIA management library
33 | nvidia-docker-plugin | 2022/04/01 13:29:29 Discovering GPU devices
34 | nvidia-docker-plugin | 2022/04/01 13:29:30 Provisioning volumes at /var/lib/nvidia-docker/volumes
35 | nvidia-docker-plugin | 2022/04/01 13:29:30 Serving plugin API at /run/docker/plugins
36 | nvidia-docker-plugin | 2022/04/01 13:29:30 Serving remote API at localhost:3476
37 | ```
38 | 
39 | - Pull Bioconductor ML image
40 | 
41 | ```
42 | sudo  docker pull nitesh1989/bioconductor_ml:devel
43 | ```
44 | 
45 | - Run image
46 | 
47 | ```
48 |  sudo docker run --gpus all --rm -ti nitesh1989/bioconductor_ml:devel bash
49 |  ```
50 |  
51 | ## Within the Docker image
52 | 
53 |  
54 | - Start with creating a new python environment
55 | 
56 | ```
57 | 
58 | ## Check pyenv installation
59 | pyenv --version
60 | 
61 | ## List python versions 
62 | pyenv install --list
63 | 
64 | ## Create a virtual env
65 | pyenv virtualenv vaexprs
66 | 
67 | ## activate the virtual env
68 | pyenv local vaexprs
69 | 
70 | ## Install deep learning libraries
71 | pip3 install keras tensorflow
72 | 
73 | ```
74 | 
75 | - Within R install VAExprs
76 | 
77 | ```
78 | BiocManager::install(c('SC3', 'VAExprs'))
79 | ```
80 | 
81 | - Set virtual env
82 | 
83 | ```
84 | reticulate::use_virtualenv('.pyenv/versions/vaexprs/vaexprs/')
85 | 
86 | keras::is_keras_available() & reticulate::py_available()
87 | ```
88 | 


--------------------------------------------------------------------------------
/scripts/deeppincs.R:
--------------------------------------------------------------------------------
 1 | ## Fit a compound-protein interaction model with fit_cpi() and plot it.
 2 | ##
 3 | ## Each step runs only when both Keras and Python are available.
 4 | library(DeepPINCS)
 5 | 
 6 | # ?DeepPINCS::fit_cpi
 7 | 
 8 | # Evaluate the availability check once. `&&` is the scalar, short-circuiting
 9 | # operator meant for `if` conditions.
10 | ml_available <- keras::is_keras_available() && reticulate::py_available()
11 | 
12 | ## ---- eval=TRUE---------------------------------------------------------------
13 | if (ml_available) {
14 |     # `example_cpi` is not defined in this script; presumably it is a dataset
15 |     # shipped with DeepPINCS -- confirm. Keep at most its first 500 rows.
16 |     example_cpi <- example_cpi[seq_len(min(500, nrow(example_cpi))), ]
17 |     validation_split <- 0.3
18 |     n_obs <- nrow(example_cpi)
19 |     # Fixed seed so the random train/validation split is reproducible.
20 |     set.seed(1)
21 |     idx <- sample(seq_len(n_obs))
22 |     # Logical mask: TRUE for training rows, FALSE for validation rows.
23 |     train_idx <- seq_len(n_obs) %in%
24 |         idx[seq_len(round(n_obs * (1 - validation_split)))]
25 | }
26 | 
27 | ## ---- eval=TRUE---------------------------------------------------------------
28 | if (ml_available) {
29 |     # Network settings handed to fit_cpi() through `net_args`.
30 |     net_args <- list(
31 |         compound = "gcn_in_out",
32 |         compound_args = list(
33 |             gcn_units = c(128, 64),
34 |             gcn_activation = c("relu", "relu"),
35 |             fc_units = c(10),
36 |             fc_activation = c("relu")),
37 |         protein = "cnn_in_out",
38 |         protein_args = list(
39 |             cnn_filters = c(32),
40 |             cnn_kernel_size = c(3),
41 |             cnn_activation = c("relu"),
42 |             fc_units = c(10),
43 |             fc_activation = c("relu")),
44 |         fc_units = c(1),
45 |         fc_activation = c("sigmoid"),
46 |         loss = "binary_crossentropy",
47 |         optimizer = keras::optimizer_adam(),
48 |         metrics = "accuracy")
49 | }
50 | 
51 | ## ---- eval=TRUE---------------------------------------------------------------
52 | if (ml_available) {
53 |     compound_max_atoms <- 50
54 |     protein_embedding_dim <- 16
55 |     protein_length_seq <- 100
56 |     # Columns 1, 2 and 3 of example_cpi are passed as SMILES, amino acid
57 |     # sequence and outcome; rows outside train_idx form the validation set.
58 |     gcn_cnn_cpi <- fit_cpi(
59 |         smiles = example_cpi[train_idx, 1],
60 |         AAseq = example_cpi[train_idx, 2],
61 |         outcome = example_cpi[train_idx, 3],
62 |         compound_type = "graph",
63 |         compound_max_atoms = compound_max_atoms,
64 |         protein_length_seq = protein_length_seq,
65 |         protein_embedding_dim = protein_embedding_dim,
66 |         protein_ngram_max = 2,
67 |         protein_ngram_min = 1,
68 |         smiles_val = example_cpi[!train_idx, 1],
69 |         AAseq_val = example_cpi[!train_idx, 2],
70 |         outcome_val = example_cpi[!train_idx, 3],
71 |         net_args = net_args,
72 |         epochs = 20,
73 |         batch_size = 64,
74 |         # Early stopping monitors val_accuracy with a patience of 10 epochs.
75 |         callbacks = keras::callback_early_stopping(
76 |             monitor = "val_accuracy",
77 |             patience = 10,
78 |             restore_best_weights = TRUE))
79 |     ttgsea::plot_model(gcn_cnn_cpi$model)
80 | }
81 | 


--------------------------------------------------------------------------------
/scripts/vaexprs.R:
--------------------------------------------------------------------------------
  1 | ## Fit variational autoencoders with fit_vae(), first on simulated data and
  2 | ## then on a SingleCellExperiment, and plot the generated samples.
  3 | ##
  4 | ## Each modelling step runs only when both Keras and Python are available.
  5 | 
  6 | # Install only the packages that are missing: the script also loads SC3
  7 | # below, and an unconditional install would reinstall on every run.
  8 | required_pkgs <- c("VAExprs", "SC3")
  9 | is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
 10 |                        quietly = TRUE)
 11 | if (!all(is_installed)) {
 12 |     BiocManager::install(required_pkgs[!is_installed])
 13 | }
 14 | 
 15 | library(VAExprs)
 16 | 
 17 | # Evaluate the availability check once. `&&` is the scalar, short-circuiting
 18 | # operator meant for `if` conditions.
 19 | ml_available <- keras::is_keras_available() && reticulate::py_available()
 20 | 
 21 | ## ---- eval=TRUE---------------------------------------------------------------
 22 | if (ml_available) {
 23 |     ### simulate differentially expressed genes
 24 |     set.seed(1)
 25 |     g <- 3      # number of groups
 26 |     n <- 100    # cells per group
 27 |     m <- 1000   # genes (rows of mat)
 28 |     mu <- 5
 29 |     sigma <- 5
 30 |     mat <- matrix(rnorm(n*m*g, mu, sigma), m, n*g)
 31 |     rownames(mat) <- paste0("gene", seq_len(m))
 32 |     colnames(mat) <- paste0("cell", seq_len(n*g))
 33 |     # n consecutive cells per group, in the column order of mat
 34 |     group <- factor(rep(paste0("group", seq_len(g)), each = n))
 35 |     names(group) <- colnames(mat)
 36 |     mu_upreg <- 6
 37 |     sigma_upreg <- 10
 38 |     deg <- 100
 39 |     # NOTE(review): the right-hand side always reads rows 1:deg, while the
 40 |     # left-hand side writes rows (deg*(i-1) + 1):(deg*i); for i > 1 the
 41 |     # source and target gene blocks differ. Left unchanged -- confirm intent.
 42 |     for (i in seq_len(g)) {
 43 |         mat[(deg*(i-1) + 1):(deg*i), group == paste0("group", i)] <-
 44 |             mat[1:deg, group==paste0("group", i)] + rnorm(deg, mu_upreg, sigma_upreg)
 45 |     }
 46 |     # positive expression only
 47 |     mat[mat < 0] <- 0
 48 |     x_train <- as.matrix(t(mat))
 49 | 
 50 |     # heatmap
 51 |     heatmap(mat, Rowv = NA, Colv = NA,
 52 |             col = colorRampPalette(c('green', 'red'))(100),
 53 |             scale = "none")
 54 | }
 55 | 
 56 | ## ---- eval=TRUE---------------------------------------------------------------
 57 | if (ml_available) {
 58 |     # model parameters
 59 |     batch_size <- 32
 60 |     # input width taken from the data (one unit per column of x_train)
 61 |     original_dim <- ncol(x_train)
 62 |     intermediate_dim <- 512
 63 |     epochs <- 100
 64 | 
 65 |     # VAE; note that x_train is also passed as the validation set
 66 |     vae_result <- fit_vae(x_train = x_train, x_val = x_train,
 67 |                         encoder_layers = list(layer_input(shape = c(original_dim)),
 68 |                                             layer_dense(units = intermediate_dim,
 69 |                                                         activation = "relu")),
 70 |                         decoder_layers = list(layer_dense(units = intermediate_dim,
 71 |                                                         activation = "relu"),
 72 |                                             layer_dense(units = original_dim,
 73 |                                                         activation = "sigmoid")),
 74 |                         epochs = epochs, batch_size = batch_size,
 75 |                         use_generator = FALSE,
 76 |                         callbacks = keras::callback_early_stopping(
 77 |                             monitor = "val_loss",
 78 |                             patience = 10,
 79 |                             restore_best_weights = TRUE))
 80 | }
 81 | 
 82 | ## ---- eval=TRUE---------------------------------------------------------------
 83 | if (ml_available) {
 84 |     # model architecture
 85 |     plot_vae(vae_result$model)
 86 | }
 87 | 
 88 | ## ---- eval=TRUE---------------------------------------------------------------
 89 | if (ml_available) {
 90 |     # sample generation
 91 |     set.seed(1)
 92 |     gen_sample_result <- gen_exprs(vae_result, num_samples = 100)
 93 | 
 94 |     # heatmap
 95 |     heatmap(cbind(t(x_train), t(gen_sample_result$x_gen)),
 96 |             col = colorRampPalette(c('green', 'red'))(100),
 97 |             Rowv = NA)
 98 | }
 99 | 
100 | ## ---- eval=TRUE---------------------------------------------------------------
101 | if (ml_available) {
102 |     # plot for augmented data
103 |     plot_aug(gen_sample_result, "PCA")
104 | }
105 | 
106 | ## ---- eval=TRUE---------------------------------------------------------------
107 | if (ml_available) {
108 |     library(SC3)
109 |     library(SingleCellExperiment)
110 | 
111 |     # create a SingleCellExperiment object
112 |     # (`yan` and `ann` are not defined in this script; presumably datasets
113 |     # provided by SC3 -- confirm)
114 |     sce <- SingleCellExperiment::SingleCellExperiment(
115 |         assays = list(counts = as.matrix(yan)),
116 |         colData = ann
117 |     )
118 | 
119 |     # define feature names in feature_symbol column
120 |     rowData(sce)$feature_symbol <- rownames(sce)
121 |     # remove features with duplicated names
122 |     sce <- sce[!duplicated(rowData(sce)$feature_symbol), ]
123 |     # remove genes that are not expressed in any samples
124 |     sce <- sce[which(rowMeans(assay(sce)) > 0),]
125 |     # print explicitly: a non-final expression inside a block is not auto-printed
126 |     print(dim(assay(sce)))
127 | 
128 |     # model parameters
129 |     batch_size <- 32
130 |     # input width taken from the filtered data instead of hard-coding 19595;
131 |     # assumes the features are the rows of sce -- confirm
132 |     original_dim <- nrow(sce)
133 |     intermediate_dim <- 256
134 |     epochs <- 100
135 | 
136 |     # model
137 |     cvae_result <- fit_vae(object = sce,
138 |                         encoder_layers = list(layer_input(shape = c(original_dim)),
139 |                                             layer_dense(units = intermediate_dim,
140 |                                                         activation = "relu")),
141 |                         decoder_layers = list(layer_dense(units = intermediate_dim,
142 |                                                         activation = "relu"),
143 |                                             layer_dense(units = original_dim,
144 |                                                         activation = "sigmoid")),
145 |                         epochs = epochs, batch_size = batch_size,
146 |                         use_generator = TRUE,
147 |                         callbacks = keras::callback_early_stopping(
148 |                             monitor = "loss",
149 |                             patience = 20,
150 |                             restore_best_weights = TRUE))
151 | 
152 |     # model architecture
153 |     plot_vae(cvae_result$model)
154 | }
155 | 
156 | ## ---- eval=TRUE---------------------------------------------------------------
157 | if (ml_available) {
158 |     # sample generation
159 |     set.seed(1)
160 |     gen_sample_result <- gen_exprs(cvae_result, 100,
161 |                                 batch_size, use_generator = TRUE)
162 | 
163 |     # plot for augmented data
164 |     plot_aug(gen_sample_result, "PCA")
165 | }
166 | 
167 | ## ---- eval=TRUE---------------------------------------------------------------
168 | sessionInfo()
169 | 


--------------------------------------------------------------------------------