├── Differential expression ├── BC-04-Differential Expression.ipynb ├── MOB-04-DE for regions.ipynb └── MOB-05-DE for cell types.ipynb ├── Nuclear RNA ├── MOB-NuclearRNA-01-Load data in R.ipynb └── MOB-NuclearRNA-02-Compare nucleus and cell.ipynb ├── README.md ├── alignment ├── .gitignore ├── README.md ├── setup.cfg ├── setup.py └── staligner │ ├── __init__.py │ ├── __main__.py │ ├── __version__.py │ └── staligner.py ├── cell_typing ├── cell_type_assignment.ipynb └── quality_check.ipynb ├── enrichment_analysis └── enrichment_analysis.ipynb ├── files.png ├── hdst.png ├── pre_processing ├── BC-01-GenerateAnnData.ipynb ├── BC-02-Binning.ipynb ├── BC-03-Smoothing.ipynb ├── MOB-00-ABA Gene retrieval via API.ipynb ├── MOB-01-GenerateAnnData.ipynb ├── MOB-02-Binning.ipynb ├── MOB-03-Smoothing.ipynb ├── pre-processing_external.ipynb └── pre-processing_hdst.ipynb └── segmentation └── HD_ST_Master.m /Nuclear RNA/MOB-NuclearRNA-01-Load data in R.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Allen and Macosko datasets" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Warning message:\n", 20 | "“package ‘data.table’ was built under R version 3.5.2”" 21 | ] 22 | } 23 | ], 24 | "source": [ 25 | "library(Matrix)\n", 26 | "library(data.table)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Allen data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "cells <- readRDS('data/allen_50k.RDS')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "

27998
50000

\n" 57 | ], 58 | "text/latex": [ 59 | "\\begin{enumerate*}\n", 60 | "\\item 27998\n", 61 | "\\item 50000\n", 62 | "\\end{enumerate*}\n" 63 | ], 64 | "text/markdown": [ 65 | "1. 27998\n", 66 | "2. 50000\n", 67 | "\n", 68 | "\n" 69 | ], 70 | "text/plain": [ 71 | "[1] 27998 50000" 72 | ] 73 | }, 74 | "metadata": {}, 75 | "output_type": "display_data" 76 | } 77 | ], 78 | "source": [ 79 | "dim(cells)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "cells.mat <- Matrix(cells, sparse=T)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "genes <- rownames(cells)\n", 98 | "barcodes <- colnames(cells)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/html": [ 109 | "\n", 110 | "\n", 111 | "\n", 112 | "\t\n", 113 | "\t\n", 114 | "\n", 115 | "

	used	(Mb)	gc trigger	(Mb)	limit (Mb)	max used	(Mb)
Ncells	1647717	88.0	2661402	142.2	NA	2661402	142.2
Vcells	130454269	995.3	1348288924	10286.7	16384	1530682202	11678.2

\n" 116 | ], 117 | "text/latex": [ 118 | "\\begin{tabular}{r|lllllll}\n", 119 | " & used & (Mb) & gc trigger & (Mb) & limit (Mb) & max used & (Mb)\\\\\n", 120 | "\\hline\n", 121 | "\tNcells & 1647717 & 88.0 & 2661402 & 142.2 & NA & 2661402 & 142.2 \\\\\n", 122 | "\tVcells & 130454269 & 995.3 & 1348288924 & 10286.7 & 16384 & 1530682202 & 11678.2 \\\\\n", 123 | "\\end{tabular}\n" 124 | ], 125 | "text/markdown": [ 126 | "\n", 127 | "| | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |\n", 128 | "|---|---|---|---|---|---|---|---|\n", 129 | "| Ncells | 1647717 | 88.0 | 2661402 | 142.2 | NA | 2661402 | 142.2 |\n", 130 | "| Vcells | 130454269 | 995.3 | 1348288924 | 10286.7 | 16384 | 1530682202 | 11678.2 |\n", 131 | "\n" 132 | ], 133 | "text/plain": [ 134 | " used (Mb) gc trigger (Mb) limit (Mb) max used (Mb) \n", 135 | "Ncells 1647717 88.0 2661402 142.2 NA 2661402 142.2\n", 136 | "Vcells 130454269 995.3 1348288924 10286.7 16384 1530682202 11678.2" 137 | ] 138 | }, 139 | "metadata": {}, 140 | "output_type": "display_data" 141 | } 142 | ], 143 | "source": [ 144 | "rm(cells)\n", 145 | "gc()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "NULL" 157 | ] 158 | }, 159 | "metadata": {}, 160 | "output_type": "display_data" 161 | } 162 | ], 163 | "source": [ 164 | "writeMM(cells.mat, 'data/allen.mtx')" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "write.csv(genes, 'data/allen-genes.csv', row.names=F, quote=F)\n", 174 | "write.csv(barcodes, 'data/allen-barcodes.csv', row.names=F, quote=F)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### Macosko data" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [], 189 | 
"source": [ 190 | "cells <- readRDS('data/macosko_50k.RDS')" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/html": [ 201 | "

27877
50000

\n" 205 | ], 206 | "text/latex": [ 207 | "\\begin{enumerate*}\n", 208 | "\\item 27877\n", 209 | "\\item 50000\n", 210 | "\\end{enumerate*}\n" 211 | ], 212 | "text/markdown": [ 213 | "1. 27877\n", 214 | "2. 50000\n", 215 | "\n", 216 | "\n" 217 | ], 218 | "text/plain": [ 219 | "[1] 27877 50000" 220 | ] 221 | }, 222 | "metadata": {}, 223 | "output_type": "display_data" 224 | } 225 | ], 226 | "source": [ 227 | "dim(cells)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 11, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "cells.dt <- as.data.table(cells)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "genes <- rownames(cells)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 13, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/html": [ 256 | "\n", 257 | "\n", 258 | "\n", 259 | "\t\n", 260 | "\t\n", 261 | "\n", 262 | "

	used	(Mb)	gc trigger	(Mb)	limit (Mb)	max used	(Mb)
Ncells	1755340	93.8	2661402	142.2	NA	2661402	142.2
Vcells	827788748	6315.6	1619974282	12359.5	16384	1530682202	11678.2

\n" 263 | ], 264 | "text/latex": [ 265 | "\\begin{tabular}{r|lllllll}\n", 266 | " & used & (Mb) & gc trigger & (Mb) & limit (Mb) & max used & (Mb)\\\\\n", 267 | "\\hline\n", 268 | "\tNcells & 1755340 & 93.8 & 2661402 & 142.2 & NA & 2661402 & 142.2 \\\\\n", 269 | "\tVcells & 827788748 & 6315.6 & 1619974282 & 12359.5 & 16384 & 1530682202 & 11678.2 \\\\\n", 270 | "\\end{tabular}\n" 271 | ], 272 | "text/markdown": [ 273 | "\n", 274 | "| | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |\n", 275 | "|---|---|---|---|---|---|---|---|\n", 276 | "| Ncells | 1755340 | 93.8 | 2661402 | 142.2 | NA | 2661402 | 142.2 |\n", 277 | "| Vcells | 827788748 | 6315.6 | 1619974282 | 12359.5 | 16384 | 1530682202 | 11678.2 |\n", 278 | "\n" 279 | ], 280 | "text/plain": [ 281 | " used (Mb) gc trigger (Mb) limit (Mb) max used (Mb) \n", 282 | "Ncells 1755340 93.8 2661402 142.2 NA 2661402 142.2\n", 283 | "Vcells 827788748 6315.6 1619974282 12359.5 16384 1530682202 11678.2" 284 | ] 285 | }, 286 | "metadata": {}, 287 | "output_type": "display_data" 288 | } 289 | ], 290 | "source": [ 291 | "rm(cells)\n", 292 | "gc()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 14, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "fwrite(cells.dt, 'data/macosko.csv')" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 15, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "write.csv(genes, 'data/macosko-genes.csv', row.names=F, quote=F)" 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "R", 317 | "language": "R", 318 | "name": "ir" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": "r", 322 | "file_extension": ".r", 323 | "mimetype": "text/x-r-source", 324 | "name": "R", 325 | "pygments_lexer": "r", 326 | "version": "3.5.1" 327 | } 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 2 331 | } 332 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HDST 2 | 3 | This is a public repository for all code connected to HDST (High-definition spatial transcriptomics). 4 | 5 | Please cite: Vickovic S et al. High-definition spatial transcriptomics for in situ tissue profiling. Nat Methods 2019: doi: https://doi.org/10.1038/s41592-019-0548-y 6 | 7 | # Tech workflow 8 | ![github-small](https://github.com/broadinstitute/hdst/blob/master/hdst.png) 9 | 10 | # File Structure Overview 11 | All processed files are available at: https://portals.broadinstitute.org/single_cell/study/SCP420 12 | 13 | ![github-small](https://github.com/broadinstitute/hdst/blob/master/files.png) 14 | 15 | We recommend using the `Bulk Download` function and consulting the `Metadata` file. 16 | 17 | #### `*red_ut*` files: Sorted counts tsv files with: 18 | 19 | `bc` barcode (XxY) coordinate 20 | `spot_px_x` representing (x) pixel coordinate in the HE image and `X` in `bc` 21 | `spot_px_y` representing (y) pixel coordinate in the HE image and `Y` in `bc` 22 | `gene` representing the gene name 23 | `count` representing UMI filtered expressed counts per corresponding gene 24 | 25 | (Note: spatial resolution is marked as `HDST`, `5x` or `segments` in all file names) 26 | 27 | #### `*barcodes_under_tissue_annot*` files: files connecting (x,y) coordinates to annotation regions in `HDST` with: 28 | 29 | `bc` barcode (XxY) coordinate 30 | `spot_px_x` representing (x) pixel coordinate in the HE image and `X` in `bc` 31 | `spot_px_y` representing (y) pixel coordinate in the HE image and `Y` in `bc` 32 | `annotation_region` as region names to each (x,y) coordinate 33 | 34 | #### `*HE.png` files are HE images used in the study 35 | 36 | #### `*HE_Probabilities_mask.tiff` files are coordinates of segmented nuclei based on corresponding HE images 37 | 38 | #### Files needed to run the ST 
pipeline: 39 | ##### `*.fastq` raw seq data with encoded barcode information 40 | ##### `*barcode_ids.tsv` ids files needed for demultiplexing 41 | 42 | 43 | # Alignment 44 | This is [code](./alignment) for aligning HE images to (x,y) barcode coordinates as given by ST Pipeline ([v.1.5.1](https://github.com/SpatialTranscriptomicsResearch/st_pipeline/releases/tag/1.5.1)). 45 | 46 | # Segmentation 47 | This is [code](./segmentation) for segmenting HE nuclei. HE image segmentation was performed by combining Ilastik and CellProfiler. The labeled segmentation mask was used to assign the individual spots to the corresponding Cell ID. The output CSV file includes Cell IDs, X and Y position of the cells (centroid) and the corresponding spots. 48 | 49 | # Cell typing 50 | This is [code](./cell_typing) for imputing cell types onto (x,y) spatial positions based on scRNA-seq data. 51 | 52 | # Differential expression (DE) analysis 53 | This is [code](./Differential%20expression) for DE analysis between annotated regions. 54 | -------------------------------------------------------------------------------- /alignment/.gitignore: -------------------------------------------------------------------------------- 1 | **__pycache__ 2 | -------------------------------------------------------------------------------- /alignment/README.md: -------------------------------------------------------------------------------- 1 | # ST Aligner 2 | 3 | This package can be used to find approximate coordinates of spots on an HDST array. 4 | 5 | ## Installation 6 | 7 | To install the package and its dependencies with pip, run 8 | 9 | ``` 10 | pip install 11 | ``` 12 | 13 | ## Usage 14 | 15 | ST Aligner is run on the bright-field microscopy image from an HDST experiment. 16 | Before proceeding, make sure that the microscopy image has the right orientation; spots will be indexed from the top left in the output file. 
17 | 18 | Invoke the alignment script by running 19 | 20 | ``` 21 | staligner --input --output --annotate 22 | ``` 23 | 24 | The `--annotate` flag is optional but recommended. 25 | When specified, ST Aligner will emit an annotated bright-field image, showing the inferred locations of the spots. 26 | The annotated image can be used to verify that the results are correct. 27 | -------------------------------------------------------------------------------- /alignment/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = staligner 3 | version = attr: staligner.__version__ 4 | 5 | [options] 6 | zip_safe = True 7 | packages = find: 8 | install_requires = 9 | imageio ~= 2.4.1 10 | numpy ~= 1.16.2 11 | pandas ~= 0.24.1 12 | scipy ~= 1.2.1 13 | python_requires = ~= 3.7 14 | 15 | [options.entry_points] 16 | console_scripts = 17 | staligner = staligner.__main__:main 18 | -------------------------------------------------------------------------------- /alignment/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import setuptools as st 3 | 4 | st.setup() 5 | -------------------------------------------------------------------------------- /alignment/staligner/__init__.py: -------------------------------------------------------------------------------- 1 | from .staligner import align 2 | from .__version__ import __version__ 3 | -------------------------------------------------------------------------------- /alignment/staligner/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse as ap 2 | 3 | import logging 4 | 5 | from . import align 6 | 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | LOG = logging.getLogger(__package__) 10 | 11 | 12 | def main(): 13 | """Script entry point. 
def main():
    """Script entry point.

    Parses the command line, configures logging and runs ``align``.

    The decision window handed to ``align`` is square; its side is the
    smallest positive ``--size`` entry multiplied by ``--win-size``.
    Invalid ``--size``/``--win-size`` combinations are reported through
    ``argparse`` (usage message, exit status 2) instead of surfacing as an
    unrelated traceback from ``min()`` on an empty list.
    """
    parser = ap.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Input image.')
    parser.add_argument('-o', '--output-directory', type=str, default='.',
                        help='Output directory.')
    parser.add_argument('--size', nargs=2, default=[-1, 7330], type=int,
                        help='Rescale the image to at most the given size before '
                        'running.')
    parser.add_argument('--win-size', default=1 / 25, type=float,
                        help='Size of the decision window. Use smaller values if '
                        'the image is very rotated or if the tissue extends '
                        'close to the borders of the array.')
    parser.add_argument('--annotate', action='store_true',
                        help='Emit bright-field image with spot annotations.')
    parser.add_argument('--debug', action='store_true',
                        help='Print debug messages.')
    opt = parser.parse_args()

    if opt.debug:
        LOG.setLevel(logging.DEBUG)

    # Only strictly positive entries constrain the rescaling; the window size
    # is derived from the smallest of them, so at least one must exist.
    positive_sizes = [s for s in opt.size if s > 0]
    if not positive_sizes:
        parser.error('--size needs at least one positive dimension')
    if opt.win_size <= 0:
        parser.error('--win-size must be positive')

    LOG.info('Running frame detection with options: %s.',
             ', '.join([f'{k}={v}' for k, v in vars(opt).items()]))
    align(
        im_file=opt.input,
        im_size=opt.size,
        win_size=[round(min(positive_sizes) * opt.win_size)] * 2,
        annotate=opt.annotate,
        output_directory=opt.output_directory,
    )


if __name__ == "__main__":
    main()
LOG = logging.getLogger(__package__)

# 3x3 Sobel kernels: SOBELX is the outer product of the difference stencil
# [-1, 0, 1] (along axis 0) and the smoothing stencil [1, 2, 1] (along
# axis 1); SOBELY is its transpose. Plain ndarrays are used instead of
# ``np.matrix``; the values are identical.
SOBELX = np.outer([-1, 0, 1], [1, 2, 1])
SOBELY = SOBELX.T


def sobel(im, amplify):
    """Applies a sobel filter to the input image.

    Args:
        im: 2-d (rows, cols) or 3-d (rows, cols, channels) array. For 3-d
            input the kernels get a trailing singleton axis, so each channel
            is filtered independently.
        amplify: Controls the exponent ``1 / (1 + exp(amplify))`` applied to
            the squared gradient magnitude ``gx**2 + gy**2``; larger values
            give a smaller exponent.

    Returns:
        Array with the same shape as ``im``. The convolution is computed in
        'valid' mode and the one-pixel border is restored by edge padding.

    Raises:
        ValueError: If ``im`` is neither 2-d nor 3-d.
    """
    if im.ndim == 2:
        fx, fy = SOBELX, SOBELY
        pad_width = (1,)
    elif im.ndim == 3:
        fx, fy = [np.asarray(f)[:, :, np.newaxis] for f in (SOBELX, SOBELY)]
        pad_width = ((1, 1), (1, 1), (0, 0))
    else:
        raise ValueError('Invalid image dimensions')

    gx = fftconvolve(im, fx, mode='valid')
    gy = fftconvolve(im, fy, mode='valid')

    return np.pad(
        (gx * gx + gy * gy) ** (1 / (1 + np.exp(amplify))),
        pad_width,
        mode='edge',
    )


def zoomto(shape, im):
    """Zoom image to a given shape.

    Args:
        shape: Target sizes for the leading axes of ``im``; ``-1`` marks an
            unconstrained axis.
        im: 2-d or 3-d image array.

    Returns:
        Tuple ``(image, zoom_factor)``. The factor is the smallest
        target/current ratio over the constrained axes and is applied to the
        first two axes only (a channel axis is left untouched). When no axis
        is constrained, or the factor is exactly 1, ``im`` itself is returned
        with a factor of 1.

    Raises:
        ValueError: If ``im`` is neither 2-d nor 3-d.
    """
    dims = im.ndim
    if dims not in [2, 3]:
        raise ValueError('Invalid image dimensions')

    ratios = [
        target / current
        for (target, current) in zip(shape, im.shape) if target != -1
    ]
    # Previously an all-unconstrained ``shape`` crashed in ``min([])``.
    if not ratios:
        return im, 1.

    zoom_factor = min(ratios)
    if zoom_factor == 1.:
        return im, 1.

    zoom_seq = [1] * dims
    zoom_seq[:2] = [zoom_factor] * 2
    return zoom(im, zoom=zoom_seq, order=1, mode='nearest'), zoom_factor


def ma(seq, length):
    """Computes the moving average of a 1-d sequence.

    Args:
        seq: 1-d sequence of numbers.
        length: Window size, at least 1. NOTE(review): windows longer than
            ``seq`` are not handled specially — confirm callers never pass
            them.

    Returns:
        1-d array with exactly ``len(seq)`` samples: the 'valid' moving
        average, edge-padded with ``length // 2`` samples on the left and
        ``(length - 1) // 2`` on the right. (Padding ``length // 2`` on both
        sides, as before, yielded one sample too many for even windows; the
        retained samples keep their previous indices.)

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError('Window length must be at least 1')
    return np.pad(
        np.convolve(seq, [1 / length] * length, mode='valid'),
        (length // 2, (length - 1) // 2),
        mode='edge',
    )


def deriv(seq):
    """Computes the derivative of a 1-d sequence using finite differences
    approximation.

    Interior samples are the unnormalised central difference
    ``seq[i + 1] - seq[i - 1]``; the two end samples repeat their neighbour
    so the result has the same length as ``seq``.
    """
    return np.pad(
        np.convolve(seq, [1, 0, -1], mode='valid'),
        (1,),
        mode='edge',
    )


def getbounds(im, ymax=None, xmax=None):
    """Estimates the top and left boundaries of a light, rectangular object in a
    flattened image by optimizing the derivatives of the col and row sums of
    the image's intensity function.

    Args:
        im: 2-d array.
        ymax: Only rows ``[0, ymax)`` are searched; defaults to all rows.
        xmax: Only columns ``[0, xmax)`` are searched; defaults to all columns.

    Returns:
        ``((top, left), (dy, dx))`` where ``top``/``left`` are the positions
        of the largest smoothed derivative of the row/column sums inside the
        search range, and ``dy``/``dx`` are those smoothed derivatives. The
        smoothing window is 1% of the profile length (at least 1).
    """
    ymax, xmax = [
        limit if limit is not None else size
        for (limit, size) in zip((ymax, xmax), im.shape)
    ]
    row_profile = np.sum(im, axis=1)
    col_profile = np.sum(im, axis=0)
    dy, dx = (
        ma(deriv(profile), max(1, len(profile) // 100))
        for profile in (row_profile, col_profile)
    )
    top = np.argmax(dy[:ymax])
    left = np.argmax(dx[:xmax])
    return (top, left), (dy, dx)


def restriction(H, W, x1s, t):
    """Given x1s and t, computes the other xss:s (s.t. them forming a rectangle
    with height H and width W).

    Args:
        H: Rectangle height.
        W: Rectangle width.
        x1s: First corner as a two-element sequence.
        t: Rotation angle in radians.

    Returns:
        ``[x1s, x2s, x3s, x4s]`` where ``x2s = x1s + W * (sin t, cos t)``,
        ``x3s = x1s + H * (cos t, -sin t)`` and ``x4s = x2s + x3s - x1s``.
    """
    x11, x12 = x1s
    x2s = [x11 + W * np.sin(t), x12 + W * np.cos(t)]
    x3s = [x11 + H * np.cos(t), x12 - H * np.sin(t)]
    x4s = [x2s[0] + x3s[0] - x11, x2s[1] + x3s[1] - x12]
    return [x1s, x2s, x3s, x4s]


def drestricted_cost(H, W, yss, x1s, t):
    """Derivative of restricted cost function w.r.t. to x1s and t.

    Applies the chain rule through ``restriction``: every corner moves
    one-to-one with ``x1s``, so the ``x1s`` gradient is the sum of the four
    corner gradients; the ``t`` gradient weights the gradients of corners
    2-4 by the derivative of their position with respect to ``t``.

    Returns:
        ``[dx1s, dt]`` with ``dx1s`` a two-element list and ``dt`` a scalar.
    """
    dx1s, dx2s, dx3s, dx4s = dcost(yss, restriction(H, W, x1s, t))
    sin_t, cos_t = np.sin(t), np.cos(t)
    dx1sr = [
        dx1s[0] + dx2s[0] + dx3s[0] + dx4s[0],
        dx1s[1] + dx2s[1] + dx3s[1] + dx4s[1],
    ]
    dt = dx2s[0] * W * cos_t - dx2s[1] * W * sin_t + \
        -dx3s[0] * H * sin_t - dx3s[1] * H * cos_t + \
        dx4s[0] * (W * cos_t - H * sin_t) + \
        dx4s[1] * (-W * sin_t - H * cos_t)
    return [dx1sr, dt]


def restricted_cost(H, W, yss, x1s, t):
    """Restricted cost function.

    Cost of the rectangle produced by ``restriction(H, W, x1s, t)`` measured
    against the target corners ``yss``.
    """
    return cost(yss, restriction(H, W, x1s, t))


def dcost(yss, xss):
    """Derivative of the cost function w.r.t. to the xss.

    Returns one ``[2 * (x - y), ...]`` list per point pair.
    """
    return [
        [2 * (x - y) for (x, y) in zip(xs, ys)]
        for (xs, ys) in zip(xss, yss)
    ]


def cost(yss, xss):
    """Euclidean cost function.

    Sum of squared coordinate differences over all paired points.
    """
    return sum(
        sum((x - y) ** 2 for (x, y) in zip(xs, ys))
        for (xs, ys) in zip(xss, yss)
    )


def optimize_cost(H, W, yss, x1s0, t0, n_iter=10000, lr_x=1e-3, lr_t=1e-9):
    """Optimize cost by gradient descent.

    Args:
        H: Rectangle height.
        W: Rectangle width.
        yss: Four target corners, in the order produced by ``restriction``.
        x1s0: Initial first corner. It is copied, never modified — callers
            may pass an element of ``yss`` without the target silently
            following the estimate.
        t0: Initial rotation angle in radians.
        n_iter: Number of gradient steps.
        lr_x: Step size for the corner coordinates.
        lr_t: Step size for the rotation angle.

    Returns:
        ``(x1s, t)``: the optimised first corner (a new list) and angle.
    """
    x1s = list(x1s0)
    t = t0
    f = partial(restricted_cost, H, W, yss)
    df = partial(drestricted_cost, H, W, yss)
    # Evaluating the loss is only needed for the debug message.
    debug = LOG.isEnabledFor(logging.DEBUG)
    dx1s, dt = [0., 0.], 0.
    for i in range(n_iter):
        dx1s, dt = df(x1s, t)
        x1s[0] -= lr_x * dx1s[0]
        x1s[1] -= lr_x * dx1s[1]
        t -= lr_t * dt
        if debug:
            LOG.debug('Iteration %d, loss=%.2e', i, f(x1s, t))
    LOG.debug('-----------------------')
    LOG.debug('x1s: %s', x1s)
    LOG.debug('dx1s: %s', dx1s)
    LOG.debug('t: %.3f', t)
    LOG.debug('dt: %.3f', dt)
    return x1s, t
def align(
        im_file,
        im_size,
        win_size,
        annotate=False,
        output_directory=None,
        array_shape=(783, 1918),
):
    """Runs the frame detection.

    Detects the four corners of the array frame in the image, fits a rotated
    rectangle to them and writes the pixel position of every spot to
    ``<output_directory>/<image name without extension>.tsv``.

    Args:
        im_file: Path of the image to read.
        im_size: Target size passed to ``zoomto`` (``-1`` = unconstrained).
        win_size: ``(height, width)`` of the window, in rescaled pixels, used
            for the second bounds detection around each coarse corner.
        annotate: If true, also write ``<name>.annotated.tif`` with the spot
            positions marked on a copy of the original image.
        output_directory: Where results are written; defaults to ``'.'`` and
            is created when missing.
        array_shape: ``(rows, columns)`` of the spot grid.

    Fixes relative to the previous implementation:
        * the array width used ``tr[0] - tr[0]`` / ``br[0] - br[0]`` (always
          zero) instead of the offset between the left and right corners;
        * the detected top-left corner was shared with the optimiser's
          working state, so its target moved with the estimate;
        * file names without a ``.`` produced an empty output name;
        * single-channel images crashed when summing over a channel axis;
        * spots falling outside the image broke (or wrapped around in) the
          annotation step.
    """
    if output_directory is None:
        output_directory = '.'
    else:
        os.makedirs(output_directory, exist_ok=True)

    im_file_no_ext = os.path.splitext(os.path.basename(im_file))[0]

    def _go(im_):
        # Coarse estimate, searched in the first eighth of each axis only.
        (t1, l1), _ = getbounds(im_, *[s // 8 for s in im_.shape])
        slices = [
            slice(max(c - s // 2, 0), c + (s + 1) // 2 + 1)
            for (c, s) in zip((t1, l1), win_size)
        ]
        # Refined estimate inside a window centred on the coarse one.
        (t2, l2), _ = getbounds(im_[tuple(slices)])
        return [slices[0].start + t2, slices[1].start + l2]

    def _annotate(spots, image):
        # NOTE(review): non-integer images are marked with 1 — confirm this
        # is the intended full-scale value for float input.
        if np.issubdtype(image.dtype, np.integer):
            maxval = np.iinfo(image.dtype).max
        else:
            maxval = 1
        ys, xs = spots[0], spots[1]
        inside = (
            (ys >= 0) & (ys < image.shape[0]) &
            (xs >= 0) & (xs < image.shape[1])
        )
        channels = image.shape[-1] if image.ndim == 3 else None
        image[ys[inside], xs[inside]] = (
            [maxval, 0, 0]
            if channels == 3 else
            [maxval, 0, 0, maxval]
            if channels == 4 else
            maxval
        )
        save_path = os.path.join(
            output_directory,
            f'{im_file_no_ext}.annotated.tif',
        )
        LOG.info('Saving annotated image to %s', save_path)
        imwrite(save_path, image)

    def _dist(a, b):
        return np.sqrt((a[1] - b[1]) ** 2 + (a[0] - b[0]) ** 2)

    im = imread(im_file)
    # Annotation is drawn on the original, unscaled pixels.
    raw = im.copy() if annotate else None

    LOG.info('Scaling image to %dx%d', *im_size)
    im, zoom_factor = zoomto(im_size, im)

    LOG.info('Applying sobel filter and flattening')
    im = sobel(im, 1.5)
    if im.ndim == 3:
        im = np.sum(im, axis=2)

    LOG.info('Running bounds detection')

    # The detector only finds top/left edges, so the other corners are
    # obtained on flipped views and mapped back to unflipped coordinates.
    LOG.debug('Running bounds detection on the top-left corner')
    tl = _go(im)

    LOG.debug('Running bounds detection on the top-right corner')
    tr = _go(im[:, ::-1])
    tr[1] = im.shape[1] - tr[1] - 1

    LOG.debug('Running bounds detection on the bottom-left corner')
    bl = _go(im[::-1, :])
    bl[0] = im.shape[0] - bl[0] - 1

    LOG.debug('Running bounds detection on the bottom-right corner')
    br = _go(im[::-1, ::-1])
    br[0] = im.shape[0] - br[0] - 1
    br[1] = im.shape[1] - br[1] - 1

    # Back to the coordinates of the unscaled image.
    tl, tr, bl, br = [[a / zoom_factor for a in b] for b in (tl, tr, bl, br)]
    LOG.info('Unaligned result: top-left=%s', tl)
    LOG.info('Unaligned result: top-right=%s', tr)
    LOG.info('Unaligned result: bottom-left=%s', bl)
    LOG.info('Unaligned result: bottom-right=%s', br)

    # [height, width]: mean length of the two opposite sides.
    array_size_px = [
        (_dist(tl, bl) + _dist(tr, br)) / 2,
        (_dist(tl, tr) + _dist(bl, br)) / 2,
    ]
    x1s, t = optimize_cost(
        *array_size_px,
        [tl, tr, bl, br],
        list(tl),  # copy: the start point must not alias the target corner
        0,
    )
    tl_, tr_, bl_, br_ = restriction(*array_size_px, x1s, t)

    print('Rotation=%.3f rad' % t)
    print('Top-left=%s' % tl_)
    print('Top-right=%s' % tr_)
    print('Bottom-left=%s' % bl_)
    print('Bottom-right=%s' % br_)

    n_rows, n_cols = array_shape
    # Row-major enumeration of the grid: column index varies fastest.
    grid_y, grid_x = np.indices((n_rows, n_cols)).reshape(2, -1)

    # index labels from 1
    spot_labels = np.stack([grid_y, grid_x]) + 1

    # Spot centres in array-local pixels (homogeneous coordinates). Odd rows
    # are shifted by half a spot relative to even rows.
    spots = np.stack([
        (grid_y + 0.5) * (array_size_px[0] / n_rows),
        (grid_x + 0.5 * (0.5 + grid_y % 2)) * (array_size_px[1] / n_cols),
        np.ones(grid_y.shape),
    ])

    R = np.array([
        [np.cos(t), np.sin(t), 0],
        [-np.sin(t), np.cos(t), 0],
        [0, 0, 1],
    ])
    T = np.array([
        [1, 0, tl_[0]],
        [0, 1, tl_[1]],
        [0, 0, 1],
    ])

    # Rotate about the array origin, then translate to the fitted corner.
    spots = np.round(T @ R @ spots).astype(int)

    df = pd.concat(
        [
            pd.DataFrame(
                spot_labels.T,
                columns=['spot_y', 'spot_x'],
                dtype=int,
            ),
            pd.DataFrame(
                spots[:2, :].T,
                columns=['spot_px_y', 'spot_px_x'],
                dtype=int,
            ),
        ],
        axis=1,
    )

    save_path = os.path.join(output_directory, f'{im_file_no_ext}.tsv')
    LOG.info('Saving spots file to %s', save_path)
    df.to_csv(save_path, index=False, sep='\t')

    if annotate:
        _annotate(spots, raw)
TRUE))\n", 51 | " p=unlist(mclapply(X=lk_orig,FUN=function(x){sum(lk_shuff>=x)/n_shuff},mc.cores=1,mc.preschedule = TRUE))\n", 52 | " return(p*p_no0)\n", 53 | " }\n", 54 | " if (n_shuff!=0){\n", 55 | " likelihoods[lk!=-Inf,emp_pval:=get_emp_pval(lk,umi_distrib,ClusterName,ref,n_shuff),\n", 56 | " by=c(\"umi_distrib\",\"ClusterName\")]\n", 57 | " likelihoods[lk==-Inf,emp_pval:=1,]\n", 58 | " }else{\n", 59 | " likelihoods[,emp_pval:=NA,]\n", 60 | " }\n", 61 | " return(likelihoods)\n", 62 | "}" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "#### Process the the output from calc_lk" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "process_calk_lk=function(likelihoods,Ng=Ngenes,exp=NULL){\n", 79 | " find_nexthighest=function(lk){\n", 80 | " lk_sort=sort(lk,decreasing=TRUE)\n", 81 | " nexth=sapply(lk,function(x){c(lk_sort[lk_sort=thres[2]&emp_pval_adjust_BH<=thres[1]&lk_rat>=0.8,na.rm = TRUE),by=c(\"x\",\"y\")]\n", 288 | "write.table(likelihoods_li,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "### Breast cancer" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "thres=c(0.05,0.7) #p-value, lk_norm" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "tnbc_norm=fread(\"results/tnbc_norm.tsv\")" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "#### 1x (hd)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | 
"dat=fread(\"BC/CN21_BC24350_E2_filtered_red_ut.tsv\")\n", 330 | "tag=\"E2_tnbc_hd\"" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "dat=fread(\"BC_nc/CN21_BC24350_E2_unmodgtf_filtered_red_ut.tsv\")\n", 340 | "tag=\"E2_unmodgtf_tnbc_hd\"" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "dat=fread(\"BC_nc/CN21_BC24350_C1_unmodgtf_filtered_red_ut.tsv\")\n", 350 | "tag=\"C1_unmodgtf_tnbc_hd\"" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "dat=fread(\"BC_nc/CN21_BC24350_D1_unmodgtf_filtered_red_ut.tsv\")\n", 360 | "tag=\"D1_unmodgtf_tnbc_hd\"" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "#### binned " 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "dat=fread(\"BC_binned/hdst-lowres.tsv\")\n", 377 | "tag=\"E2_tnbc_low\"" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "dat=fread(\"BC_binned_nc/hdst-lowres.tsv\")\n", 387 | "tag=\"E2_unmodgtf_tnbc_low\"" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "dat=fread(\"BC_binned_nc/C1/hdst-lowres.tsv\")\n", 397 | "tag=\"C1_unmodgtf_tnbc_low\"" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "dat=fread(\"BC_binned_nc/D1/hdst-lowres.tsv\")\n", 407 | "tag=\"D1_unmodgtf_tnbc_low\"" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | 
"source": [ 414 | "#### segmentd" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "dat=fread(\"BC/CN21_BC24350_E2_filtered_red_ut_segmentd.tsv\")\n", 424 | "tag=\"E2_tnbc_seg\"" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "dat=fread(\"BC/CN21_BC24350_E2_unmodgtf_filtered_red_ut_segmentd.tsv\")\n", 434 | "tag=\"E2_unmodgtf_tnbc_seg\"" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "#### here actually run" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "ns=1000\n", 451 | "system.time(\n", 452 | "expr=likelihoods_tnbc<-calc_lk(tnbc_norm,dat,n_shuff = ns)\n", 453 | ")\n", 454 | "write.table(likelihoods_tnbc,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE) #fastp_ns only introduced 5/15\n", 455 | "likelihoods_tnbc=process_calk_lk(likelihoods = likelihoods_tnbc,Ng = 10000)\n", 456 | "likelihoods_tnbc[,N_ct:=sum(lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]&lk_rat>=0.8,na.rm = TRUE),by=c(\"x\",\"y\")]\n", 457 | "write.table(likelihoods_tnbc,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)" 458 | ] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "R", 464 | "language": "R", 465 | "name": "ir" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": "r", 469 | "file_extension": ".r", 470 | "mimetype": "text/x-r-source", 471 | "name": "R", 472 | "pygments_lexer": "r", 473 | "version": "3.5.0" 474 | } 475 | }, 476 | "nbformat": 4, 477 | "nbformat_minor": 2 478 | } 479 | -------------------------------------------------------------------------------- 
/enrichment_analysis/enrichment_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "library(data.table)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "setwd(project_directory) # set wd to project directory containing all the sub folder" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Location enrichment analysis (genes or cells)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "test_loc_enrichment=function(data,lk_thres=NULL,p_thres=NULL,sub_col){\n", 35 | " if (!is.null(lk_thres)){dat_filt=data[lk_norm>=lk_thres]}else{dat_filt=data}\n", 36 | " if (!is.null(p_thres)){dat_filt=dat_filt[emp_pval_adjust_BH<=p_thres]}\n", 37 | " \n", 38 | " cts=unique(dat_filt$ClusterName)\n", 39 | " anats=unique(dat_filt$layer)\n", 40 | " \n", 41 | " ftest_res=data.table()\n", 42 | " for (ict in cts){\n", 43 | " for (ianat in anats){\n", 44 | " # print(paste0(ict,\" \",ianat))\n", 45 | " cont_mat=matrix(c(dat_filt[ClusterName==ict&layer==ianat,length(unique(get(sub_col)))], \n", 46 | " dat_filt[ClusterName!=ict&layer==ianat,length(unique(get(sub_col)))], \n", 47 | " dat_filt[ClusterName==ict&layer!=ianat,length(unique(get(sub_col)))], \n", 48 | " dat_filt[ClusterName!=ict&layer!=ianat,length(unique(get(sub_col)))]),\n", 49 | " nrow = 2,\n", 50 | " dimnames = list(anat = c(\"yes\", \"no\"),\n", 51 | " ct = c(\"yes\", \"no\")))\n", 52 | " ft=fisher.test(cont_mat,alternative = \"greater\",conf.int = TRUE)\n", 53 | " ftest_res=rbindlist(list(ftest_res,data.table(cell_type=ict,\n", 54 | " layer=ianat,\n", 55 | " p.value=ft$p.value,\n", 56 | " 
cof.int.low=ft$conf.int[1],\n", 57 | " cof.int.high=ft$conf.int[2])))\n", 58 | " }\n", 59 | " }\n", 60 | " return(ftest_res)\n", 61 | "}" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### MOB cell type enrichment analysis" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "anat=fread(\"MOB/CN13_D2_barcodes_under_tissue_annot.tsv\")\n", 78 | "setnames(anat,\"poly.ID\",\"layer\")" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "thres=c(0.01,0.1) #p-value, lk_norm" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "#### 1x (hd)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "lk=fread(\"results/ct_likelihoods_li_hd_fastp_1000.tsv\")\n", 104 | "tag=\"li_hd\"" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_hd_fastp_1000.tsv\")\n", 114 | "tag=\"unmodgtf_li_hd\"" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "#prepare\n", 124 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 125 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "#### binned " 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "lk=fread(\"results/ct_likelihoods_li_low_fastp_1000.tsv\")\n", 142 | 
"coords=fread(\"MOB_binned/hdst-lowres-coordinates.csv\")\n", 143 | "tag=\"li_low\"" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_low_fastp_1000.tsv\")\n", 153 | "coords=fread(\"MOB_binned/hdst-lowres-coordinates.csv\")\n", 154 | "tag=\"unmodgtf_li_low\"" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "#prepare\n", 164 | "sub=lk[bin==\"5x\"]\n", 165 | "sub[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 166 | "lk_anat=merge(sub[lk_rat==1&N_ct==1],\n", 167 | " unique(coords[,c(\"region\",\"5x_x\",\"5x_y\"),with=FALSE][!duplicated(cbind(`5x_x`,`5x_y`))]),\n", 168 | " by.x=c(\"x\",\"y\"),\n", 169 | " by.y=c(\"5x_x\",\"5x_y\"))\n", 170 | "setnames(lk_anat,\"region\",\"layer\")" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "#### segmentd" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "lk=fread(\"results/ct_likelihoods_li_seg_fastp_1000.tsv\")\n", 187 | "tag=\"li_seg\"" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_seg_fastp_1000.tsv\")\n", 197 | "tag=\"unmodgtf_li_seg\"" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "#prepare\n", 207 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 208 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))" 209 | ] 210 | }, 211 | { 212 | 
"cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### here actually run" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "#enrichment analyis\n", 225 | "res_li=test_loc_enrichment(data = lk_anat,p_thres = thres[1],lk_thres = thres[2],sub_col = \"bc\")\n", 226 | "res_li[,p.value.adjust:=p.adjust(p.value,method = \"BY\"),]\n", 227 | "write.table(res_li,paste0(\"results/ct_anat_enrich_\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### Breast cancer cell type enrichment analysis" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "anat=fread(\"BC/CN21_BC24350_E2_barcodes_under_tissue_annot.tsv\")\n", 244 | "setnames(anat,\"poly.ID\",\"layer\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "thres=c(0.05,0.7) #p-value, lk_norm" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "#### 1x (hd)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_hd.tsv\")\n", 270 | "tag=\"E2_tnbc_hd\"" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_hd.tsv\")\n", 280 | "tag=\"E2_unmodgtf_tnbc_hd\"" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "#prepare\n", 290 | 
"lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 291 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "#### binned " 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_low.tsv\")\n", 308 | "coords=fread(\"BC_binned/hdst-breast-cancer-lowres-coordinates.csv\")\n", 309 | "tag=\"E2_tnbc_low\"" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_low.tsv\")\n", 319 | "coords=fread(\"BC_binned/hdst-breast-cancer-lowres-coordinates.csv\")\n", 320 | "tag=\"E2_unmodgtf_tnbc_low\"" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "#prepare\n", 330 | "sub=lk[bin==\"5x\"]\n", 331 | "sub[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 332 | "lk_anat=merge(sub[lk_rat==1&N_ct==1],\n", 333 | " unique(coords[,c(\"region\",\"5x_x\",\"5x_y\"),with=FALSE][!duplicated(cbind(`5x_x`,`5x_y`))]),\n", 334 | " by.x=c(\"x\",\"y\"),\n", 335 | " by.y=c(\"5x_x\",\"5x_y\"))\n", 336 | "setnames(lk_anat,\"region\",\"layer\")" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "#### segmentd" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_seg.tsv\")\n", 353 | "tag=\"E2_tnbc_seg\"" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": 
{}, 360 | "outputs": [], 361 | "source": [ 362 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_seg.tsv\")\n", 363 | "tag=\"E2_unmodgtf_tnbc_seg\"" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "#prepare\n", 373 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n", 374 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "#### here actually run" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "#enrichment analysis\n", 391 | "res_li=test_loc_enrichment(data = lk_anat,p_thres = thres[1],lk_thres = thres[2],sub_col = \"bc\")\n", 392 | "res_li[,p.value.adjust:=p.adjust(p.value,method = \"BY\"),]\n", 393 | "write.table(res_li,paste0(\"results/ct_anat_enrich_\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "### MOB enrichment of HDST layer-specific genes among the layer-specific genes according to ABA" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "#aba genes\n", 410 | "aba_layer_spec=fread(\"ext_data/aba_MOB_diff.tsv\")" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "#hdst genes\n", 420 | "hdst_markers=fread(\"DE/Table_DEGs.csv\",drop = \"V1\")\n", 421 | "setnames(hdst_markers,names(hdst_markers),c(\"gene\",\"lfc\",\"p.value\",\"ClusterName\"))\n", 422 | "pthres=0.1\n", 423 | "tag=\"\"" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 
429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "#hdst genes\n", 433 | "hdst_markers=fread(\"DE/mob_region_de_unmodgtf.csv\")\n", 434 | "setnames(hdst_markers,names(hdst_markers),c(\"gene\",\"lfc\",\"p.value\",\"ClusterName\"))\n", 435 | "pthres=0.05\n", 436 | "tag=\"_unmodgtf\"" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "hdst_aba=merge(hdst_markers[lfc>1.5& p.value1.5],by=\"gene\",all=FALSE)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "#remove ONL because it doesn't exist in ABA\n", 455 | "hdst_aba=hdst_aba[!ClusterName==\"Olfactory Nerve Layer (ONL)\"]\n", 456 | "#combine some layers to match ABA annotation\n", 457 | "hdst_aba[ClusterName%in%c(\"Granule Cell Layer External (GCL-E)\",\"Granule Cell Layer Internal (GCL-I)\",\"Rostral Migratory System (RMS)\",\"Ependymal Cell Zone (E)\"),ClusterName:=\"Granule Layer (GR)\",]" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "hdst_aba_enrich=test_loc_enrichment(hdst_aba,sub_col = \"gene\")\n", 467 | "hdst_aba_enrich[,p.adjust:=p.adjust(p.value,method = \"BY\"),]" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "write.table(hdst_aba_enrich,paste0(\"results/aba_deg_enrich\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)" 477 | ] 478 | } 479 | ], 480 | "metadata": { 481 | "kernelspec": { 482 | "display_name": "R", 483 | "language": "R", 484 | "name": "ir" 485 | }, 486 | "language_info": { 487 | "codemirror_mode": "r", 488 | "file_extension": ".r", 489 | "mimetype": "text/x-r-source", 490 | "name": "R", 491 | "pygments_lexer": "r", 492 | "version": "3.5.0" 493 | } 494 | }, 495 | 
"nbformat": 4, 496 | "nbformat_minor": 2 497 | } 498 | -------------------------------------------------------------------------------- /files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klarman-cell-observatory/hdst/1dc0578c6b8539bc4ec704ade6a2fe7165321bc5/files.png -------------------------------------------------------------------------------- /hdst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klarman-cell-observatory/hdst/1dc0578c6b8539bc4ec704ade6a2fe7165321bc5/hdst.png -------------------------------------------------------------------------------- /pre_processing/MOB-00-ABA Gene retrieval via API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Differential search via ABA API" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from allensdk.api.queries.ontologies_api import OntologiesApi, StructureTree" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Get MOB structure IDs from Ontology Structure Graph" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "oapi = OntologiesApi()\n", 34 | "structure_graph = oapi.get_structures_with_sets([1]) # 1 is the id of the adult mouse structure graph\n", 35 | "\n", 36 | "# This removes some unused fields returned by the query\n", 37 | "structure_graph = StructureTree.clean_structures(structure_graph) \n", 38 | "tree = StructureTree(structure_graph)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | 
"text/plain": [ 49 | "[{'acronym': 'MOBgl',\n", 50 | " 'rgb_triplet': [130, 199, 174],\n", 51 | " 'graph_id': 1,\n", 52 | " 'graph_order': 381,\n", 53 | " 'id': 212,\n", 54 | " 'name': 'Main olfactory bulb, glomerular layer',\n", 55 | " 'structure_id_path': [997, 8, 567, 688, 695, 698, 507, 212],\n", 56 | " 'structure_set_ids': [10, 12]},\n", 57 | " {'acronym': 'MOBgr',\n", 58 | " 'rgb_triplet': [130, 199, 174],\n", 59 | " 'graph_id': 1,\n", 60 | " 'graph_order': 382,\n", 61 | " 'id': 220,\n", 62 | " 'name': 'Main olfactory bulb, granule layer',\n", 63 | " 'structure_id_path': [997, 8, 567, 688, 695, 698, 507, 220],\n", 64 | " 'structure_set_ids': [10, 12]}]" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "mob = tree.get_structures_by_name(['Main olfactory bulb'])\n", 74 | "mob_structures = tree.children([mob[0]['id']])[0]\n", 75 | "\n", 76 | "mob_structures[:2]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Functions for differential gene expression search" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def retrieve_ABA(url, start_row=0, num_rows=2000):\n", 93 | " '''Retrieves an ABA query result as CSV in multiple steps using paging'''\n", 94 | "\n", 95 | " filled_url = url % {'start_row': start_row, 'num_rows': num_rows}\n", 96 | " query_df = pd.read_csv(filled_url)\n", 97 | " final_df = []\n", 98 | "\n", 99 | " while (len(query_df) > 0):\n", 100 | " final_df.append(query_df)\n", 101 | " start_row += num_rows\n", 102 | " filled_url = url % {'start_row': start_row, 'num_rows': num_rows}\n", 103 | " query_df = pd.read_csv(filled_url)\n", 104 | "\n", 105 | " return pd.concat(final_df).reset_index()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [], 113 | 
"source": [ 114 | "from allensdk.api.queries.connected_services import ConnectedServices\n", 115 | "\n", 116 | "def build_differential_search_url(target_structures, contrast_structures, threshold=1):\n", 117 | " cs = ConnectedServices()\n", 118 | "\n", 119 | " # For parameters see: http://help.brain-map.org/display/api/Connected+Services+and+Pipes#ConnectedServicesandPipes-service::mouse_differential\n", 120 | " url = cs.build_url('mouse_differential', kwargs={'set': 'mouse_coronal',\n", 121 | " 'structures2': target_structures,\n", 122 | " 'structures1': contrast_structures,\n", 123 | " 'threshold2': [threshold, 50],\n", 124 | " 'threshold1': [0, 50],\n", 125 | " 'start_row': '%(start_row)s', #placeholders are resolved in the download_ABA function\n", 126 | " 'num_rows': '%(num_rows)s'\n", 127 | " }).replace('query.json', 'query.csv')\n", 128 | " return url" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Test if URL construction and download works" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "http://api.brain-map.org/api/v2/data/query.csv?q=service::mouse_differential[num_rows$eq%(num_rows)s][set$eqmouse_coronal][start_row$eq%(start_row)s][structures1$eq220,228,236,244][structures2$eq212][threshold1$eq0,50][threshold2$eq1,50]\n" 148 | ] 149 | }, 150 | { 151 | "data": { 152 | "text/html": [ 153 | "

\n", 154 | "\n", 167 | "\n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | "

	index	id	name	gene-id	gene-symbol	gene-name	entrez-id	chromosome	plane-of-section	specimen-id	fold-change	target-sum	contrast-sum	num-target-samples	num-contrast-samples
0	0	73520993	RP_051101_02_G12	88550	Kctd12	potassium channel tetramerisation domain conta...	239217	14	coronal	NaN	4.234	963.874	1020.211	83	372
1	1	74357573	RP_050915_03_C03	88861	Tspan18	tetraspanin 18	241556	2	coronal	NaN	4.183	142.255	156.017	80	367
2	2	72008121	RP_051017_01_B10	14038	Fmo1	flavin containing monooxygenase 1	14261	1	coronal	NaN	4.041	211.137	234.167	83	372
3	3	74512017	RP_060220_03_D03	83946	Phldb2	pleckstrin homology-like domain, family B, mem...	208177	16	coronal	NaN	3.971	222.445	251.422	82	368
4	4	73929578	RP_050927_03_H11	21585	Thbs2	thrombospondin 2	21826	17	coronal	NaN	3.922	175.864	200.959	83	372

\n", 281 | "

" 282 | ], 283 | "text/plain": [ 284 | " index id name gene-id gene-symbol \\\n", 285 | "0 0 73520993 RP_051101_02_G12 88550 Kctd12 \n", 286 | "1 1 74357573 RP_050915_03_C03 88861 Tspan18 \n", 287 | "2 2 72008121 RP_051017_01_B10 14038 Fmo1 \n", 288 | "3 3 74512017 RP_060220_03_D03 83946 Phldb2 \n", 289 | "4 4 73929578 RP_050927_03_H11 21585 Thbs2 \n", 290 | "\n", 291 | " gene-name entrez-id chromosome \\\n", 292 | "0 potassium channel tetramerisation domain conta... 239217 14 \n", 293 | "1 tetraspanin 18 241556 2 \n", 294 | "2 flavin containing monooxygenase 1 14261 1 \n", 295 | "3 pleckstrin homology-like domain, family B, mem... 208177 16 \n", 296 | "4 thrombospondin 2 21826 17 \n", 297 | "\n", 298 | " plane-of-section specimen-id fold-change target-sum contrast-sum \\\n", 299 | "0 coronal NaN 4.234 963.874 1020.211 \n", 300 | "1 coronal NaN 4.183 142.255 156.017 \n", 301 | "2 coronal NaN 4.041 211.137 234.167 \n", 302 | "3 coronal NaN 3.971 222.445 251.422 \n", 303 | "4 coronal NaN 3.922 175.864 200.959 \n", 304 | "\n", 305 | " num-target-samples num-contrast-samples \n", 306 | "0 83 372 \n", 307 | "1 80 367 \n", 308 | "2 83 372 \n", 309 | "3 82 368 \n", 310 | "4 83 372 " 311 | ] 312 | }, 313 | "execution_count": 6, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "mob_url = build_differential_search_url(mob_structures[0]['id'], [x['id'] for x in mob_structures[1:]])\n", 320 | "print(mob_url)\n", 321 | "mob_df = retrieve_ABA(mob_url)\n", 322 | "mob_df.head()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 7, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | 
"http://mouse.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,structure_sets[id$eq2],rma::options[only$eq'id'],pipe::list[xstructures$eq'id'],service::differential_rows[set$eq'P56coronal'][domain1$eq'220,228,236,244'][domain1_threshold$eq'0,50'][domain2$eq'212'][domain2_threshold$eq'1,50'][start_row$eq%(start_row)s][num_rows$eq%(num_rows)s]\n" 335 | ] 336 | }, 337 | { 338 | "data": { 339 | "text/html": [ 340 | "

\n", 341 | "\n", 354 | "\n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | "

	index	id	name	gene-id	gene-symbol	gene-name	entrez-id	chromosome	plane-of-section	specimen-id	fold-change	target-sum	contrast-sum	num-target-samples	num-contrast-samples
0	0	73520993	RP_051101_02_G12	88550	Kctd12	potassium channel tetramerisation domain conta...	239217	14	coronal	NaN	4.234	963.874	1020.211	83	372
1	1	74357573	RP_050915_03_C03	88861	Tspan18	tetraspanin 18	241556	2	coronal	NaN	4.183	142.255	156.017	80	367
2	2	72008121	RP_051017_01_B10	14038	Fmo1	flavin containing monooxygenase 1	14261	1	coronal	NaN	4.041	211.137	234.167	83	372
3	3	74512017	RP_060220_03_D03	83946	Phldb2	pleckstrin homology-like domain, family B, mem...	208177	16	coronal	NaN	3.971	222.445	251.422	82	368
4	4	73929578	RP_050927_03_H11	21585	Thbs2	thrombospondin 2	21826	17	coronal	NaN	3.922	175.864	200.959	83	372

\n", 468 | "

" 469 | ], 470 | "text/plain": [ 471 | " index id name gene-id gene-symbol \\\n", 472 | "0 0 73520993 RP_051101_02_G12 88550 Kctd12 \n", 473 | "1 1 74357573 RP_050915_03_C03 88861 Tspan18 \n", 474 | "2 2 72008121 RP_051017_01_B10 14038 Fmo1 \n", 475 | "3 3 74512017 RP_060220_03_D03 83946 Phldb2 \n", 476 | "4 4 73929578 RP_050927_03_H11 21585 Thbs2 \n", 477 | "\n", 478 | " gene-name entrez-id chromosome \\\n", 479 | "0 potassium channel tetramerisation domain conta... 239217 14 \n", 480 | "1 tetraspanin 18 241556 2 \n", 481 | "2 flavin containing monooxygenase 1 14261 1 \n", 482 | "3 pleckstrin homology-like domain, family B, mem... 208177 16 \n", 483 | "4 thrombospondin 2 21826 17 \n", 484 | "\n", 485 | " plane-of-section specimen-id fold-change target-sum contrast-sum \\\n", 486 | "0 coronal NaN 4.234 963.874 1020.211 \n", 487 | "1 coronal NaN 4.183 142.255 156.017 \n", 488 | "2 coronal NaN 4.041 211.137 234.167 \n", 489 | "3 coronal NaN 3.971 222.445 251.422 \n", 490 | "4 coronal NaN 3.922 175.864 200.959 \n", 491 | "\n", 492 | " num-target-samples num-contrast-samples \n", 493 | "0 83 372 \n", 494 | "1 80 367 \n", 495 | "2 83 372 \n", 496 | "3 82 368 \n", 497 | "4 83 372 " 498 | ] 499 | }, 500 | "execution_count": 7, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "from urllib.parse import unquote\n", 507 | "\n", 508 | "# hand-crafted URL from ABA website\n", 509 | "test_url = 'http://mouse.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,structure_sets%5Bid$eq2%5D,rma::options%5Bonly$eq%27id%27%5D,pipe::list%5Bxstructures$eq%27id%27%5D,service::differential_rows%5Bset$eq%27P56coronal%27%5D%5Bdomain1$eq%27220,228,236,244%27%5D%5Bdomain1_threshold$eq%270,50%27%5D%5Bdomain2$eq%27212%27%5D%5Bdomain2_threshold$eq%271,50%27%5D%5Bstart_row$eq%(start_row)s%5D%5Bnum_rows$eq%(num_rows)s%5D'\n", 510 | "test_url = unquote(test_url)\n", 511 | "print(test_url)\n", 512 | "\n", 513 | "mob_manual = 
retrieve_ABA(test_url)\n", 514 | "mob_manual.head()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 8, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "assert mob_df.equals(mob_manual)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "## Perform one-vs-all differential searches for MOB regions" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 9, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "mob_dfs = {}\n", 540 | "\n", 541 | "for region in mob_structures:\n", 542 | " name = region['acronym']\n", 543 | " _id = region['id']\n", 544 | " url = build_differential_search_url(_id, \n", 545 | " [x['id'] for x in mob_structures if x['id'] != _id])\n", 546 | " mob_dfs[name] = retrieve_ABA(url)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 10, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "for k, v in mob_dfs.items():\n", 556 | " v.to_csv(k + '.csv', index=False)" 557 | ] 558 | } 559 | ], 560 | "metadata": { 561 | "kernelspec": { 562 | "display_name": "Python 3", 563 | "language": "python", 564 | "name": "python3" 565 | }, 566 | "language_info": { 567 | "codemirror_mode": { 568 | "name": "ipython", 569 | "version": 3 570 | }, 571 | "file_extension": ".py", 572 | "mimetype": "text/x-python", 573 | "name": "python", 574 | "nbconvert_exporter": "python", 575 | "pygments_lexer": "ipython3", 576 | "version": "3.6.6" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 2 581 | } 582 | -------------------------------------------------------------------------------- /pre_processing/pre-processing_external.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "library(data.table)" 10 | ] 11 | }, 12 | { 
13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "setwd(project_directory) # set wd to project directory containing all the sub folder" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Allen Brain Atlas (ABA) data" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "#### ABA layer specific genes" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "ABA_genes=readLines(\"ext_data/aba_mob-genes_raw.txt\")\n", 42 | "head(ABA_genes)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "ABA_genes_dt=rbindlist(lapply(ABA_genes,function(x){\n", 52 | " spl=unlist(strsplit(x,\",\"))\n", 53 | " g=spl[-c(1)];n=spl[1]\n", 54 | " dt=data.table(layer=n,gene=g)\n", 55 | " return(dt)}\n", 56 | " )\n", 57 | " )\n", 58 | "head(ABA_genes_dt)\n", 59 | "write.table(ABA_genes_dt,\"ext_data/aba_mob-genes.txt\",sep=\"\\t\",quote = FALSE,row.names=FALSE)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### ABA differential genes" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "ABA_diff_files=list.files(path = \"ext_data/\",\"aba_MOB.*.csv\",full.names = TRUE)\n", 76 | "head(ABA_diff_files)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "ABA_diff_dt=data.table()\n", 86 | "for (diff_file in ABA_diff_files){\n", 87 | " tab=fread(diff_file)\n", 88 | " tab[,layer:=unlist(strsplit(diff_file,\"MOB|\\\\.\"))[2]]\n", 89 | " ABA_diff_dt=rbindlist(list(ABA_diff_dt,tab))\n", 90 | "}\n", 91 | 
"setnames(ABA_diff_dt,\"gene-symbol\",\"gene\")\n", 92 | "head(ABA_diff_dt)\n", 93 | "write.table(ABA_diff_dt,\"ext_data/aba_MOB_diff.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Mouse olfactory bulb (MOB) single cell RNA-seq data (Linnarsson)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Calculating the normalized expression values (relative frequencies) for each relevant cell type for the MOB data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 19, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "li_mean=fread(\"ext_data/li_mean_expr.tsv\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 20, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/html": [ 127 | "27998" 128 | ], 129 | "text/latex": [ 130 | "27998" 131 | ], 132 | "text/markdown": [ 133 | "27998" 134 | ], 135 | "text/plain": [ 136 | "[1] 27998" 137 | ] 138 | }, 139 | "metadata": {}, 140 | "output_type": "display_data" 141 | } 142 | ], 143 | "source": [ 144 | "nrow(li_mean)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 21, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/html": [ 155 | "56" 156 | ], 157 | "text/latex": [ 158 | "56" 159 | ], 160 | "text/markdown": [ 161 | "56" 162 | ], 163 | "text/plain": [ 164 | "[1] 56" 165 | ] 166 | }, 167 | "metadata": {}, 168 | "output_type": "display_data" 169 | } 170 | ], 171 | "source": [ 172 | "#there are duplicated gene names in the mean expression matrix, but we don't know why, so we leave them in\n", 173 | "dupl_genes=unique(li_mean$V1[duplicated(li_mean$V1)])\n", 174 | "length(dupl_genes)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 24, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | 
"li_norm=melt(li_mean[,c(grep(\"^OB\",names(li_mean)),grep(\"Neuron\",names(li_mean),invert = TRUE)),with=FALSE],id.vars = \"V1\",variable.name = \"ClusterName\")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 25, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "li_norm[,norm:=value/sum(value),by=\"ClusterName\"]\n", 193 | "li_norm[,log_norm:=log(norm),]\n", 194 | "setnames(li_norm,\"V1\",\"gene\")" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 28, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/html": [ 205 | "63" 206 | ], 207 | "text/latex": [ 208 | "63" 209 | ], 210 | "text/markdown": [ 211 | "63" 212 | ], 213 | "text/plain": [ 214 | "[1] 63" 215 | ] 216 | }, 217 | "metadata": {}, 218 | "output_type": "display_data" 219 | } 220 | ], 221 | "source": [ 222 | "N_ct=length(unique(li_norm$ClusterName))\n", 223 | "N_ct" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 29, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/html": [ 234 | "5848" 235 | ], 236 | "text/latex": [ 237 | "5848" 238 | ], 239 | "text/markdown": [ 240 | "5848" 241 | ], 242 | "text/plain": [ 243 | "[1] 5848" 244 | ] 245 | }, 246 | "metadata": {}, 247 | "output_type": "display_data" 248 | } 249 | ], 250 | "source": [ 251 | "#some genes have 0 expression for all of the cell types.\n", 252 | "li_norm[,N_zero_ct:=length(unique(ClusterName[norm==0])),by=gene]\n", 253 | "length(unique(li_norm[N_zero_ct==N_ct]$gene))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 31, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/html": [ 264 | "22085" 265 | ], 266 | "text/latex": [ 267 | "22085" 268 | ], 269 | "text/markdown": [ 270 | "22085" 271 | ], 272 | "text/plain": [ 273 | "[1] 22085" 274 | ] 275 | }, 276 | "metadata": {}, 277 | "output_type": "display_data" 278 | } 279 | ], 280 | "source": [ 
281 | "#number of genes that are expressed in at least one cell type\n", 282 | "Ngenes=length(unique(li_norm[N_zero_ct!=N_ct]$gene))\n", 283 | "Ngenes" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 13, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "write.table(li_norm,\"results/li_norm.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE) #data as published" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Triple negative breast cancer (tnbc) single-cell RNA seq data \n", 300 | "https://www.nature.com/articles/s41467-018-06052-0#Sec24 \n", 301 | "GSE118389: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "Calculating the normalized expression values (relative frequencies) for each relevant cell type for the breast cancer data" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 3, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "Warning message in fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_counts_rsem.txt\")):\n", 321 | "\"Detected 1534 column names but the data has 1535 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.\"" 322 | ] 323 | }, 324 | { 325 | "data": { 326 | "text/html": [ 327 | "\n", 328 | "\n", 329 | "\n", 330 | "\t\n", 331 | "\t\n", 332 | "\t\n", 333 | "\t\n", 334 | "\t\n", 335 | "\t\n", 336 | "\n", 337 | "

V1	PT089_P1_A01	PT089_P1_A02	PT089_P1_A03	PT089_P1_A04	PT089_P1_A05	PT089_P1_A06	PT089_P1_A07	PT089_P1_A08	PT089_P1_A09	...	PT039_P10_H03_S279	PT039_P10_H05_S281	PT039_P10_H07_S283	PT039_P10_H08_S284	PT039_P10_H09_S285	PT039_P10_H10_S286	PT039_P10_H11_S287	PT039_P10_H12_S288
ZXDC	1.76	5.06	4.86	2.57	7.48	7.26	12.10	2.61	7.59	...	0.00	0.00	1275.78	2.03	0.00	0.00	0.00	85.87
ZYG11A	4.73	111.84	1.26	0.00	1.42	7.26	12.36	4.72	13.45	...	0.00	0.00	1.02	2.18	3.85	0.00	0.00	6.15
ZYG11B	7.86	2.14	4.43	1.77	2.43	0.00	7.56	2.24	6.63	...	3.35	1.91	4.12	1.74	1.02	1.12	3.72	23.32
ZYX	0.00	0.00	1.00	0.00	937.00	0.00	0.00	0.00	0.00	...	0.00	0.00	0.00	3.00	143.00	0.00	0.00	85.00
ZZEF1	0.00	0.00	0.00	0.00	0.00	0.00	0.00	7.00	0.00	...	0.00	0.00	0.00	0.00	6.00	0.00	0.00	27.00
ZZZ3	0.00	0.00	0.00	2.00	0.00	2006.00	0.00	0.00	0.00	...	0.00	0.00	0.00	0.00	0.00	0.00	0.00	23.00

\n" 338 | ], 339 | "text/latex": [ 340 | "\\begin{tabular}{r|lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll}\n", 341 | " V1 & PT089\\_P1\\_A01 & PT089\\_P1\\_A02 & PT089\\_P1\\_A03 & PT089\\_P1\\_A04 & PT089\\_P1\\_A05 & PT089\\_P1\\_A06 & PT089\\_P1\\_A07 & PT089\\_P1\\_A08 & PT089\\_P1\\_A09 & ... 
& PT039\\_P10\\_H03\\_S279 & PT039\\_P10\\_H04\\_S280 & PT039\\_P10\\_H05\\_S281 & PT039\\_P10\\_H06\\_S282 & PT039\\_P10\\_H07\\_S283 & PT039\\_P10\\_H08\\_S284 & PT039\\_P10\\_H09\\_S285 & PT039\\_P10\\_H10\\_S286 & PT039\\_P10\\_H11\\_S287 & PT039\\_P10\\_H12\\_S288\\\\\n", 342 | "\\hline\n", 343 | "\t ZXDC & 1.76 & 5.06 & 4.86 & 2.57 & 7.48 & 7.26 & 12.10 & 2.61 & 7.59 & ... & 0.00 & 0 & 0.00 & 0 & 1275.78 & 2.03 & 0.00 & 0.00 & 0.00 & 85.87 \\\\\n", 344 | "\t ZYG11A & 4.73 & 111.84 & 1.26 & 0.00 & 1.42 & 7.26 & 12.36 & 4.72 & 13.45 & ... & 0.00 & 0 & 0.00 & 0 & 1.02 & 2.18 & 3.85 & 0.00 & 0.00 & 6.15 \\\\\n", 345 | "\t ZYG11B & 7.86 & 2.14 & 4.43 & 1.77 & 2.43 & 0.00 & 7.56 & 2.24 & 6.63 & ... & 3.35 & 0 & 1.91 & 0 & 4.12 & 1.74 & 1.02 & 1.12 & 3.72 & 23.32 \\\\\n", 346 | "\t ZYX & 0.00 & 0.00 & 1.00 & 0.00 & 937.00 & 0.00 & 0.00 & 0.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 3.00 & 143.00 & 0.00 & 0.00 & 85.00 \\\\\n", 347 | "\t ZZEF1 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 7.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 0.00 & 6.00 & 0.00 & 0.00 & 27.00 \\\\\n", 348 | "\t ZZZ3 & 0.00 & 0.00 & 0.00 & 2.00 & 0.00 & 2006.00 & 0.00 & 0.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 23.00 \\\\\n", 349 | "\\end{tabular}\n" 350 | ], 351 | "text/markdown": [ 352 | "\n", 353 | "| V1 | PT089_P1_A01 | PT089_P1_A02 | PT089_P1_A03 | PT089_P1_A04 | PT089_P1_A05 | PT089_P1_A06 | PT089_P1_A07 | PT089_P1_A08 | PT089_P1_A09 | ... | PT039_P10_H03_S279 | PT039_P10_H04_S280 | PT039_P10_H05_S281 | PT039_P10_H06_S282 | PT039_P10_H07_S283 | PT039_P10_H08_S284 | PT039_P10_H09_S285 | PT039_P10_H10_S286 | PT039_P10_H11_S287 | PT039_P10_H12_S288 |\n", 354 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 355 | "| ZXDC | 1.76 | 5.06 | 4.86 | 2.57 | 7.48 | 7.26 | 12.10 | 2.61 | 7.59 | ... 
| 0.00 | 0 | 0.00 | 0 | 1275.78 | 2.03 | 0.00 | 0.00 | 0.00 | 85.87 |\n", 356 | "| ZYG11A | 4.73 | 111.84 | 1.26 | 0.00 | 1.42 | 7.26 | 12.36 | 4.72 | 13.45 | ... | 0.00 | 0 | 0.00 | 0 | 1.02 | 2.18 | 3.85 | 0.00 | 0.00 | 6.15 |\n", 357 | "| ZYG11B | 7.86 | 2.14 | 4.43 | 1.77 | 2.43 | 0.00 | 7.56 | 2.24 | 6.63 | ... | 3.35 | 0 | 1.91 | 0 | 4.12 | 1.74 | 1.02 | 1.12 | 3.72 | 23.32 |\n", 358 | "| ZYX | 0.00 | 0.00 | 1.00 | 0.00 | 937.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 3.00 | 143.00 | 0.00 | 0.00 | 85.00 |\n", 359 | "| ZZEF1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 6.00 | 0.00 | 0.00 | 27.00 |\n", 360 | "| ZZZ3 | 0.00 | 0.00 | 0.00 | 2.00 | 0.00 | 2006.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 23.00 |\n", 361 | "\n" 362 | ], 363 | "text/plain": [ 364 | " V1 PT089_P1_A01 PT089_P1_A02 PT089_P1_A03 PT089_P1_A04 PT089_P1_A05\n", 365 | "1 ZXDC 1.76 5.06 4.86 2.57 7.48 \n", 366 | "2 ZYG11A 4.73 111.84 1.26 0.00 1.42 \n", 367 | "3 ZYG11B 7.86 2.14 4.43 1.77 2.43 \n", 368 | "4 ZYX 0.00 0.00 1.00 0.00 937.00 \n", 369 | "5 ZZEF1 0.00 0.00 0.00 0.00 0.00 \n", 370 | "6 ZZZ3 0.00 0.00 0.00 2.00 0.00 \n", 371 | " PT089_P1_A06 PT089_P1_A07 PT089_P1_A08 PT089_P1_A09 ... PT039_P10_H03_S279\n", 372 | "1 7.26 12.10 2.61 7.59 ... 0.00 \n", 373 | "2 7.26 12.36 4.72 13.45 ... 0.00 \n", 374 | "3 0.00 7.56 2.24 6.63 ... 3.35 \n", 375 | "4 0.00 0.00 0.00 0.00 ... 0.00 \n", 376 | "5 0.00 0.00 7.00 0.00 ... 0.00 \n", 377 | "6 2006.00 0.00 0.00 0.00 ... 
0.00 \n", 378 | " PT039_P10_H04_S280 PT039_P10_H05_S281 PT039_P10_H06_S282 PT039_P10_H07_S283\n", 379 | "1 0 0.00 0 1275.78 \n", 380 | "2 0 0.00 0 1.02 \n", 381 | "3 0 1.91 0 4.12 \n", 382 | "4 0 0.00 0 0.00 \n", 383 | "5 0 0.00 0 0.00 \n", 384 | "6 0 0.00 0 0.00 \n", 385 | " PT039_P10_H08_S284 PT039_P10_H09_S285 PT039_P10_H10_S286 PT039_P10_H11_S287\n", 386 | "1 2.03 0.00 0.00 0.00 \n", 387 | "2 2.18 3.85 0.00 0.00 \n", 388 | "3 1.74 1.02 1.12 3.72 \n", 389 | "4 3.00 143.00 0.00 0.00 \n", 390 | "5 0.00 6.00 0.00 0.00 \n", 391 | "6 0.00 0.00 0.00 0.00 \n", 392 | " PT039_P10_H12_S288\n", 393 | "1 85.87 \n", 394 | "2 6.15 \n", 395 | "3 23.32 \n", 396 | "4 85.00 \n", 397 | "5 27.00 \n", 398 | "6 23.00 " 399 | ] 400 | }, 401 | "metadata": {}, 402 | "output_type": "display_data" 403 | }, 404 | { 405 | "data": { 406 | "text/html": [ 407 | "

21785
1535

\n" 411 | ], 412 | "text/latex": [ 413 | "\\begin{enumerate*}\n", 414 | "\\item 21785\n", 415 | "\\item 1535\n", 416 | "\\end{enumerate*}\n" 417 | ], 418 | "text/markdown": [ 419 | "1. 21785\n", 420 | "2. 1535\n", 421 | "\n", 422 | "\n" 423 | ], 424 | "text/plain": [ 425 | "[1] 21785 1535" 426 | ] 427 | }, 428 | "metadata": {}, 429 | "output_type": "display_data" 430 | }, 431 | { 432 | "data": { 433 | "text/html": [ 434 | "0" 435 | ], 436 | "text/latex": [ 437 | "0" 438 | ], 439 | "text/markdown": [ 440 | "0" 441 | ], 442 | "text/plain": [ 443 | "[1] 0" 444 | ] 445 | }, 446 | "metadata": {}, 447 | "output_type": "display_data" 448 | } 449 | ], 450 | "source": [ 451 | "#data\n", 452 | "TNBC_data=fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_counts_rsem.txt\"))\n", 453 | "tail(TNBC_data)\n", 454 | "dim(TNBC_data)\n", 455 | "sum(duplicated(TNBC_data$V1)) #check for duplicated gene names - there are none" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 4, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "data": { 465 | "text/html": [ 466 | "\n", 467 | "\n", 468 | "\n", 469 | "\t\n", 470 | "\t\n", 471 | "\t\n", 472 | "\t\n", 473 | "\t\n", 474 | "\t\n", 475 | "\n", 476 | "

V1	V2
PT089_P1_A01	epithelial
PT089_P1_A02	epithelial
PT089_P1_A03	epithelial
PT089_P1_A04	macrophage
PT089_P1_A05	macrophage
PT089_P1_A06	epithelial

\n" 477 | ], 478 | "text/latex": [ 479 | "\\begin{tabular}{r|ll}\n", 480 | " V1 & V2\\\\\n", 481 | "\\hline\n", 482 | "\t PT089\\_P1\\_A01 & epithelial \\\\\n", 483 | "\t PT089\\_P1\\_A02 & epithelial \\\\\n", 484 | "\t PT089\\_P1\\_A03 & epithelial \\\\\n", 485 | "\t PT089\\_P1\\_A04 & macrophage \\\\\n", 486 | "\t PT089\\_P1\\_A05 & macrophage \\\\\n", 487 | "\t PT089\\_P1\\_A06 & epithelial \\\\\n", 488 | "\\end{tabular}\n" 489 | ], 490 | "text/markdown": [ 491 | "\n", 492 | "| V1 | V2 |\n", 493 | "|---|---|\n", 494 | "| PT089_P1_A01 | epithelial |\n", 495 | "| PT089_P1_A02 | epithelial |\n", 496 | "| PT089_P1_A03 | epithelial |\n", 497 | "| PT089_P1_A04 | macrophage |\n", 498 | "| PT089_P1_A05 | macrophage |\n", 499 | "| PT089_P1_A06 | epithelial |\n", 500 | "\n" 501 | ], 502 | "text/plain": [ 503 | " V1 V2 \n", 504 | "1 PT089_P1_A01 epithelial\n", 505 | "2 PT089_P1_A02 epithelial\n", 506 | "3 PT089_P1_A03 epithelial\n", 507 | "4 PT089_P1_A04 macrophage\n", 508 | "5 PT089_P1_A05 macrophage\n", 509 | "6 PT089_P1_A06 epithelial" 510 | ] 511 | }, 512 | "metadata": {}, 513 | "output_type": "display_data" 514 | }, 515 | { 516 | "data": { 517 | "text/html": [ 518 | "

1112
2

\n" 522 | ], 523 | "text/latex": [ 524 | "\\begin{enumerate*}\n", 525 | "\\item 1112\n", 526 | "\\item 2\n", 527 | "\\end{enumerate*}\n" 528 | ], 529 | "text/markdown": [ 530 | "1. 1112\n", 531 | "2. 2\n", 532 | "\n", 533 | "\n" 534 | ], 535 | "text/plain": [ 536 | "[1] 1112 2" 537 | ] 538 | }, 539 | "metadata": {}, 540 | "output_type": "display_data" 541 | } 542 | ], 543 | "source": [ 544 | "#annotation\n", 545 | "TNBC_annot=fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_cell_annot.tsv\"))\n", 546 | "TNBC_annot[,V1:=as.character(V1),]\n", 547 | "TNBC_annot[,V2:=as.character(V2),]\n", 548 | "head(TNBC_annot)\n", 549 | "dim(TNBC_annot)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 5, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "name": "stderr", 559 | "output_type": "stream", 560 | "text": [ 561 | "Warning message in melt.data.table(TNBC_data, id.vars = \"V1\"):\n", 562 | "\"'measure.vars' [PT089_P1_A01, PT089_P1_A02, PT089_P1_A03, PT089_P1_A04, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.\"" 563 | ] 564 | }, 565 | { 566 | "data": { 567 | "text/html": [ 568 | "\n", 569 | "\n", 570 | "\n", 571 | "\t\n", 572 | "\t\n", 573 | "\t\n", 574 | "\t\n", 575 | "\t\n", 576 | "\t\n", 577 | "\n", 578 | "

V1	variable	value
A1BG	PT089_P1_A01	0.00
A1BG-AS1	PT089_P1_A01	0.00
A1CF	PT089_P1_A01	0.00
A2M	PT089_P1_A01	0.00
A2M-AS1	PT089_P1_A01	0.00
A2ML1	PT089_P1_A01	1.08

\n" 579 | ], 580 | "text/latex": [ 581 | "\\begin{tabular}{r|lll}\n", 582 | " V1 & variable & value\\\\\n", 583 | "\\hline\n", 584 | "\t A1BG & PT089\\_P1\\_A01 & 0.00 \\\\\n", 585 | "\t A1BG-AS1 & PT089\\_P1\\_A01 & 0.00 \\\\\n", 586 | "\t A1CF & PT089\\_P1\\_A01 & 0.00 \\\\\n", 587 | "\t A2M & PT089\\_P1\\_A01 & 0.00 \\\\\n", 588 | "\t A2M-AS1 & PT089\\_P1\\_A01 & 0.00 \\\\\n", 589 | "\t A2ML1 & PT089\\_P1\\_A01 & 1.08 \\\\\n", 590 | "\\end{tabular}\n" 591 | ], 592 | "text/markdown": [ 593 | "\n", 594 | "| V1 | variable | value |\n", 595 | "|---|---|---|\n", 596 | "| A1BG | PT089_P1_A01 | 0.00 |\n", 597 | "| A1BG-AS1 | PT089_P1_A01 | 0.00 |\n", 598 | "| A1CF | PT089_P1_A01 | 0.00 |\n", 599 | "| A2M | PT089_P1_A01 | 0.00 |\n", 600 | "| A2M-AS1 | PT089_P1_A01 | 0.00 |\n", 601 | "| A2ML1 | PT089_P1_A01 | 1.08 |\n", 602 | "\n" 603 | ], 604 | "text/plain": [ 605 | " V1 variable value\n", 606 | "1 A1BG PT089_P1_A01 0.00 \n", 607 | "2 A1BG-AS1 PT089_P1_A01 0.00 \n", 608 | "3 A1CF PT089_P1_A01 0.00 \n", 609 | "4 A2M PT089_P1_A01 0.00 \n", 610 | "5 A2M-AS1 PT089_P1_A01 0.00 \n", 611 | "6 A2ML1 PT089_P1_A01 1.08 " 612 | ] 613 | }, 614 | "metadata": {}, 615 | "output_type": "display_data" 616 | } 617 | ], 618 | "source": [ 619 | "TNBC_data_long=melt(TNBC_data,id.vars = \"V1\")\n", 620 | "head(TNBC_data_long)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 6, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "TNBC_data_long=merge(TNBC_data_long,TNBC_annot,by.x = \"variable\",by.y=\"V1\")" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 7, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "setnames(TNBC_data_long,\"V2\",\"ClusterName\")" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 8, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/html": [ 649 | "\n", 650 | "\n", 651 | "\n", 652 | "\t\n", 653 | "\t\n", 654 | "\t\n", 655 | 
"\t\n", 656 | "\t\n", 657 | "\t\n", 658 | "\n", 659 | "

V1	ClusterName	value	N
A1BG	epithelial	3.25911290	868
A1BG-AS1	epithelial	4.61316820	868
A1CF	epithelial	0.06591014	868
A2M	epithelial	633.41102535	868
A2M-AS1	epithelial	1.97836406	868
A2ML1	epithelial	20.12418203	868

\n" 660 | ], 661 | "text/latex": [ 662 | "\\begin{tabular}{r|llll}\n", 663 | " V1 & ClusterName & value & N\\\\\n", 664 | "\\hline\n", 665 | "\t A1BG & epithelial & 3.25911290 & 868 \\\\\n", 666 | "\t A1BG-AS1 & epithelial & 4.61316820 & 868 \\\\\n", 667 | "\t A1CF & epithelial & 0.06591014 & 868 \\\\\n", 668 | "\t A2M & epithelial & 633.41102535 & 868 \\\\\n", 669 | "\t A2M-AS1 & epithelial & 1.97836406 & 868 \\\\\n", 670 | "\t A2ML1 & epithelial & 20.12418203 & 868 \\\\\n", 671 | "\\end{tabular}\n" 672 | ], 673 | "text/markdown": [ 674 | "\n", 675 | "| V1 | ClusterName | value | N |\n", 676 | "|---|---|---|---|\n", 677 | "| A1BG | epithelial | 3.25911290 | 868 |\n", 678 | "| A1BG-AS1 | epithelial | 4.61316820 | 868 |\n", 679 | "| A1CF | epithelial | 0.06591014 | 868 |\n", 680 | "| A2M | epithelial | 633.41102535 | 868 |\n", 681 | "| A2M-AS1 | epithelial | 1.97836406 | 868 |\n", 682 | "| A2ML1 | epithelial | 20.12418203 | 868 |\n", 683 | "\n" 684 | ], 685 | "text/plain": [ 686 | " V1 ClusterName value N \n", 687 | "1 A1BG epithelial 3.25911290 868\n", 688 | "2 A1BG-AS1 epithelial 4.61316820 868\n", 689 | "3 A1CF epithelial 0.06591014 868\n", 690 | "4 A2M epithelial 633.41102535 868\n", 691 | "5 A2M-AS1 epithelial 1.97836406 868\n", 692 | "6 A2ML1 epithelial 20.12418203 868" 693 | ] 694 | }, 695 | "metadata": {}, 696 | "output_type": "display_data" 697 | } 698 | ], 699 | "source": [ 700 | "tnbc_norm=TNBC_data_long[,.(value=mean(value),N=.N),by=c(\"V1\",\"ClusterName\")]\n", 701 | "head(tnbc_norm)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 11, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "data": { 711 | "text/html": [ 712 | "\n", 713 | "\n", 714 | "\n", 715 | "\t\n", 716 | "\t\n", 717 | "\t\n", 718 | "\t\n", 719 | "\t\n", 720 | "\t\n", 721 | "\n", 722 | "

gene	ClusterName	value	N	norm	log_norm
ZXDC	Tcell	20.228302	53	3.017484e-05	-10.408502
ZYG11A	Tcell	3.791887	53	5.656409e-06	-12.082721
ZYG11B	Tcell	20.612264	53	3.074760e-05	-10.389699
ZYX	Tcell	124.452830	53	1.856480e-04	-8.591658
ZZEF1	Tcell	91.452830	53	1.364214e-04	-8.899762
ZZZ3	Tcell	55.037736	53	8.210054e-05	-9.407566

\n" 723 | ], 724 | "text/latex": [ 725 | "\\begin{tabular}{r|llllll}\n", 726 | " gene & ClusterName & value & N & norm & log\\_norm\\\\\n", 727 | "\\hline\n", 728 | "\t ZXDC & Tcell & 20.228302 & 53 & 3.017484e-05 & -10.408502 \\\\\n", 729 | "\t ZYG11A & Tcell & 3.791887 & 53 & 5.656409e-06 & -12.082721 \\\\\n", 730 | "\t ZYG11B & Tcell & 20.612264 & 53 & 3.074760e-05 & -10.389699 \\\\\n", 731 | "\t ZYX & Tcell & 124.452830 & 53 & 1.856480e-04 & -8.591658 \\\\\n", 732 | "\t ZZEF1 & Tcell & 91.452830 & 53 & 1.364214e-04 & -8.899762 \\\\\n", 733 | "\t ZZZ3 & Tcell & 55.037736 & 53 & 8.210054e-05 & -9.407566 \\\\\n", 734 | "\\end{tabular}\n" 735 | ], 736 | "text/markdown": [ 737 | "\n", 738 | "| gene | ClusterName | value | N | norm | log_norm |\n", 739 | "|---|---|---|---|---|---|\n", 740 | "| ZXDC | Tcell | 20.228302 | 53 | 3.017484e-05 | -10.408502 |\n", 741 | "| ZYG11A | Tcell | 3.791887 | 53 | 5.656409e-06 | -12.082721 |\n", 742 | "| ZYG11B | Tcell | 20.612264 | 53 | 3.074760e-05 | -10.389699 |\n", 743 | "| ZYX | Tcell | 124.452830 | 53 | 1.856480e-04 | -8.591658 |\n", 744 | "| ZZEF1 | Tcell | 91.452830 | 53 | 1.364214e-04 | -8.899762 |\n", 745 | "| ZZZ3 | Tcell | 55.037736 | 53 | 8.210054e-05 | -9.407566 |\n", 746 | "\n" 747 | ], 748 | "text/plain": [ 749 | " gene ClusterName value N norm log_norm \n", 750 | "1 ZXDC Tcell 20.228302 53 3.017484e-05 -10.408502\n", 751 | "2 ZYG11A Tcell 3.791887 53 5.656409e-06 -12.082721\n", 752 | "3 ZYG11B Tcell 20.612264 53 3.074760e-05 -10.389699\n", 753 | "4 ZYX Tcell 124.452830 53 1.856480e-04 -8.591658\n", 754 | "5 ZZEF1 Tcell 91.452830 53 1.364214e-04 -8.899762\n", 755 | "6 ZZZ3 Tcell 55.037736 53 8.210054e-05 -9.407566" 756 | ] 757 | }, 758 | "metadata": {}, 759 | "output_type": "display_data" 760 | } 761 | ], 762 | "source": [ 763 | "tnbc_norm[,norm:=value/sum(value),by=\"ClusterName\"]\n", 764 | "tnbc_norm[,log_norm:=log(norm),]\n", 765 | "setnames(tnbc_norm,\"V1\",\"gene\")\n", 766 | "tail(tnbc_norm)" 767 | ] 768 | }, 
769 | { 770 | "cell_type": "code", 771 | "execution_count": 12, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "data": { 776 | "text/html": [ 777 | "6" 778 | ], 779 | "text/latex": [ 780 | "6" 781 | ], 782 | "text/markdown": [ 783 | "6" 784 | ], 785 | "text/plain": [ 786 | "[1] 6" 787 | ] 788 | }, 789 | "metadata": {}, 790 | "output_type": "display_data" 791 | } 792 | ], 793 | "source": [ 794 | "N_ct=length(unique(tnbc_norm$ClusterName))\n", 795 | "N_ct" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 13, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "tnbc_norm[,N_zero_ct:=length(unique(ClusterName[norm==0])),by=gene]" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 14, 810 | "metadata": {}, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "text/html": [ 815 | "19595" 816 | ], 817 | "text/latex": [ 818 | "19595" 819 | ], 820 | "text/markdown": [ 821 | "19595" 822 | ], 823 | "text/plain": [ 824 | "[1] 19595" 825 | ] 826 | }, 827 | "metadata": {}, 828 | "output_type": "display_data" 829 | } 830 | ], 831 | "source": [ 832 | "Ngenes=length(unique(tnbc_norm[N_zero_ct= 530){\n", 49 | " y=y-529\n", 50 | " }\n", 51 | " return(paste0(x,\"x\",y))\n", 52 | " }\n", 53 | " if (file.exists(paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"))){\n", 54 | " dat_long_filt_annot_red=fread(paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"))\n", 55 | " pl=ggplot(dat_long_filt_annot_red[sample(1:nrow(dat_long_filt_annot_red),40000)],aes(y=spot_px_y,x=spot_px_x))+geom_point(size=0.5)+coord_fixed()\n", 56 | " png(paste0(dir,\"/\",sample,\"_filtered_corrected.png\"),height=500,width=700)\n", 57 | " print(pl)\n", 58 | " dev.off()\n", 59 | " return(dat_long_filt_annot_red)\n", 60 | " }\n", 61 | " \n", 62 | " dat=fread(paste0(dir,\"/\",sample,\"_filtered.tsv.gz\"))\n", 63 | " sel_bc=fread(paste0(dir,\"/\",sub(\"_unmodgtf|_modgtf\",\"\",sample),\"_barcodes_under_tissue.tsv\"))\n", 64 | " if 
(any(grepl(\"ENS\",dat$V1))){\n", 65 | " message(\"Running in transpose mode\")\n", 66 | " chunks=unique(c(seq(from=2,to=ncol(dat),by=1000),ncol(dat)))\n", 67 | " steps=as.data.table(cbind(from=chunks[-length(chunks)],to=chunks[-1]-1))\n", 68 | " steps[nrow(steps),to:=to+1,]\n", 69 | " \n", 70 | " dat_long=data.table()\n", 71 | " for (i in 1:nrow(steps)){\n", 72 | " cat(paste0(i,\" \"))\n", 73 | " from=steps[i]$from\n", 74 | " to=steps[i]$to\n", 75 | " dat_long_tmp=melt(dat[,c(1,from:to),with=FALSE],id.vars=c(\"V1\"),variable.name = \"bc_old\")\n", 76 | " dat_long_tmp[,bc:=adjust.y(bc_old[1]),by=bc_old]\n", 77 | " dat_long=rbindlist(list(dat_long,dat_long_tmp[value>0]))\n", 78 | " }\n", 79 | " setnames(dat_long,c(\"V1\",\"value\"),c(\"ensGV\",\"count\"))\n", 80 | " setcolorder(dat_long,c(\"bc_old\",\"bc\",\"ensGV\",\"count\"))\n", 81 | "\n", 82 | " }else{\n", 83 | " dat[,V2:=adjust.y(V1),by=1:nrow(dat)]\n", 84 | " chunks=unique(c(seq(from=1,to=nrow(dat),by=1000),nrow(dat)))\n", 85 | " steps=as.data.table(cbind(from=chunks[-length(chunks)],to=chunks[-1]-1))\n", 86 | " steps[nrow(steps),to:=to+1,]\n", 87 | " \n", 88 | " dat_long=data.table()\n", 89 | " for (i in 1:nrow(steps)){\n", 90 | " cat(paste0(i,\" \"))\n", 91 | " from=steps[i]$from\n", 92 | " to=steps[i]$to\n", 93 | " dat_long_tmp=melt(dat[from:to,],id.vars=c(\"V1\",\"V2\"))\n", 94 | " dat_long=rbindlist(list(dat_long,dat_long_tmp[value>0]))\n", 95 | " }\n", 96 | " setnames(dat_long,names(dat_long),c(\"bc_old\",\"bc\",\"ensGV\",\"count\"))\n", 97 | " }\n", 98 | " \n", 99 | " dat_long_filt=dat_long[!grepl(\"\\\\+\",ensGV)]\n", 100 | " \n", 101 | " dat_long_filt[,ensG:=unlist(strsplit(as.character(ensGV),\"\\\\.\"))[1],by=1:nrow(dat_long_filt)]\n", 102 | " dat_long_filt[,x:=as.numeric(unlist(strsplit(as.character(bc),\"x\"))[1]),by=1:nrow(dat_long_filt)]\n", 103 | " dat_long_filt[,y:=as.numeric(unlist(strsplit(as.character(bc),\"x\"))[2]),by=1:nrow(dat_long_filt)]\n", 104 | " \n", 105 | " 
dat_long_filt_annot=merge(dat_long_filt,genes,by=\"ensG\",all.x=TRUE)\n", 106 | " dat_long_filt_annot_red=merge(dat_long_filt_annot,sel_bc,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))\n", 107 | " \n", 108 | " write.table(dat_long_filt_annot,paste0(dir,\"/\",sample,\"_filtered_red.tsv\"),sep=\"\\t\",row.names=FALSE,quote=FALSE)\n", 109 | " write.table(dat_long_filt_annot_red,paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"),sep=\"\\t\",row.names=FALSE,quote=FALSE)\n", 110 | " if (plot == TRUE){\n", 111 | " pl=ggplot(dat_long_filt_annot_red[sample(1:nrow(dat_long_filt_annot_red),40000)],aes(y=spot_px_y,x=spot_px_x))+geom_point(size=0.5)+coord_fixed()\n", 112 | " png(paste0(dir,\"/\",sample,\"_filtered_corrected.png\"),height=500,width=700)\n", 113 | " print(pl)\n", 114 | " dev.off()\n", 115 | " }\n", 116 | " return(dat_long_filt_annot_red)\n", 117 | "}" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "##### MOB data" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "genes=fread(\"ext_data/ensemble_gene_names_V94.txt\")\n", 134 | "setnames(genes,names(genes),c(\"ensGV\",\"ensG\",\"gene\"))" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 22, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "dat=prep_hdst_data(\"MOB\",\"CN13_D2\",genes,TRUE)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 21, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "dat=prep_hdst_data(\"MOB\",\"CN24_D1\",genes,TRUE)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "dat=prep_hdst_data(\"MOB\",\"CN24_E1\",genes,TRUE)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 
169 | "source": [ 170 | "dat=prep_hdst_data(\"MOB_nc\",\"CN13_D2_unmodgtf\",genes,TRUE)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "dat=prep_hdst_data(\"MOB_nc\",\"CN24_D1_unmodgtf\",genes,TRUE)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "dat=prep_hdst_data(\"MOB_nc\",\"CN24_E1_unmodgtf\",genes,TRUE)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "##### Breast cancer data" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "genes=fread(\"ext_data/ensemble_gene_names_human_V96.txt\")\n", 205 | "setnames(genes,names(genes),c(\"ensGV\",\"ensG\",\"gene\"))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "dat=prep_hdst_data(\"BC\",\"CN21_BC24350_E2\",genes,TRUE)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "dat=prep_hdst_data(\"BC_nc/\",\"CN21_BC24350_E2_unmodgtf\",genes,TRUE)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Process segmented HDST data" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# for MOB (standard gtf)\n", 240 | "dat=fread(\"MOB_nc/CN13_D2_filtered_red_ut.tsv\")\n", 241 | "seg=fread(\"MOB/CellID_Spot_Position_CN13_D2_filtered_red_ut.csv\")\n", 242 | "tag=\"CN13_D2\"\n", 243 | "dir=\"MOB\"" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | 
"source": [ 252 | "# for MOB (nc gtf)\n", 253 | "dat=fread(\"MOB_nc/CN13_D2_unmodgtf_filtered_red_ut.tsv\")\n", 254 | "seg=fread(\"MOB/CellID_Spot_Position_CN13_D2_filtered_red_ut.csv\")\n", 255 | "tag=\"CN13_D2_unmodgtf\"\n", 256 | "dir=\"MOB_nc\"" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 10, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# for MOB (nc gtf)\n", 266 | "dat=fread(\"MOB_nc/CN24_D1_unmodgtf_filtered_red_ut.tsv\")\n", 267 | "seg=fread(\"MOB_nc/CellID_Spot_Position_CN24_D1_unmodgtf_filtered_red_ut_flipped.csv\")\n", 268 | "tag=\"CN24_D1_unmodgtf\"\n", 269 | "dir=\"MOB_nc\"" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 14, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "# for MOB (nc gtf)\n", 279 | "dat=fread(\"MOB_nc/CN24_E1_unmodgtf_filtered_red_ut.tsv\")\n", 280 | "seg=fread(\"MOB_nc/CellID_Spot_Position_CN24_E1_unmodgtf_filtered_red_ut_flipped.csv\")\n", 281 | "tag=\"CN24_E1_unmodgtf\"\n", 282 | "dir=\"MOB_nc\"" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "# for BC (nc gtf)\n", 292 | "dat=fread(\"BC_nc/CN21_BC24350_E2_unmodgtf_filtered_red_ut.tsv\")\n", 293 | "seg=fread(\"BC/CellID_Spot_Position_CN21_E2_filtered_red_ut_BC_flipped.csv\")\n", 294 | "tag=\"CN21_BC24350_E2_unmodgtf\"\n", 295 | "dir=\"BC_nc\" #previously stored in BC" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "#### here run" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 15, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/html": [ 313 | "\n", 314 | "\n", 315 | "\n", 316 | "\t\n", 317 | "\t\n", 318 | "\t\n", 319 | "\t\n", 320 | "\t\n", 321 | "\t\n", 322 | "\n", 323 | "

bc	x	y	ensG	bc_old	ensGV.x	count	ensGV.y	gene	spot_px_y	spot_px_x	cell_id	x_centroid	y_centroid	N_bc
1000x187	1000	187	ENSMUSG00000002985	1000x716	ENSMUSG00000002985.16	1	ENSMUSG00000002985.16	Apoe	2295	10077	25146	10067.29	5306.808	10
1000x187	1000	187	ENSMUSG00000020193	1000x716	ENSMUSG00000020193.3	1	ENSMUSG00000020193.3	Zpbp	2295	10077	25146	10067.29	5306.808	10
1000x187	1000	187	ENSMUSG00000020483	1000x716	ENSMUSG00000020483.14	1	ENSMUSG00000020483.14	Dynll2	2295	10077	25146	10067.29	5306.808	10
1000x187	1000	187	ENSMUSG00000025907	1000x716	ENSMUSG00000025907.14	1	ENSMUSG00000025907.14	Rb1cc1	2295	10077	25146	10067.29	5306.808	10
1000x187	1000	187	ENSMUSG00000029635	1000x716	ENSMUSG00000029635.15	1	ENSMUSG00000029635.15	Cdk8	2295	10077	25146	10067.29	5306.808	10
1000x187	1000	187	ENSMUSG00000035202	1000x716	ENSMUSG00000035202.7	1	ENSMUSG00000035202.8	Lars2	2295	10077	25146	10067.29	5306.808	10

\n" 324 | ], 325 | "text/latex": [ 326 | "\\begin{tabular}{r|lllllllllllllll}\n", 327 | " bc & x & y & ensG & bc\\_old & ensGV.x & count & ensGV.y & gene & spot\\_px\\_y & spot\\_px\\_x & cell\\_id & x\\_centroid & y\\_centroid & N\\_bc\\\\\n", 328 | "\\hline\n", 329 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000002985 & 1000x716 & ENSMUSG00000002985.16 & 1 & ENSMUSG00000002985.16 & Apoe & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 330 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000020193 & 1000x716 & ENSMUSG00000020193.3 & 1 & ENSMUSG00000020193.3 & Zpbp & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 331 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000020483 & 1000x716 & ENSMUSG00000020483.14 & 1 & ENSMUSG00000020483.14 & Dynll2 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 332 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000025907 & 1000x716 & ENSMUSG00000025907.14 & 1 & ENSMUSG00000025907.14 & Rb1cc1 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 333 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000029635 & 1000x716 & ENSMUSG00000029635.15 & 1 & ENSMUSG00000029635.15 & Cdk8 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 334 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000035202 & 1000x716 & ENSMUSG00000035202.7 & 1 & ENSMUSG00000035202.8 & Lars2 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n", 335 | "\\end{tabular}\n" 336 | ], 337 | "text/markdown": [ 338 | "\n", 339 | "| bc | x | y | ensG | bc_old | ensGV.x | count | ensGV.y | gene | spot_px_y | spot_px_x | cell_id | x_centroid | y_centroid | N_bc |\n", 340 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", 341 | "| 1000x187 | 1000 | 187 | ENSMUSG00000002985 | 1000x716 | ENSMUSG00000002985.16 | 1 | ENSMUSG00000002985.16 | Apoe | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 342 | "| 1000x187 | 1000 | 187 | ENSMUSG00000020193 | 1000x716 | ENSMUSG00000020193.3 | 1 | ENSMUSG00000020193.3 | Zpbp | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 
343 | "| 1000x187 | 1000 | 187 | ENSMUSG00000020483 | 1000x716 | ENSMUSG00000020483.14 | 1 | ENSMUSG00000020483.14 | Dynll2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 344 | "| 1000x187 | 1000 | 187 | ENSMUSG00000025907 | 1000x716 | ENSMUSG00000025907.14 | 1 | ENSMUSG00000025907.14 | Rb1cc1 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 345 | "| 1000x187 | 1000 | 187 | ENSMUSG00000029635 | 1000x716 | ENSMUSG00000029635.15 | 1 | ENSMUSG00000029635.15 | Cdk8 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 346 | "| 1000x187 | 1000 | 187 | ENSMUSG00000035202 | 1000x716 | ENSMUSG00000035202.7 | 1 | ENSMUSG00000035202.8 | Lars2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n", 347 | "\n" 348 | ], 349 | "text/plain": [ 350 | " bc x y ensG bc_old ensGV.x count\n", 351 | "1 1000x187 1000 187 ENSMUSG00000002985 1000x716 ENSMUSG00000002985.16 1 \n", 352 | "2 1000x187 1000 187 ENSMUSG00000020193 1000x716 ENSMUSG00000020193.3 1 \n", 353 | "3 1000x187 1000 187 ENSMUSG00000020483 1000x716 ENSMUSG00000020483.14 1 \n", 354 | "4 1000x187 1000 187 ENSMUSG00000025907 1000x716 ENSMUSG00000025907.14 1 \n", 355 | "5 1000x187 1000 187 ENSMUSG00000029635 1000x716 ENSMUSG00000029635.15 1 \n", 356 | "6 1000x187 1000 187 ENSMUSG00000035202 1000x716 ENSMUSG00000035202.7 1 \n", 357 | " ensGV.y gene spot_px_y spot_px_x cell_id x_centroid\n", 358 | "1 ENSMUSG00000002985.16 Apoe 2295 10077 25146 10067.29 \n", 359 | "2 ENSMUSG00000020193.3 Zpbp 2295 10077 25146 10067.29 \n", 360 | "3 ENSMUSG00000020483.14 Dynll2 2295 10077 25146 10067.29 \n", 361 | "4 ENSMUSG00000025907.14 Rb1cc1 2295 10077 25146 10067.29 \n", 362 | "5 ENSMUSG00000029635.15 Cdk8 2295 10077 25146 10067.29 \n", 363 | "6 ENSMUSG00000035202.8 Lars2 2295 10077 25146 10067.29 \n", 364 | " y_centroid N_bc\n", 365 | "1 5306.808 10 \n", 366 | "2 5306.808 10 \n", 367 | "3 5306.808 10 \n", 368 | "4 5306.808 10 \n", 369 | "5 5306.808 10 \n", 370 | "6 5306.808 10 " 371 | ] 372 | }, 373 | "metadata": {}, 374 | 
"output_type": "display_data" 375 | }, 376 | { 377 | "data": { 378 | "text/html": [ 379 | "856994" 380 | ], 381 | "text/latex": [ 382 | "856994" 383 | ], 384 | "text/markdown": [ 385 | "856994" 386 | ], 387 | "text/plain": [ 388 | "[1] 856994" 389 | ] 390 | }, 391 | "metadata": {}, 392 | "output_type": "display_data" 393 | } 394 | ], 395 | "source": [ 396 | "dat_seg_pre=merge(dat,seg[cell_id!=0],by=\"bc\")\n", 397 | "dat_seg_pre[,c(\"x\",\"y\",\"N_bc\"):=list(x[1],y[1],length(unique(bc))),by=\"cell_id\"] #assign the x y coordinates of the first barcode to each cell id such that each cell id only has 1 xy coordinate\n", 398 | "head(dat_seg_pre)\n", 399 | "nrow(dat_seg_pre)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 16, 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/html": [ 410 | "\n", 411 | "\n", 412 | "\n", 413 | "\t\n", 414 | "\t\n", 415 | "\t\n", 416 | "\t\n", 417 | "\t\n", 418 | "\t\n", 419 | "\n", 420 | "

cell_id	x_centroid	y_centroid	N_bc	gene	x	y	bc	count	spot_px_y	spot_px_x
25146	10067.29	5306.808	10	Apoe	1000	187	1000x187	1	2295	10077
25146	10067.29	5306.808	10	Zpbp	1000	187	1000x187	1	2295	10077
25146	10067.29	5306.808	10	Dynll2	1000	187	1000x187	1	2295	10077
25146	10067.29	5306.808	10	Rb1cc1	1000	187	1000x187	1	2295	10077
25146	10067.29	5306.808	10	Cdk8	1000	187	1000x187 996x186 999x189	3	2295	10077
25146	10067.29	5306.808	10	Lars2	1000	187	1000x187	1	2295	10077

\n" 421 | ], 422 | "text/latex": [ 423 | "\\begin{tabular}{r|lllllllllll}\n", 424 | " cell\\_id & x\\_centroid & y\\_centroid & N\\_bc & gene & x & y & bc & count & spot\\_px\\_y & spot\\_px\\_x\\\\\n", 425 | "\\hline\n", 426 | "\t 25146 & 10067.29 & 5306.808 & 10 & Apoe & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n", 427 | "\t 25146 & 10067.29 & 5306.808 & 10 & Zpbp & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n", 428 | "\t 25146 & 10067.29 & 5306.808 & 10 & Dynll2 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n", 429 | "\t 25146 & 10067.29 & 5306.808 & 10 & Rb1cc1 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n", 430 | "\t 25146 & 10067.29 & 5306.808 & 10 & Cdk8 & 1000 & 187 & 1000x187 996x186 999x189 & 3 & 2295 & 10077 \\\\\n", 431 | "\t 25146 & 10067.29 & 5306.808 & 10 & Lars2 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n", 432 | "\\end{tabular}\n" 433 | ], 434 | "text/markdown": [ 435 | "\n", 436 | "| cell_id | x_centroid | y_centroid | N_bc | gene | x | y | bc | count | spot_px_y | spot_px_x |\n", 437 | "|---|---|---|---|---|---|---|---|---|---|---|\n", 438 | "| 25146 | 10067.29 | 5306.808 | 10 | Apoe | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n", 439 | "| 25146 | 10067.29 | 5306.808 | 10 | Zpbp | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n", 440 | "| 25146 | 10067.29 | 5306.808 | 10 | Dynll2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n", 441 | "| 25146 | 10067.29 | 5306.808 | 10 | Rb1cc1 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n", 442 | "| 25146 | 10067.29 | 5306.808 | 10 | Cdk8 | 1000 | 187 | 1000x187 996x186 999x189 | 3 | 2295 | 10077 |\n", 443 | "| 25146 | 10067.29 | 5306.808 | 10 | Lars2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n", 444 | "\n" 445 | ], 446 | "text/plain": [ 447 | " cell_id x_centroid y_centroid N_bc gene x y bc \n", 448 | "1 25146 10067.29 5306.808 10 Apoe 1000 187 1000x187 \n", 449 | "2 25146 10067.29 5306.808 10 Zpbp 1000 187 1000x187 \n", 450 | "3 25146 10067.29 5306.808 10 Dynll2 1000 187 1000x187 \n", 451 
| "4 25146 10067.29 5306.808 10 Rb1cc1 1000 187 1000x187 \n", 452 | "5 25146 10067.29 5306.808 10 Cdk8 1000 187 1000x187 996x186 999x189\n", 453 | "6 25146 10067.29 5306.808 10 Lars2 1000 187 1000x187 \n", 454 | " count spot_px_y spot_px_x\n", 455 | "1 1 2295 10077 \n", 456 | "2 1 2295 10077 \n", 457 | "3 1 2295 10077 \n", 458 | "4 1 2295 10077 \n", 459 | "5 3 2295 10077 \n", 460 | "6 1 2295 10077 " 461 | ] 462 | }, 463 | "metadata": {}, 464 | "output_type": "display_data" 465 | }, 466 | { 467 | "data": { 468 | "text/html": [ 469 | "22229" 470 | ], 471 | "text/latex": [ 472 | "22229" 473 | ], 474 | "text/markdown": [ 475 | "22229" 476 | ], 477 | "text/plain": [ 478 | "[1] 22229" 479 | ] 480 | }, 481 | "metadata": {}, 482 | "output_type": "display_data" 483 | }, 484 | { 485 | "data": { 486 | "text/html": [ 487 | "684443" 488 | ], 489 | "text/latex": [ 490 | "684443" 491 | ], 492 | "text/markdown": [ 493 | "684443" 494 | ], 495 | "text/plain": [ 496 | "[1] 684443" 497 | ] 498 | }, 499 | "metadata": {}, 500 | "output_type": "display_data" 501 | }, 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "\n", 506 | " 1 \n", 507 | "22229 " 508 | ] 509 | }, 510 | "metadata": {}, 511 | "output_type": "display_data" 512 | }, 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "\n", 517 | " 1 \n", 518 | "22229 " 519 | ] 520 | }, 521 | "metadata": {}, 522 | "output_type": "display_data" 523 | } 524 | ], 525 | "source": [ 526 | "dat_seg=dat_seg_pre[,.(bc=paste0(bc,collapse = \" \"),count=sum(count),spot_px_y=spot_px_y[1],spot_px_x=spot_px_x[1]),by=c(\"cell_id\",\"x_centroid\",\"y_centroid\",\"N_bc\",\"gene\",\"x\",\"y\")]\n", 527 | "head(dat_seg)\n", 528 | "length(unique(dat_seg$cell_id))\n", 529 | "nrow(dat_seg)\n", 530 | "table(dat_seg[,length(unique(cell_id)),by=c(\"x\",\"y\")]$V1) #make sure each coordinate only has one cell id\n", 531 | "table(dat_seg[,length(unique(paste0(x,\"_\",y))),by=c(\"cell_id\")]$V1) #make sure each cell id only has one coordinate" 532 | ] 533 | }, 
534 | { 535 | "cell_type": "code", 536 | "execution_count": 17, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "write.table(dat_seg,paste0(dir,\"/\",tag,\"_filtered_red_ut_segmented.tsv\"),sep=\"\\t\",quote=FALSE,row.names = FALSE)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "### Process binned HDST data" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 2, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "#for MOB (standard gtf)\n", 557 | "bin_sizes=c(\"5x\",\"10x\",\"20x\",\"38x\",\"38x-thin\")\n", 558 | "dir=\"MOB_binned\"" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "#for MOB (nc gtf)\n", 568 | "bin_sizes=c(\"5x\")\n", 569 | "dir=\"MOB_binned_nc\"" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 3, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "#for MOB (nc gtf) new sample\n", 579 | "bin_sizes=c(\"5x\")\n", 580 | "dir=\"MOB_binned_nc/E1\"" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 7, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "#for MOB (nc gtf) new sample\n", 590 | "bin_sizes=c(\"5x\")\n", 591 | "dir=\"MOB_binned_nc/D1\"" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 2, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "#for BC (standard gtf)\n", 601 | "bin_sizes=c(\"5x\")\n", 602 | "dir=\"BC_binned\"" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 2, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "#for BC (nc gtf)\n", 612 | "bin_sizes=c(\"5x\")\n", 613 | "dir=\"BC_binned_nc\"" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 3, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "#for BC 
(nc gtf)\n", 623 | "bin_sizes=c(\"5x\")\n", 624 | "dir=\"BC_binned_nc/C1\"" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 7, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "#for BC (nc gtf)\n", 634 | "bin_sizes=c(\"5x\")\n", 635 | "dir=\"BC_binned_nc/D1\"" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 8, 641 | "metadata": {}, 642 | "outputs": [ 643 | { 644 | "name": "stdout", 645 | "output_type": "stream", 646 | "text": [ 647 | "[1] \"BC_binned_nc/D1/hdst-breast-cancer-D1-lowres-5x.csv.gz\"\n" 648 | ] 649 | } 650 | ], 651 | "source": [ 652 | "dat_all_bin=data.table()\n", 653 | "for (bin in bin_sizes){\n", 654 | " dat_file=list.files(path = dir,pattern = paste0(\"hdst.*lowres-\",bin,\".csv.*\"),full.names = TRUE)\n", 655 | " bins_file=list.files(path = dir,pattern = paste0(\"hdst.*lowres-\",bin,\"-bins.csv.*\"),full.names = TRUE)\n", 656 | " print(dat_file)\n", 657 | " dat=fread(dat_file)\n", 658 | " bins=fread(bins_file)\n", 659 | " dat[,x:=bins$x,]\n", 660 | " dat[,y:=bins$y,]\n", 661 | " dat_long=melt(dat,id.vars = c(\"x\",\"y\"),variable.name = \"gene\",value.name = \"count\")\n", 662 | " dat_long=dat_long[count>0]\n", 663 | " dat_long[,bin:=bin,]\n", 664 | " dat_long[,bc:=paste0(x,\"_\",y),]\n", 665 | " dat_all_bin=rbindlist(list(dat_all_bin,dat_long))\n", 666 | "}" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 9, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/html": [ 677 | "\n", 678 | "\n", 679 | "\n", 680 | "\t\n", 681 | "\t\n", 682 | "\t\n", 683 | "\t\n", 684 | "\t\n", 685 | "\t\n", 686 | "\n", 687 | "

x	y	gene	count	bin	bc
241	67	TSPAN6	1	5x	241_67
76	134	DPM1	1	5x	76_134
94	80	DPM1	1	5x	94_80
142	155	DPM1	1	5x	142_155
144	156	DPM1	1	5x	144_156
185	16	DPM1	1	5x	185_16

\n" 688 | ], 689 | "text/latex": [ 690 | "\\begin{tabular}{r|llllll}\n", 691 | " x & y & gene & count & bin & bc\\\\\n", 692 | "\\hline\n", 693 | "\t 241 & 67 & TSPAN6 & 1 & 5x & 241\\_67 \\\\\n", 694 | "\t 76 & 134 & DPM1 & 1 & 5x & 76\\_134 \\\\\n", 695 | "\t 94 & 80 & DPM1 & 1 & 5x & 94\\_80 \\\\\n", 696 | "\t 142 & 155 & DPM1 & 1 & 5x & 142\\_155\\\\\n", 697 | "\t 144 & 156 & DPM1 & 1 & 5x & 144\\_156\\\\\n", 698 | "\t 185 & 16 & DPM1 & 1 & 5x & 185\\_16 \\\\\n", 699 | "\\end{tabular}\n" 700 | ], 701 | "text/markdown": [ 702 | "\n", 703 | "| x | y | gene | count | bin | bc |\n", 704 | "|---|---|---|---|---|---|\n", 705 | "| 241 | 67 | TSPAN6 | 1 | 5x | 241_67 |\n", 706 | "| 76 | 134 | DPM1 | 1 | 5x | 76_134 |\n", 707 | "| 94 | 80 | DPM1 | 1 | 5x | 94_80 |\n", 708 | "| 142 | 155 | DPM1 | 1 | 5x | 142_155 |\n", 709 | "| 144 | 156 | DPM1 | 1 | 5x | 144_156 |\n", 710 | "| 185 | 16 | DPM1 | 1 | 5x | 185_16 |\n", 711 | "\n" 712 | ], 713 | "text/plain": [ 714 | " x y gene count bin bc \n", 715 | "1 241 67 TSPAN6 1 5x 241_67 \n", 716 | "2 76 134 DPM1 1 5x 76_134 \n", 717 | "3 94 80 DPM1 1 5x 94_80 \n", 718 | "4 142 155 DPM1 1 5x 142_155\n", 719 | "5 144 156 DPM1 1 5x 144_156\n", 720 | "6 185 16 DPM1 1 5x 185_16 " 721 | ] 722 | }, 723 | "metadata": {}, 724 | "output_type": "display_data" 725 | } 726 | ], 727 | "source": [ 728 | "head(dat_all_bin)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 10, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "write.table(dat_all_bin,paste0(dir,\"/hdst-lowres.tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "### Process standard ST" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 90, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "prep_st=function(mat){\n", 754 | " mat[,x:=unlist(strsplit(V1,\"x\"))[1],by=1:nrow(mat)]\n", 755 | " 
mat[,y:=unlist(strsplit(V1,\"x\"))[2],by=1:nrow(mat)]\n", 756 | " mat_long=melt(mat[,-c(\"V1\"),],id.vars = c(\"x\",\"y\"),variable.name = \"gene\",value.name = \"count\")\n", 757 | " return(mat_long[count>0])\n", 758 | "}" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": 91, 764 | "metadata": {}, 765 | "outputs": [ 766 | { 767 | "name": "stderr", 768 | "output_type": "stream", 769 | "text": [ 770 | "Warning message in fread(\"MOB_STST/Rep4_MOB1x.csv\"):\n", 771 | "\"Detected 15941 column names but the data has 15942 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.\"" 772 | ] 773 | } 774 | ], 775 | "source": [ 776 | "st_dat=fread(\"MOB_STST/Rep4_MOB1x.csv\")" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 92, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [ 785 | "st_long=prep_st(st_dat)" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 94, 791 | "metadata": {}, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/html": [ 796 | "\n", 797 | "\n", 798 | "\n", 799 | "\t\n", 800 | "\t\n", 801 | "\t\n", 802 | "\t\n", 803 | "\t\n", 804 | "\t\n", 805 | "\n", 806 | "

x	y	gene	count
16.105	29.003	Mx1	1
16.105	29.003	Cenpa	1
16.105	29.003	Snora17	1
16.984	29.022	Nlrp5	1
16.984	29.022	Sned1	1
16.984	29.022	Gm933	1

\n" 807 | ], 808 | "text/latex": [ 809 | "\\begin{tabular}{r|llll}\n", 810 | " x & y & gene & count\\\\\n", 811 | "\\hline\n", 812 | "\t 16.105 & 29.003 & Mx1 & 1 \\\\\n", 813 | "\t 16.105 & 29.003 & Cenpa & 1 \\\\\n", 814 | "\t 16.105 & 29.003 & Snora17 & 1 \\\\\n", 815 | "\t 16.984 & 29.022 & Nlrp5 & 1 \\\\\n", 816 | "\t 16.984 & 29.022 & Sned1 & 1 \\\\\n", 817 | "\t 16.984 & 29.022 & Gm933 & 1 \\\\\n", 818 | "\\end{tabular}\n" 819 | ], 820 | "text/markdown": [ 821 | "\n", 822 | "x | y | gene | count | \n", 823 | "|---|---|---|---|---|---|\n", 824 | "| 16.105 | 29.003 | Mx1 | 1 | \n", 825 | "| 16.105 | 29.003 | Cenpa | 1 | \n", 826 | "| 16.105 | 29.003 | Snora17 | 1 | \n", 827 | "| 16.984 | 29.022 | Nlrp5 | 1 | \n", 828 | "| 16.984 | 29.022 | Sned1 | 1 | \n", 829 | "| 16.984 | 29.022 | Gm933 | 1 | \n", 830 | "\n", 831 | "\n" 832 | ], 833 | "text/plain": [ 834 | " x y gene count\n", 835 | "1 16.105 29.003 Mx1 1 \n", 836 | "2 16.105 29.003 Cenpa 1 \n", 837 | "3 16.105 29.003 Snora17 1 \n", 838 | "4 16.984 29.022 Nlrp5 1 \n", 839 | "5 16.984 29.022 Sned1 1 \n", 840 | "6 16.984 29.022 Gm933 1 " 841 | ] 842 | }, 843 | "metadata": {}, 844 | "output_type": "display_data" 845 | } 846 | ], 847 | "source": [ 848 | "tail(st_long)" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 95, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "write.table(st_long,\"MOB_STST/Rep4_MOB1x_long.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE)" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "### Annotate barcodes with anatomic features" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": {}, 871 | "outputs": [], 872 | "source": [ 873 | "library(data.table)\n", 874 | "library(raster)\n", 875 | "library(rgeos)\n", 876 | "library(ggplot2)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "metadata": {}, 883 | 
"outputs": [], 884 | "source": [ 885 | "setwd(data_dir) #set wd to directory containing the polygon file as well as the barcodes under tissue file" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "annotate_bc=function(sample,plot=TRUE,max_dist=Inf,n_chunks=NULL,flip=TRUE){\n", 895 | " #read data\n", 896 | " poly=fread(paste0(sample,\"_annotations.txt\"))\n", 897 | " print(nrow(poly))\n", 898 | " bc=fread(paste0(sample,\"_barcodes_under_tissue.tsv\"))\n", 899 | " poly[value==\"\",value:=\"Unknown\"]\n", 900 | " if (any(\"polygon\"%in%names(poly))){setnames(poly,\"polygon\",\"x_y\")}\n", 901 | "\n", 902 | " \n", 903 | " #convert polygons table into list\n", 904 | " poly_long=poly[,.(x_y=strsplit(x_y,\" \"),value),by=1:nrow(poly)]\n", 905 | " poly_list=apply(poly_long,1,function(x){r=lapply(strsplit(unlist(x[\"x_y\"]),\",\"),as.numeric);r=lapply(r,function(x){if(length(x)==1){x=c(x,NA)};return(x)});r=t(as.data.table(r));colnames(r)=c(\"x\",\"y\");return(na.omit(r))})\n", 906 | " names(poly_list)=poly_long$value\n", 907 | " \n", 908 | " #make list of polygon objects\n", 909 | " sp=rapply(poly_list, Polygon,hole=FALSE, how = \"replace\")\n", 910 | " sp=lapply(1:length(sp), function(i) {Polygons(sp[i], as.character(i))})\n", 911 | " \n", 912 | " #make spatial polygons object\n", 913 | " pols=SpatialPolygons(sp)\n", 914 | " plot(pols)\n", 915 | " \n", 916 | " #prepare HD_ST coordinates (need to be mirrored at y-axis in some samples)\n", 917 | " if (flip==TRUE){\n", 918 | " print(\"Flipping y axis.\")\n", 919 | " flipped_bc=bc[,.(spot_px_x=spot_px_x,spot_px_y=-(spot_px_y-min(spot_px_y))+(max(spot_px_y))),]\n", 920 | " }else{\n", 921 | " print(\"Not flipping y axis.\")\n", 922 | " flipped_bc=bc[,.(spot_px_x=spot_px_x,spot_px_y=spot_px_y),]\n", 923 | " }\n", 924 | " \n", 925 | " #overlap polygons and HD_ST coordinates\n", 926 | " nr=nrow(flipped_bc)\n", 927 | " print(nr)\n", 
928 | "\n", 929 | " if (!is.null(n_chunks)){\n", 930 | " chunks_size=floor(nr/n_chunks)\n", 931 | " e=data.table()\n", 932 | " for (i in 0:n_chunks){\n", 933 | " start=i*chunks_size+1\n", 934 | " end=ifelse(i==n_chunks,nr,(i+1)*chunks_size)\n", 935 | " if (start > nr){break}\n", 936 | " print(paste0(\"Processing chunk \",start,\" to \",end))\n", 937 | " e1 = as.data.table(extract(pols, flipped_bc[start:end]))\n", 938 | " e1[,point.ID:=point.ID+(start-1),]\n", 939 | " e=rbindlist(list(e,e1))\n", 940 | " }\n", 941 | " }else{\n", 942 | " e = as.data.table(extract(pols, flipped_bc))\n", 943 | " } \n", 944 | " \n", 945 | " e[,poly:=poly.ID,]\n", 946 | " e[,poly.ID:=names(poly_list)[poly],]\n", 947 | " \n", 948 | " #find nearest polygons for HD_ST coordinates that don't fall into a polygon\n", 949 | " print(\"Now assigning missing.\")\n", 950 | " missing_id=e[is.na(poly.ID)]$point.ID\n", 951 | " missing=flipped_bc[missing_id,]\n", 952 | " missing[,point.ID:=missing_id,]\n", 953 | " sp_pts=SpatialPoints(missing[,c(\"spot_px_x\", \"spot_px_y\"),])\n", 954 | " dist=gDistance(sp_pts,pols,byid = TRUE)\n", 955 | " if(max_dist!=Inf){\n", 956 | " dist=apply(dist,c(1,2),function(x){ifelse(x>max_dist,NA,x)})\n", 957 | " missing[,np:=apply(dist,2,function(x){res=which.min(x);ifelse(length(res)>0,res,NA)})]\n", 958 | " missing=missing[!is.na(np)]\n", 959 | " }else{\n", 960 | " missing[,np:=apply(dist,2,which.min)]\n", 961 | " }\n", 962 | " missing[,poly:=np,]\n", 963 | " missing[,poly.ID:=names(poly_list)[np],]\n", 964 | " \n", 965 | " #combine annotations from primary and secondary assignment\n", 966 | " e_complete=rbindlist(list(e[!is.na(poly.ID)],missing[,c(\"point.ID\",\"poly.ID\",\"poly\"),]),use.names = TRUE)\n", 967 | " \n", 968 | " #merge with original HD_ST coordinates file\n", 969 | " bc[,point.ID:=1:nrow(bc),] \n", 970 | " bc_annot=unique(merge(bc,e_complete,by=\"point.ID\",all.x=TRUE)) \n", 971 | " bc_annot[is.na(poly.ID),poly.ID:=\"missing\",]\n", 972 | " 
bc_annot=bc_annot[!duplicated(point.ID)] #just take the first entry if a barcode is assigned to several annotations\n", 973 | " \n", 974 | " if (plot == TRUE){\n", 975 | " pl=ggplot(bc_annot,aes(y=spot_px_y,x=spot_px_x,col=poly.ID))+geom_point(size=0.5)+coord_fixed()\n", 976 | " png(paste0(sample,\"_bc_annot.png\"),height=500,width=700)\n", 977 | " print(pl)\n", 978 | " dev.off()\n", 979 | " }\n", 980 | " write.table(bc_annot,paste0(sample,\"_barcodes_under_tissue_annot.tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)\n", 981 | " return(bc_annot)\n", 982 | "}" 983 | ] 984 | }, 985 | { 986 | "cell_type": "markdown", 987 | "metadata": {}, 988 | "source": [ 989 | "##### MOB" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "metadata": {}, 996 | "outputs": [], 997 | "source": [ 998 | "bc=annotate_bc(\"CN13_D2\",TRUE)" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "metadata": {}, 1004 | "source": [ 1005 | "#### Breast cancer" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": null, 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "source": [ 1014 | "bc=annotate_bc(\"CN21_BC24350_E2\",TRUE,max_dist = 5,n_chunks = 10,flip = FALSE)" 1015 | ] 1016 | } 1017 | ], 1018 | "metadata": { 1019 | "kernelspec": { 1020 | "display_name": "R", 1021 | "language": "R", 1022 | "name": "ir" 1023 | }, 1024 | "language_info": { 1025 | "codemirror_mode": "r", 1026 | "file_extension": ".r", 1027 | "mimetype": "text/x-r-source", 1028 | "name": "R", 1029 | "pygments_lexer": "r", 1030 | "version": "3.5.0" 1031 | } 1032 | }, 1033 | "nbformat": 4, 1034 | "nbformat_minor": 2 1035 | } 1036 | -------------------------------------------------------------------------------- /segmentation/HD_ST_Master.m: --------------------------------------------------------------------------------
function [] = HD_ST_Master(st_spot_table_file,st_sc_mask_file,csv_output_file,flip_y)
%HD_ST_MASTER Map HDST barcodes onto a segmentation mask and export the result.
%   HD_ST_Master(SPOTS, MASK, OUT) reads the tab-delimited spot table SPOTS,
%   looks up the mask value under the first occurrence of every barcode and
%   writes a CSV file OUT with the columns bc, cell_id, x_centroid and
%   y_centroid. x_centroid/y_centroid are the REGIONPROPS centroid of the
%   region whose label equals cell_id; they are left empty for barcodes
%   whose cell_id is 0.
%
%   HD_ST_Master(SPOTS, MASK, OUT, FLIP_Y) additionally controls whether
%   spot_px_y is mirrored within its own [min, max] range before the lookup.
%   The default is true, which is what this function did unconditionally
%   before the argument existed.
%
%   Inputs
%     st_spot_table_file - tab-delimited file; the columns bc, spot_px_x and
%                          spot_px_y are used.
%     st_sc_mask_file    - image file read with IMREAD and indexed as
%                          mask(row = y, column = x).
%     csv_output_file    - destination passed to WRITETABLE.
%     flip_y             - optional logical, see above.
%
%   Side effects: clears the command window, opens three QC figures and
%   writes csv_output_file.

% NOTE: a "clear all" used to sit here. Inside a function it wipes the
% function workspace, including the input arguments, so every later use of
% them failed. Do not reintroduce it.
if nargin < 4 || isempty(flip_y)
    flip_y = true;
end
clc

%% Load tsv spot table
st_spot_table = readtable(st_spot_table_file,'Delimiter','\t');

%% Load mask
st_sc_mask = imread(st_sc_mask_file);
% If the mask image itself is upside down, flip it here instead:
% st_sc_mask = flipud(st_sc_mask);

% One centroid per region returned by regionprops. Preallocate so the
% vectors exist (as empty row vectors) even when the mask has no region.
mask_centroid = regionprops(st_sc_mask,'centroid');
n_regions = numel(mask_centroid);
x_centroid = zeros(1,n_regions);
y_centroid = zeros(1,n_regions);
for i=1:n_regions
    x_centroid(i) = mask_centroid(i).Centroid(1);
    y_centroid(i) = mask_centroid(i).Centroid(2);
end

%% Look up the cell id under every barcode
if flip_y
    % Mirror y within its own range so the spots line up with the mask
    st_spot_table.spot_px_y = -(st_spot_table.spot_px_y-min(st_spot_table.spot_px_y))+max(st_spot_table.spot_px_y);
end

% Index of the first occurrence of every barcode, in file order
[~,unique_value_location] = unique(st_spot_table.bc,'stable');

% Pixel position and name of each unique barcode
unique_x = round(st_spot_table.spot_px_x(unique_value_location));
unique_y = round(st_spot_table.spot_px_y(unique_value_location));
unique_bc = st_spot_table.bc(unique_value_location);

% Columns 2-4 (cell_id, x_centroid, y_centroid) start out empty so that the
% cell array always has four columns, even if no barcode lies on a cell.
unique_bc(:,2:4) = {[]};
for i=1:size(unique_bc,1)
    cell_id = st_sc_mask(unique_y(i),unique_x(i));
    unique_bc{i,2} = cell_id;
    if cell_id ~= 0
        unique_bc{i,3} = x_centroid(cell_id);
        unique_bc{i,4} = y_centroid(cell_id);
    end
end

export_table = cell2table(unique_bc,'VariableNames',...
    {'bc','cell_id','x_centroid','y_centroid'});

%% Plot for QC
% Centroids of the cells that received at least one barcode
figure()
scatter(cell2mat(unique_bc(:,3)),cell2mat(unique_bc(:,4)));
% All mask centroids
figure()
scatter(x_centroid,y_centroid);
% All barcode positions (after the optional flip)
figure()
scatter(st_spot_table.spot_px_x,st_spot_table.spot_px_y);

%% Export CSV
writetable(export_table,csv_output_file);

end

--------------------------------------------------------------------------------