├── Differential expression
├── BC-04-Differential Expression.ipynb
├── MOB-04-DE for regions.ipynb
└── MOB-05-DE for cell types.ipynb
├── Nuclear RNA
├── MOB-NuclearRNA-01-Load data in R.ipynb
└── MOB-NuclearRNA-02-Compare nucleus and cell.ipynb
├── README.md
├── alignment
├── .gitignore
├── README.md
├── setup.cfg
├── setup.py
└── staligner
│ ├── __init__.py
│ ├── __main__.py
│ ├── __version__.py
│ └── staligner.py
├── cell_typing
├── cell_type_assignment.ipynb
└── quality_check.ipynb
├── enrichment_analysis
└── enrichment_analysis.ipynb
├── files.png
├── hdst.png
├── pre_processing
├── BC-01-GenerateAnnData.ipynb
├── BC-02-Binning.ipynb
├── BC-03-Smoothing.ipynb
├── MOB-00-ABA Gene retrieval via API.ipynb
├── MOB-01-GenerateAnnData.ipynb
├── MOB-02-Binning.ipynb
├── MOB-03-Smoothing.ipynb
├── pre-processing_external.ipynb
└── pre-processing_hdst.ipynb
└── segmentation
└── HD_ST_Master.m
/Nuclear RNA/MOB-NuclearRNA-01-Load data in R.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load Allen and Macosko datasets"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "Warning message:\n",
20 | "“package ‘data.table’ was built under R version 3.5.2”"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "library(Matrix)\n",
26 | "library(data.table)"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Allen data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "cells <- readRDS('data/allen_50k.RDS')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "data": {
52 | "text/html": [
53 | "
\n",
54 | "\t- 27998
\n",
55 | "\t- 50000
\n",
56 | "
\n"
57 | ],
58 | "text/latex": [
59 | "\\begin{enumerate*}\n",
60 | "\\item 27998\n",
61 | "\\item 50000\n",
62 | "\\end{enumerate*}\n"
63 | ],
64 | "text/markdown": [
65 | "1. 27998\n",
66 | "2. 50000\n",
67 | "\n",
68 | "\n"
69 | ],
70 | "text/plain": [
71 | "[1] 27998 50000"
72 | ]
73 | },
74 | "metadata": {},
75 | "output_type": "display_data"
76 | }
77 | ],
78 | "source": [
79 | "dim(cells)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 4,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "cells.mat <- Matrix(cells, sparse=T)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "genes <- rownames(cells)\n",
98 | "barcodes <- colnames(cells)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 6,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/html": [
109 | "\n",
110 | " | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |
\n",
111 | "\n",
112 | "\tNcells | 1647717 | 88.0 | 2661402 | 142.2 | NA | 2661402 | 142.2 |
\n",
113 | "\tVcells | 130454269 | 995.3 | 1348288924 | 10286.7 | 16384 | 1530682202 | 11678.2 |
\n",
114 | "\n",
115 | "
\n"
116 | ],
117 | "text/latex": [
118 | "\\begin{tabular}{r|lllllll}\n",
119 | " & used & (Mb) & gc trigger & (Mb) & limit (Mb) & max used & (Mb)\\\\\n",
120 | "\\hline\n",
121 | "\tNcells & 1647717 & 88.0 & 2661402 & 142.2 & NA & 2661402 & 142.2 \\\\\n",
122 | "\tVcells & 130454269 & 995.3 & 1348288924 & 10286.7 & 16384 & 1530682202 & 11678.2 \\\\\n",
123 | "\\end{tabular}\n"
124 | ],
125 | "text/markdown": [
126 | "\n",
127 | "| | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |\n",
128 | "|---|---|---|---|---|---|---|---|\n",
129 | "| Ncells | 1647717 | 88.0 | 2661402 | 142.2 | NA | 2661402 | 142.2 |\n",
130 | "| Vcells | 130454269 | 995.3 | 1348288924 | 10286.7 | 16384 | 1530682202 | 11678.2 |\n",
131 | "\n"
132 | ],
133 | "text/plain": [
134 | " used (Mb) gc trigger (Mb) limit (Mb) max used (Mb) \n",
135 | "Ncells 1647717 88.0 2661402 142.2 NA 2661402 142.2\n",
136 | "Vcells 130454269 995.3 1348288924 10286.7 16384 1530682202 11678.2"
137 | ]
138 | },
139 | "metadata": {},
140 | "output_type": "display_data"
141 | }
142 | ],
143 | "source": [
144 | "rm(cells)\n",
145 | "gc()"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 7,
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/plain": [
156 | "NULL"
157 | ]
158 | },
159 | "metadata": {},
160 | "output_type": "display_data"
161 | }
162 | ],
163 | "source": [
164 | "writeMM(cells.mat, 'data/allen.mtx')"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 8,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "write.csv(genes, 'data/allen-genes.csv', row.names=F, quote=F)\n",
174 | "write.csv(barcodes, 'data/allen-barcodes.csv', row.names=F, quote=F)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "### Macosko data"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 9,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "cells <- readRDS('data/macosko_50k.RDS')"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 10,
196 | "metadata": {},
197 | "outputs": [
198 | {
199 | "data": {
200 | "text/html": [
201 | "\n",
202 | "\t- 27877
\n",
203 | "\t- 50000
\n",
204 | "
\n"
205 | ],
206 | "text/latex": [
207 | "\\begin{enumerate*}\n",
208 | "\\item 27877\n",
209 | "\\item 50000\n",
210 | "\\end{enumerate*}\n"
211 | ],
212 | "text/markdown": [
213 | "1. 27877\n",
214 | "2. 50000\n",
215 | "\n",
216 | "\n"
217 | ],
218 | "text/plain": [
219 | "[1] 27877 50000"
220 | ]
221 | },
222 | "metadata": {},
223 | "output_type": "display_data"
224 | }
225 | ],
226 | "source": [
227 | "dim(cells)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 11,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "cells.dt <- as.data.table(cells)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 12,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "genes <- rownames(cells)"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 13,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/html": [
256 | "\n",
257 | " | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |
\n",
258 | "\n",
259 | "\tNcells | 1755340 | 93.8 | 2661402 | 142.2 | NA | 2661402 | 142.2 |
\n",
260 | "\tVcells | 827788748 | 6315.6 | 1619974282 | 12359.5 | 16384 | 1530682202 | 11678.2 |
\n",
261 | "\n",
262 | "
\n"
263 | ],
264 | "text/latex": [
265 | "\\begin{tabular}{r|lllllll}\n",
266 | " & used & (Mb) & gc trigger & (Mb) & limit (Mb) & max used & (Mb)\\\\\n",
267 | "\\hline\n",
268 | "\tNcells & 1755340 & 93.8 & 2661402 & 142.2 & NA & 2661402 & 142.2 \\\\\n",
269 | "\tVcells & 827788748 & 6315.6 & 1619974282 & 12359.5 & 16384 & 1530682202 & 11678.2 \\\\\n",
270 | "\\end{tabular}\n"
271 | ],
272 | "text/markdown": [
273 | "\n",
274 | "| | used | (Mb) | gc trigger | (Mb) | limit (Mb) | max used | (Mb) |\n",
275 | "|---|---|---|---|---|---|---|---|\n",
276 | "| Ncells | 1755340 | 93.8 | 2661402 | 142.2 | NA | 2661402 | 142.2 |\n",
277 | "| Vcells | 827788748 | 6315.6 | 1619974282 | 12359.5 | 16384 | 1530682202 | 11678.2 |\n",
278 | "\n"
279 | ],
280 | "text/plain": [
281 | " used (Mb) gc trigger (Mb) limit (Mb) max used (Mb) \n",
282 | "Ncells 1755340 93.8 2661402 142.2 NA 2661402 142.2\n",
283 | "Vcells 827788748 6315.6 1619974282 12359.5 16384 1530682202 11678.2"
284 | ]
285 | },
286 | "metadata": {},
287 | "output_type": "display_data"
288 | }
289 | ],
290 | "source": [
291 | "rm(cells)\n",
292 | "gc()"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 14,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "fwrite(cells.dt, 'data/macosko.csv')"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 15,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "write.csv(genes, 'data/macosko-genes.csv', row.names=F, quote=F)"
311 | ]
312 | }
313 | ],
314 | "metadata": {
315 | "kernelspec": {
316 | "display_name": "R",
317 | "language": "R",
318 | "name": "ir"
319 | },
320 | "language_info": {
321 | "codemirror_mode": "r",
322 | "file_extension": ".r",
323 | "mimetype": "text/x-r-source",
324 | "name": "R",
325 | "pygments_lexer": "r",
326 | "version": "3.5.1"
327 | }
328 | },
329 | "nbformat": 4,
330 | "nbformat_minor": 2
331 | }
332 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HDST
2 |
3 | This is a public repository for all code connected to HDST (High-definition spatial transcriptomics).
4 |
5 | Please cite: Vickovic S et al. High-definition spatial transcriptomics for in situ tissue profiling. Nat Methods 2019: doi: https://doi.org/10.1038/s41592-019-0548-y
6 |
7 | # Tech workflow
8 | 
9 |
10 | # File Structure Overview
11 | All processed files are available at: https://portals.broadinstitute.org/single_cell/study/SCP420
12 |
13 | 
14 |
15 | We recommend using the `Bulk Download` function and consulting the `Metadata` file.
16 |
17 | #### `*red_ut*` files: sorted count TSV files with:
18 |
19 | `bc` barcode (XxY) coordinate
20 | `spot_px_x` representing (x) pixel coordinate in the HE image and `X` in `bc`
21 | `spot_px_y` representing (y) pixel coordinate in the HE image and `Y` in `bc`
22 | `gene` representing the gene name
23 | `count` representing UMI filtered expressed counts per corresponding gene
24 |
25 | (Note: spatial resolution is marked as `HDST`, `5x` or `segments` in all file names)
26 |
27 | #### `*barcodes_under_tissue_annot*` files: files connecting (x,y) coordinates to annotation regions in `HDST` with:
28 |
29 | `bc` barcode (XxY) coordinate
30 | `spot_px_x` representing (x) pixel coordinate in the HE image and `X` in `bc`
31 | `spot_px_y` representing (y) pixel coordinate in the HE image and `Y` in `bc`
32 | `annotation_region` as region names to each (x,y) coordinate
33 |
34 | #### `*HE.png` files are HE images used in the study
35 |
36 | #### `*HE_Probabilities_mask.tiff` files are coordinates of segmented nuclei based on corresponding HE images
37 |
38 | #### Files needed to run the ST pipeline:
39 | ##### `*.fastq` raw seq data with encoded barcode information
40 | ##### `*barcode_ids.tsv` ids files needed for demultiplexing
41 |
42 |
43 | # Alignment
44 | This is [code](./alignment) for aligning HE images to (x,y) barcode coordinates as given by ST Pipeline ([v.1.5.1](https://github.com/SpatialTranscriptomicsResearch/st_pipeline/releases/tag/1.5.1)).
45 |
46 | # Segmentation
47 | This is [code](./segmentation) for segmenting HE nuclei. HE image segmentation was performed by combining Ilastik and CellProfiler. The labeled segmentation mask was used to assign the individual spots to the corresponding Cell ID. The output CSV file includes Cell IDs, X and Y position of the cells (centroid) and the corresponding spots.
48 |
49 | # Cell typing
50 | This is [code](./cell_typing) for imputing cell types onto (x,y) spatial positions based on scRNA-seq data.
51 |
52 | # Differential expression (DE) analysis
53 | This is [code](./Differential%20expression) for DE analysis between annotated regions.
54 |
--------------------------------------------------------------------------------
/alignment/.gitignore:
--------------------------------------------------------------------------------
1 | **__pycache__
2 |
--------------------------------------------------------------------------------
/alignment/README.md:
--------------------------------------------------------------------------------
1 | # ST Aligner
2 |
3 | This package can be used to find approximate coordinates of spots on an HDST array.
4 |
5 | ## Installation
6 |
7 | To install the package and its dependencies with pip, run the following from this directory (the one containing `setup.py`):
8 |
9 | ```
10 | pip install .
11 | ```
12 |
13 | ## Usage
14 |
15 | ST Aligner is run on the bright-field microscopy image from an HDST experiment.
16 | Before proceeding, make sure that the microscopy image has the right orientation; spots will be indexed from the top left in the output file.
17 |
18 | Invoke the alignment script by running
19 |
20 | ```
21 | staligner --input <image> --output-directory <directory> --annotate
22 | ```
23 |
24 | The `--annotate` flag is optional but recommended.
25 | When specified, ST Aligner will emit an annotated bright-field image, showing the inferred locations of the spots.
26 | The annotated image can be used to verify that the results are correct.
27 |
--------------------------------------------------------------------------------
/alignment/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = staligner
3 | version = attr: staligner.__version__
4 |
5 | [options]
6 | zip_safe = True
7 | packages = find:
8 | install_requires =
9 | imageio ~= 2.4.1
10 | numpy ~= 1.16.2
11 | pandas ~= 0.24.1
12 | scipy ~= 1.2.1
13 | python_requires = ~= 3.7
14 |
15 | [options.entry_points]
16 | console_scripts =
17 | staligner = staligner.__main__:main
18 |
--------------------------------------------------------------------------------
/alignment/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import setuptools as st
3 |
4 | st.setup()
5 |
--------------------------------------------------------------------------------
/alignment/staligner/__init__.py:
--------------------------------------------------------------------------------
1 | from .staligner import align
2 | from .__version__ import __version__
3 |
--------------------------------------------------------------------------------
/alignment/staligner/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse as ap
2 |
3 | import logging
4 |
5 | from . import align
6 |
7 |
8 | logging.basicConfig(level=logging.INFO)
9 | LOG = logging.getLogger(__package__)
10 |
11 |
def main():
    """Script entry point.

    Parses the command line, optionally enables debug logging, derives the
    square decision window from the smallest positive requested dimension
    scaled by ``--win-size``, and hands everything to :func:`align`.

    Exits with a usage error (status 2) when ``--size`` contains no positive
    dimension, because the decision window cannot be derived in that case.
    """
    parser = ap.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Input image.')
    parser.add_argument('-o', '--output-directory', type=str, default='.',
                        help='Output directory.')
    parser.add_argument('--size', nargs=2, default=[-1, 7330], type=int,
                        help='Rescale the image to at most the given size before '
                        'running.')
    parser.add_argument('--win-size', default=1 / 25, type=float,
                        help='Size of the decision window. Use smaller values if '
                        'the image is very rotated or if the tissue extends '
                        'close to the borders of the array.')
    parser.add_argument('--annotate', action='store_true',
                        help='Emit bright-field image with spot annotations.')
    parser.add_argument('--debug', action='store_true',
                        help='Print debug messages.')
    opt = parser.parse_args()

    if opt.debug:
        LOG.setLevel(logging.DEBUG)

    # Non-positive entries mean "unconstrained"; at least one real dimension
    # is needed to size the decision window. Previously this surfaced as an
    # opaque "min() arg is an empty sequence" ValueError.
    positive_sizes = [s for s in opt.size if s > 0]
    if not positive_sizes:
        parser.error('--size must contain at least one positive dimension')

    LOG.info('Running frame detection with options: %s.',
             ', '.join([f'{k}={v}' for k, v in vars(opt).items()]))
    align(
        im_file=opt.input,
        im_size=opt.size,
        # Same window edge length for both axes.
        win_size=[round(min(positive_sizes) * opt.win_size)] * 2,
        annotate=opt.annotate,
        output_directory=opt.output_directory,
    )
46 |
47 |
48 | if __name__ == "__main__":
49 | main()
50 |
--------------------------------------------------------------------------------
/alignment/staligner/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.0'
2 |
--------------------------------------------------------------------------------
/alignment/staligner/staligner.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import itertools as it
4 |
5 | import logging
6 |
7 | import os
8 |
9 | from imageio import imread, imwrite
10 |
11 | import numpy as np
12 |
13 | import pandas as pd
14 |
15 | from scipy.ndimage.interpolation import zoom
16 | from scipy.signal import fftconvolve
17 |
18 |
19 | __all__ = ['align']
20 |
21 | LOG = logging.getLogger(__package__)
22 |
# 3x3 Sobel kernels as plain ndarrays (np.matrix is discouraged by NumPy).
# SOBELX differentiates along axis 0 and smooths along axis 1; SOBELY is its
# transpose.
SOBELX = np.outer([-1, 0, 1], [1, 2, 1])
SOBELY = SOBELX.T


def sobel(im, amplify):
    """Applies a sobel filter to the input image.

    Parameters
    ----------
    im : array with 2 or 3 dimensions
        For 3-d input the filter is applied to each slice along the last axis
        independently (the kernel has extent 1 on that axis).
    amplify : float
        Controls the exponent applied to the squared gradient magnitude:
        the result is ``(gx**2 + gy**2) ** (1 / (1 + exp(amplify)))``.
        ``amplify=0`` therefore yields the plain gradient magnitude.

    Returns
    -------
    Array with the same shape as ``im``. The convolution is run in 'valid'
    mode and the one-pixel border is filled by edge replication.

    Raises
    ------
    ValueError
        If ``im`` is neither 2-d nor 3-d.
    """
    if len(im.shape) == 2:
        fx, fy = SOBELX, SOBELY
        pad_width = (1,)
    elif len(im.shape) == 3:
        # Add a singleton trailing axis so the last image axis is untouched.
        fx, fy = SOBELX[:, :, np.newaxis], SOBELY[:, :, np.newaxis]
        pad_width = ((1, 1), (1, 1), (0, 0))
    else:
        raise ValueError('Invalid image dimensions')

    gx = fftconvolve(im, fx, mode='valid')
    gy = fftconvolve(im, fy, mode='valid')

    return np.pad(
        (gx * gx + gy * gy) ** (1 / (1 + np.exp(amplify))),
        pad_width,
        mode='edge',
    )
47 |
48 |
def zoomto(shape, im):
    """Zoom image to a given shape.

    Parameters
    ----------
    shape : sequence of int
        Target sizes for the leading image axes. Non-positive entries (e.g.
        ``-1``) mark an axis as unconstrained, matching how the command-line
        entry point treats ``--size``.
    im : array with 2 or 3 dimensions

    Returns
    -------
    (image, zoom_factor)
        The first two axes are scaled by one common factor, the smallest
        ``target / current`` ratio over the constrained axes; a third axis,
        if present, is left unscaled. When no axis is constrained, or the
        factor is exactly 1, the input object itself is returned with
        factor ``1.``.

    Raises
    ------
    ValueError
        If ``im`` is neither 2-d nor 3-d.
    """
    dims = len(im.shape)
    if dims not in [2, 3]:
        raise ValueError('Invalid image dimensions')

    ratios = [
        target / current
        for (target, current) in zip(shape, im.shape) if target > 0
    ]
    # Nothing to constrain: previously min() raised on the empty list.
    if not ratios:
        return im, 1.

    zoom_factor = min(ratios)
    if zoom_factor == 1.:
        return im, 1.

    zoom_seq = [1] * dims
    zoom_seq[:2] = [zoom_factor] * 2
    return zoom(im, zoom=zoom_seq, order=1, mode='nearest'), zoom_factor
67 |
68 |
def ma(seq, length):
    """Computes the moving average of a 1-d sequence.

    The average is taken over a window of ``length`` samples in 'valid' mode
    and the ends are filled by edge replication so that the result has the
    same number of samples as ``seq``.

    The left padding is ``length // 2`` and the right padding is
    ``(length - 1) // 2``. Padding both sides by ``length // 2`` (the previous
    behaviour) produced one sample too many for even window lengths; the
    leading ``len(seq)`` samples are unchanged by this fix.
    """
    window = [1 / length] * length
    averaged = np.convolve(seq, window, mode='valid')
    return np.pad(
        averaged,
        (length // 2, (length - 1) // 2),
        mode='edge',
    )
77 |
78 |
def deriv(seq):
    """Finite-difference derivative of a 1-d sequence.

    Each interior sample becomes ``seq[i + 1] - seq[i - 1]`` (the difference
    is not divided by the step width); the two end samples repeat their
    nearest interior value so the output has as many samples as the input.
    """
    kernel = [1, 0, -1]
    interior = np.convolve(seq, kernel, mode='valid')
    return np.pad(interior, (1,), mode='edge')
88 |
89 |
def getbounds(im, ymax=None, xmax=None):
    """Locates the top and left edges of a light rectangle in a 2-d image.

    The image is summed along each axis, each resulting profile is
    differentiated and smoothed with a moving average about 1/100 of the
    profile length wide (at least one sample), and the position of the
    largest smoothed derivative is taken as the edge.

    Parameters
    ----------
    im : 2-d array
    ymax, xmax : int, optional
        Only the first ``ymax`` / ``xmax`` samples of the respective profile
        are searched. Default to the full extent of the image.

    Returns
    -------
    ((top, left), (dy, dx))
        The edge indices and the smoothed derivative profiles.
    """
    if ymax is None:
        ymax = im.shape[0]
    if xmax is None:
        xmax = im.shape[1]

    def _smoothed_slope(profile):
        return ma(deriv(profile), max(1, len(profile) // 100))

    dy = _smoothed_slope(np.sum(im, axis=1))
    dx = _smoothed_slope(np.sum(im, axis=0))

    top = np.argmax(dy[:ymax])
    left = np.argmax(dx[:xmax])
    return (top, left), (dy, dx)
103 |
104 |
def restriction(H, W, x1s, t):
    """Completes a rectangle from one corner, its side lengths and an angle.

    Starting from corner ``x1s``, the second corner lies ``W`` away along
    ``(sin t, cos t)``, the third lies ``H`` away along ``(cos t, -sin t)``,
    and the fourth is the sum of both offsets. Returns the four corners as
    ``[x1s, x2s, x3s, x4s]``; the first entry is the ``x1s`` object itself.
    """
    sin_t = np.sin(t)
    cos_t = np.cos(t)
    p0, p1 = x1s

    second = [p0 + W * sin_t, p1 + W * cos_t]
    third = [p0 + H * cos_t, p1 - H * sin_t]
    fourth = [second[0] + third[0] - p0, second[1] + third[1] - p1]
    return [x1s, second, third, fourth]
114 |
115 |
def drestricted_cost(H, W, yss, x1s, t):
    """Gradient of the restricted cost with respect to ``x1s`` and ``t``.

    Applies the chain rule to the per-corner gradients from ``dcost``: every
    corner moves one-to-one with ``x1s``, so the corner gradients are summed
    for ``x1s``; corners 2 to 4 additionally depend on ``t`` through the sine
    and cosine terms used in ``restriction``.

    Returns ``[dx1sr, dt]`` where ``dx1sr`` is a two-element list.
    """
    g1, g2, g3, g4 = dcost(yss, restriction(H, W, x1s, t))

    grad_x1s = [
        g1[0] + g2[0] + g3[0] + g4[0],
        g1[1] + g2[1] + g3[1] + g4[1],
    ]

    # Accumulated term by term, in the corner order 2, 3, 4.
    grad_t = g2[0] * W * np.cos(t)
    grad_t = grad_t - g2[1] * W * np.sin(t)
    grad_t = grad_t + -g3[0] * H * np.sin(t)
    grad_t = grad_t - g3[1] * H * np.cos(t)
    grad_t = grad_t + g4[0] * (W * np.cos(t) - H * np.sin(t))
    grad_t = grad_t + g4[1] * (-W * np.sin(t) - H * np.cos(t))

    return [grad_x1s, grad_t]
129 |
130 |
def restricted_cost(H, W, yss, x1s, t):
    """Cost of the rectangle defined by ``H``, ``W``, ``x1s`` and ``t``.

    Builds the four corners with ``restriction`` and scores them against the
    target corners ``yss`` with ``cost``.
    """
    corners = restriction(H, W, x1s, t)
    return cost(yss, corners)
135 |
136 |
def dcost(yss, xss):
    """Gradient of the squared-distance cost with respect to ``xss``.

    Returns a list with one entry per point pair, each holding
    ``2 * (x - y)`` for every coordinate of that pair.
    """
    gradients = []
    for point, target in zip(xss, yss):
        gradients.append([2 * (p - q) for p, q in zip(point, target)])
    return gradients
144 |
145 |
def cost(yss, xss):
    """Sum of squared coordinate differences between paired points.

    ``xss`` and ``yss`` are iterated in parallel, point by point and
    coordinate by coordinate.
    """
    total = 0
    for point, target in zip(xss, yss):
        squared_distance = 0
        for p, q in zip(point, target):
            squared_distance += (p - q) ** 2
        total += squared_distance
    return total
153 |
154 |
def optimize_cost(H, W, yss, x1s0, t0, n_iter=10000, x_step=1e-3,
                  t_step=1e-9):
    """Optimize cost by gradient descent.

    Parameters
    ----------
    H, W : float
        Side lengths of the rectangle being fitted.
    yss : sequence of four 2-element sequences
        Target corners the rectangle is fitted to.
    x1s0 : 2-element sequence
        Starting position of the first corner. It is copied, never modified.
    t0 : float
        Starting rotation angle.
    n_iter : int, optional
        Number of gradient steps.
    x_step, t_step : float, optional
        Step sizes for the corner position and the angle respectively.

    Returns
    -------
    (x1s, t)
        The fitted first corner (a new list) and rotation angle.
    """
    # Work on a copy. The caller may pass the very list that is also
    # yss[0]; updating it in place would drag the target corner along with
    # the estimate, pinning that corner's residual to zero, and would also
    # clobber the caller's data.
    x1s = list(x1s0)
    t = t0
    f = partial(restricted_cost, H, W, yss)
    df = partial(drestricted_cost, H, W, yss)
    # Evaluating the loss is only useful when it is actually going to be
    # logged, so check once up front.
    debug = LOG.isEnabledFor(logging.DEBUG)
    for i in range(n_iter):
        dx1s, dt = df(x1s, t)
        x1s[0] -= x_step * dx1s[0]
        x1s[1] -= x_step * dx1s[1]
        t -= t_step * dt
        if debug:
            LOG.debug('Iteration %d, loss=%.2e', i, f(x1s, t))
            LOG.debug('-----------------------')
            LOG.debug('x1s: %s', x1s)
            LOG.debug('dx1s: %s', dx1s)
            LOG.debug('t: %.3f', t)
            LOG.debug('dt: %.3f', dt)
    return x1s, t
174 |
175 |
def align(
        im_file,
        im_size,
        win_size,
        annotate=False,
        output_directory=None,
):
    """Runs the frame detection.

    Reads the image, rescales it, applies a sobel filter, estimates the four
    corners of the array frame, fits a rectangle to them and writes the pixel
    position of every spot of a 783 x 1918 grid to
    ``<output_directory>/<image name without extension>.tsv``.

    Parameters
    ----------
    im_file : str
        Path of the input image.
    im_size : sequence of two int
        Target size passed to ``zoomto`` (``-1`` leaves an axis
        unconstrained).
    win_size : sequence of two int
        Height and width of the window used to refine each corner estimate.
    annotate : bool, optional
        If true, also writes ``<name>.annotated.tif`` with the spot positions
        marked on a copy of the original image.
    output_directory : str, optional
        Where to write the outputs; created if missing. Defaults to ``'.'``.

    Notes
    -----
    The output TSV has the columns ``spot_y``, ``spot_x`` (1-based grid
    indices) and ``spot_px_y``, ``spot_px_x`` (rounded pixel positions in the
    original, unscaled image). The fitted rotation and corners are printed to
    standard output.
    """
    if output_directory is None:
        output_directory = '.'
    elif not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Strip only the last extension of the file *name*. The previous slicing
    # searched the whole path for a dot, which gave an empty name for files
    # without an extension and a wrong one when only a directory had a dot.
    im_file_no_ext = os.path.splitext(os.path.basename(im_file))[0]

    def _go(im_):
        # Coarse estimate: search only the first eighth of each axis.
        (t1, l1), _ = getbounds(im_, *[s // 8 for s in im_.shape])
        slices = [
            slice(max(c - s // 2, 0), c + (s + 1) // 2 + 1)
            for (c, s) in zip((t1, l1), win_size)
        ]
        # Refined estimate inside a window centred on the coarse one.
        win = im_[tuple(slices)]
        (t2, l2), _ = getbounds(win)
        return [slices[0].start + t2, slices[1].start + l2]

    def _annotate(spots, image):
        maxval = np.iinfo(image.dtype).max
        # NOTE(review): spot positions outside the image are not clipped
        # here; confirm whether that can happen for strongly rotated input.
        image[spots[0, :], spots[1, :]] = (
            [maxval, 0, 0]
            if image.shape[-1] == 3 else
            [maxval, 0, 0, maxval]
            if image.shape[-1] == 4 else
            maxval
        )
        save_path = os.path.join(
            output_directory,
            f'{im_file_no_ext}.annotated.tif',
        )
        LOG.info('Saving annotated image to %s', save_path)
        imwrite(save_path, image)

    def _dist(a, b):
        # Euclidean distance between two (y, x) points.
        return np.hypot(a[0] - b[0], a[1] - b[1])

    im = imread(im_file)

    if annotate:
        # Keep a copy of the original image; `im` is overwritten below.
        _annotate = partial(_annotate, image=im.copy())
    else:
        _annotate = lambda *_: None

    LOG.info('Scaling image to %dx%d', *im_size)
    im, zoom_factor = zoomto(im_size, im)

    LOG.info('Applying sobel filter and flattening')
    edges = sobel(im, 1.5)
    # Multi-channel images are collapsed over the channel axis; single-channel
    # (2-d) images are already flat and used to crash on axis=2.
    im = np.sum(edges, axis=2) if edges.ndim == 3 else edges

    LOG.info('Running bounds detection')

    # Every corner is found with the same top-left detector by flipping the
    # image, then mapping the result back to unflipped coordinates.
    LOG.debug('Running bounds detection on the top-left corner')
    tl = _go(im)

    LOG.debug('Running bounds detection on the top-right corner')
    tr = _go(im[:, ::-1])
    tr[1] = im.shape[1] - tr[1] - 1

    LOG.debug('Running bounds detection on the bottom-left corner')
    bl = _go(im[::-1, :])
    bl[0] = im.shape[0] - bl[0] - 1

    LOG.debug('Running bounds detection on the bottom-right corner')
    br = _go(im[::-1, ::-1])
    br[0] = im.shape[0] - br[0] - 1
    br[1] = im.shape[1] - br[1] - 1

    # Back to the coordinate system of the original (unscaled) image.
    tl, tr, bl, br = [[a / zoom_factor for a in b] for b in (tl, tr, bl, br)]
    LOG.info('Unaligned result: top-left=%s', tl)
    LOG.info('Unaligned result: top-right=%s', tr)
    LOG.info('Unaligned result: bottom-left=%s', bl)
    LOG.info('Unaligned result: bottom-right=%s', br)

    # [height, width]: each is the mean of the two opposite side lengths.
    # The width previously used `tr[0] - tr[0]` and `br[0] - br[0]`, which
    # are always zero, so the vertical offset between the corners was lost.
    array_size_px = [
        (_dist(tl, bl) + _dist(tr, br)) / 2,
        (_dist(tr, tl) + _dist(br, bl)) / 2,
    ]
    x1s, t = optimize_cost(
        *array_size_px,
        [tl, tr, bl, br],
        # Pass a copy so the start point never aliases the target corner.
        list(tl),
        0,
    )
    tl_, tr_, bl_, br_ = restriction(*array_size_px, x1s, t)

    print('Rotation=%.3f rad' % t)
    print('Top-left=%s' % tl_)
    print('Top-right=%s' % tr_)
    print('Bottom-left=%s' % bl_)
    print('Bottom-right=%s' % br_)

    # Homogeneous (y, x, 1) coordinates of every grid position, rows varying
    # slowest, built in one vectorised step.
    n_rows, n_cols = 783, 1918
    grid_y, grid_x = np.indices((n_rows, n_cols), dtype=np.float64)
    spots = np.stack([
        grid_y.ravel(),
        grid_x.ravel(),
        np.ones(n_rows * n_cols, dtype=np.float64),
    ])

    spot_labels = spots[:2, :].copy()

    # Rows are staggered: even rows are offset by 0.25 and odd rows by 0.75
    # of a column pitch; every row is offset by half a row pitch.
    spots[1, :] += 0.5 * (0.5 + spots[0, :] % 2)
    spots[0, :] += 0.5
    # Grid units -> pixels.
    spots[1, :] *= array_size_px[1] / n_cols
    spots[0, :] *= array_size_px[0] / n_rows

    R = np.array([
        [np.cos(t), np.sin(t), 0],
        [-np.sin(t), np.cos(t), 0],
        [0, 0, 1],
    ])
    T = np.array([
        [1, 0, tl_[0]],
        [0, 1, tl_[1]],
        [0, 0, 1],
    ])

    # Rotate about the grid origin, then translate to the fitted corner.
    spots = T @ R @ spots
    spots = np.round(spots).astype(int)

    # index labels from 1
    spot_labels += 1

    df = pd.concat(
        [
            pd.DataFrame(
                spot_labels.T,
                columns=['spot_y', 'spot_x'],
                dtype=int,
            ),
            pd.DataFrame(
                spots[:2, :].T,
                columns=['spot_px_y', 'spot_px_x'],
                dtype=int,
            ),
        ],
        axis=1,
    )

    save_path = os.path.join(output_directory, f'{im_file_no_ext}.tsv')
    LOG.info('Saving spots file to %s', save_path)
    df.to_csv(save_path, index=False, sep='\t')

    _annotate(spots)
332 |
--------------------------------------------------------------------------------
/cell_typing/cell_type_assignment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "library(data.table)\n",
10 | "library(parallel)"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "setwd(\"/broad/regevtmp/jklugham/HD_ST\") # set wd to project directory containing all the sub folder"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "#### Calculate cell type likelihoods for each barcode/bin"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "calc_lk=function(ref,dat,exp=NULL,n_shuff=0){\n",
36 | " dat_ct=merge(dat,ref,by=\"gene\",allow.cartesian=TRUE)\n",
37 | " likelihoods=dat_ct[,.(lk=sum(count*log_norm),\n",
38 | " count=sum(count),\n",
39 | " umi_distrib=paste0(\"c(\",paste(count,collapse = \",\"),\")\")\n",
40 | " ),by=c(\"x\",\"y\",\"ClusterName\",exp)]\n",
41 | " \n",
42 | " get_emp_pval=function(lk_orig,counts,cluster,ref_data,n_shuff){\n",
43 | " ref_cl=ref_data[ClusterName==cluster]$log_norm\n",
44 | " counts=eval(parse(text=counts))\n",
45 | " zero_rat=sum(ref_cl==-Inf)/length(ref_cl)\n",
46 | " p_no0=(1-zero_rat)^length(counts)\n",
47 | " \n",
48 | " set.seed(1234)\n",
49 | " shuffles=lapply(rep(length(counts),n_shuff),FUN=function(x)sample(ref_cl[ref_cl!=-Inf],x,replace = FALSE))\n",
50 | " lk_shuff=unlist(mclapply(X=shuffles,FUN=function(x)sum(counts*x),mc.cores=1,mc.preschedule = TRUE))\n",
51 | " p=unlist(mclapply(X=lk_orig,FUN=function(x){sum(lk_shuff>=x)/n_shuff},mc.cores=1,mc.preschedule = TRUE))\n",
52 | " return(p*p_no0)\n",
53 | " }\n",
54 | " if (n_shuff!=0){\n",
55 | " likelihoods[lk!=-Inf,emp_pval:=get_emp_pval(lk,umi_distrib,ClusterName,ref,n_shuff),\n",
56 | " by=c(\"umi_distrib\",\"ClusterName\")]\n",
57 | " likelihoods[lk==-Inf,emp_pval:=1,]\n",
58 | " }else{\n",
59 | " likelihoods[,emp_pval:=NA,]\n",
60 | " }\n",
61 | " return(likelihoods)\n",
62 | "}"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "#### Process the output from calc_lk"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "process_calk_lk=function(likelihoods,Ng=Ngenes,exp=NULL){\n",
79 | " find_nexthighest=function(lk){\n",
80 | " lk_sort=sort(lk,decreasing=TRUE)\n",
81 | " nexth=sapply(lk,function(x){c(lk_sort[lk_sort=thres[2]&emp_pval_adjust_BH<=thres[1]&lk_rat>=0.8,na.rm = TRUE),by=c(\"x\",\"y\")]\n",
288 | "write.table(likelihoods_li,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "### Breast cancer"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "thres=c(0.05,0.7) #p-value, lk_norm"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "tnbc_norm=fread(\"results/tnbc_norm.tsv\")"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "#### 1x (hd)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "dat=fread(\"BC/CN21_BC24350_E2_filtered_red_ut.tsv\")\n",
330 | "tag=\"E2_tnbc_hd\""
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "dat=fread(\"BC_nc/CN21_BC24350_E2_unmodgtf_filtered_red_ut.tsv\")\n",
340 | "tag=\"E2_unmodgtf_tnbc_hd\""
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "dat=fread(\"BC_nc/CN21_BC24350_C1_unmodgtf_filtered_red_ut.tsv\")\n",
350 | "tag=\"C1_unmodgtf_tnbc_hd\""
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "dat=fread(\"BC_nc/CN21_BC24350_D1_unmodgtf_filtered_red_ut.tsv\")\n",
360 | "tag=\"D1_unmodgtf_tnbc_hd\""
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "#### binned "
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "dat=fread(\"BC_binned/hdst-lowres.tsv\")\n",
377 | "tag=\"E2_tnbc_low\""
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "dat=fread(\"BC_binned_nc/hdst-lowres.tsv\")\n",
387 | "tag=\"E2_unmodgtf_tnbc_low\""
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "dat=fread(\"BC_binned_nc/C1/hdst-lowres.tsv\")\n",
397 | "tag=\"C1_unmodgtf_tnbc_low\""
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "dat=fread(\"BC_binned_nc/D1/hdst-lowres.tsv\")\n",
407 | "tag=\"D1_unmodgtf_tnbc_low\""
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "#### segmented"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "dat=fread(\"BC/CN21_BC24350_E2_filtered_red_ut_segmentd.tsv\")\n",
424 | "tag=\"E2_tnbc_seg\""
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "dat=fread(\"BC/CN21_BC24350_E2_unmodgtf_filtered_red_ut_segmentd.tsv\")\n",
434 | "tag=\"E2_unmodgtf_tnbc_seg\""
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "metadata": {},
440 | "source": [
441 | "#### here actually run"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "ns=1000\n",
451 | "system.time(\n",
452 | "expr=likelihoods_tnbc<-calc_lk(tnbc_norm,dat,n_shuff = ns)\n",
453 | ")\n",
454 | "write.table(likelihoods_tnbc,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE) #fastp_ns only introduced 5/15\n",
455 | "likelihoods_tnbc=process_calk_lk(likelihoods = likelihoods_tnbc,Ng = 10000)\n",
456 | "likelihoods_tnbc[,N_ct:=sum(lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]&lk_rat>=0.8,na.rm = TRUE),by=c(\"x\",\"y\")]\n",
457 | "write.table(likelihoods_tnbc,paste0(\"results/ct_likelihoods_\",tag,\"_fastp_\",ns,\".tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)"
458 | ]
459 | }
460 | ],
461 | "metadata": {
462 | "kernelspec": {
463 | "display_name": "R",
464 | "language": "R",
465 | "name": "ir"
466 | },
467 | "language_info": {
468 | "codemirror_mode": "r",
469 | "file_extension": ".r",
470 | "mimetype": "text/x-r-source",
471 | "name": "R",
472 | "pygments_lexer": "r",
473 | "version": "3.5.0"
474 | }
475 | },
476 | "nbformat": 4,
477 | "nbformat_minor": 2
478 | }
479 |
--------------------------------------------------------------------------------
/enrichment_analysis/enrichment_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "library(data.table)"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "setwd(project_directory) # set wd to project directory containing all the sub folder"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Location enrichment analysis (genes or cells)"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "test_loc_enrichment=function(data,lk_thres=NULL,p_thres=NULL,sub_col){\n",
35 | "    # One-sided (greater) Fisher exact test for every ClusterName x layer pair in data, counting distinct values of column sub_col.\n",
36 | "    # Rows with lk_norm<lk_thres or emp_pval_adjust_BH>p_thres are dropped first (each filter is skipped when its threshold is NULL).\n",
37 | "    if (!is.null(lk_thres)){dat_filt=data[lk_norm>=lk_thres]}else{dat_filt=data}\n",
38 | "    if (!is.null(p_thres)){dat_filt=dat_filt[emp_pval_adjust_BH<=p_thres]}\n",
39 | "    cts=unique(dat_filt$ClusterName)\n",
40 | "    anats=unique(dat_filt$layer)\n",
41 | "    res_rows=list() # one single-row table per pair, bound once after the loops instead of re-copying the result on every iteration\n",
42 | "    for (ict in cts){\n",
43 | "        for (ianat in anats){\n",
44 | "            # matrix() fills column-wise: rows index the cell type (yes/no), columns index the layer (yes/no)\n",
45 | "            cont_mat=matrix(c(dat_filt[ClusterName==ict&layer==ianat,length(unique(get(sub_col)))],\n",
46 | "                              dat_filt[ClusterName!=ict&layer==ianat,length(unique(get(sub_col)))],\n",
47 | "                              dat_filt[ClusterName==ict&layer!=ianat,length(unique(get(sub_col)))],\n",
48 | "                              dat_filt[ClusterName!=ict&layer!=ianat,length(unique(get(sub_col)))]),\n",
49 | "                            nrow = 2,\n",
50 | "                            dimnames = list(ct = c(\"yes\", \"no\"),\n",
51 | "                                            anat = c(\"yes\", \"no\")))\n",
52 | "            ft=fisher.test(cont_mat,alternative = \"greater\",conf.int = TRUE)\n",
53 | "            res_rows[[length(res_rows)+1]]=data.table(cell_type=ict,\n",
54 | "                                                      layer=ianat,\n",
55 | "                                                      p.value=ft$p.value,\n",
56 | "                                                      cof.int.low=ft$conf.int[1], # cof.int.* spelling kept on purpose: these are the returned column names\n",
57 | "                                                      cof.int.high=ft$conf.int[2])\n",
58 | "        }\n",
59 | "    }\n",
60 | "    return(rbindlist(res_rows)) # columns: cell_type, layer, p.value, cof.int.low, cof.int.high\n",
61 | "}"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### MOB cell type enrichment analysis"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "anat=fread(\"MOB/CN13_D2_barcodes_under_tissue_annot.tsv\")\n",
78 | "setnames(anat,\"poly.ID\",\"layer\")"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "thres=c(0.01,0.1) #p-value, lk_norm"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "#### 1x (hd)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "lk=fread(\"results/ct_likelihoods_li_hd_fastp_1000.tsv\")\n",
104 | "tag=\"li_hd\""
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_hd_fastp_1000.tsv\")\n",
114 | "tag=\"unmodgtf_li_hd\""
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "#prepare\n",
124 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
125 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "#### binned "
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "lk=fread(\"results/ct_likelihoods_li_low_fastp_1000.tsv\")\n",
142 | "coords=fread(\"MOB_binned/hdst-lowres-coordinates.csv\")\n",
143 | "tag=\"li_low\""
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_low_fastp_1000.tsv\")\n",
153 | "coords=fread(\"MOB_binned/hdst-lowres-coordinates.csv\")\n",
154 | "tag=\"unmodgtf_li_low\""
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "#prepare\n",
164 | "sub=lk[bin==\"5x\"]\n",
165 | "sub[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
166 | "lk_anat=merge(sub[lk_rat==1&N_ct==1],\n",
167 | " unique(coords[,c(\"region\",\"5x_x\",\"5x_y\"),with=FALSE][!duplicated(cbind(`5x_x`,`5x_y`))]),\n",
168 | " by.x=c(\"x\",\"y\"),\n",
169 | " by.y=c(\"5x_x\",\"5x_y\"))\n",
170 | "setnames(lk_anat,\"region\",\"layer\")"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "#### segmented"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "lk=fread(\"results/ct_likelihoods_li_seg_fastp_1000.tsv\")\n",
187 | "tag=\"li_seg\""
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "lk=fread(\"results/ct_likelihoods_unmodgtf_li_seg_fastp_1000.tsv\")\n",
197 | "tag=\"unmodgtf_li_seg\""
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "#prepare\n",
207 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
208 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "#### Run the enrichment (uses `lk_anat` and `tag` from whichever section above was executed last)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "#enrichment analysis\n",
225 | "res_li=test_loc_enrichment(data = lk_anat,p_thres = thres[1],lk_thres = thres[2],sub_col = \"bc\")\n",
226 | "res_li[,p.value.adjust:=p.adjust(p.value,method = \"BY\"),]\n",
227 | "write.table(res_li,paste0(\"results/ct_anat_enrich_\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "### Breast cancer cell type enrichment analysis"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "anat=fread(\"BC/CN21_BC24350_E2_barcodes_under_tissue_annot.tsv\")\n",
244 | "setnames(anat,\"poly.ID\",\"layer\")"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "thres=c(0.05,0.7) #p-value, lk_norm"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "#### 1x (hd)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_hd.tsv\")\n",
270 | "tag=\"E2_tnbc_hd\""
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_hd.tsv\")\n",
280 | "tag=\"E2_unmodgtf_tnbc_hd\""
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "#prepare\n",
290 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
291 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "#### binned "
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_low.tsv\")\n",
308 | "coords=fread(\"BC_binned/hdst-breast-cancer-lowres-coordinates.csv\")\n",
309 | "tag=\"E2_tnbc_low\""
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_low.tsv\")\n",
319 | "coords=fread(\"BC_binned/hdst-breast-cancer-lowres-coordinates.csv\")\n",
320 | "tag=\"E2_unmodgtf_tnbc_low\""
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "#prepare\n",
330 | "sub=lk[bin==\"5x\"]\n",
331 | "sub[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
332 | "lk_anat=merge(sub[lk_rat==1&N_ct==1],\n",
333 | " unique(coords[,c(\"region\",\"5x_x\",\"5x_y\"),with=FALSE][!duplicated(cbind(`5x_x`,`5x_y`))]),\n",
334 | " by.x=c(\"x\",\"y\"),\n",
335 | " by.y=c(\"5x_x\",\"5x_y\"))\n",
336 | "setnames(lk_anat,\"region\",\"layer\")"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "#### segmented"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "lk=fread(\"results/ct_likelihoods_E2_tnbc_seg.tsv\")\n",
353 | "tag=\"E2_tnbc_seg\""
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "lk=fread(\"results/ct_likelihoods_E2_unmodgtf_tnbc_seg.tsv\")\n",
363 | "tag=\"E2_unmodgtf_tnbc_seg\""
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": [
372 | "#prepare\n",
373 | "lk[,N_ct:=sum(lk_rat>=0.8&lk_norm>=thres[2]&emp_pval_adjust_BH<=thres[1]),by=c(\"x\",\"y\")]\n",
374 | "lk_anat=merge(lk[lk_rat==1&N_ct==1],anat,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "#### Run the enrichment (uses `lk_anat` and `tag` from whichever section above was executed last)"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "#enrichment analysis\n",
391 | "res_li=test_loc_enrichment(data = lk_anat,p_thres = thres[1],lk_thres = thres[2],sub_col = \"bc\")\n",
392 | "res_li[,p.value.adjust:=p.adjust(p.value,method = \"BY\"),]\n",
393 | "write.table(res_li,paste0(\"results/ct_anat_enrich_\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "### MOB: enrichment of HDST layer-specific genes among ABA layer-specific genes"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "#aba genes\n",
410 | "aba_layer_spec=fread(\"ext_data/aba_MOB_diff.tsv\")"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "#hdst genes\n",
420 | "hdst_markers=fread(\"DE/Table_DEGs.csv\",drop = \"V1\")\n",
421 | "setnames(hdst_markers,names(hdst_markers),c(\"gene\",\"lfc\",\"p.value\",\"ClusterName\"))\n",
422 | "pthres=0.1\n",
423 | "tag=\"\""
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "#hdst genes\n",
433 | "hdst_markers=fread(\"DE/mob_region_de_unmodgtf.csv\")\n",
434 | "setnames(hdst_markers,names(hdst_markers),c(\"gene\",\"lfc\",\"p.value\",\"ClusterName\"))\n",
435 | "pthres=0.05\n",
436 | "tag=\"_unmodgtf\""
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "hdst_aba=merge(hdst_markers[lfc>1.5& p.value<pthres],aba_layer_spec[`fold-change`>1.5],by=\"gene\",all=FALSE) # inner join of filtered HDST markers with filtered ABA genes; NOTE(review): the text between p.value and >1.5 was missing in this copy and is reconstructed from pthres / aba_layer_spec set in the cells above - confirm the comparison operator and the ABA column name"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "#remove ONL because it doesn't exist in ABA\n",
455 | "hdst_aba=hdst_aba[!ClusterName==\"Olfactory Nerve Layer (ONL)\"]\n",
456 | "#combine some layers to match ABA annotation\n",
457 | "hdst_aba[ClusterName%in%c(\"Granule Cell Layer External (GCL-E)\",\"Granule Cell Layer Internal (GCL-I)\",\"Rostral Migratory System (RMS)\",\"Ependymal Cell Zone (E)\"),ClusterName:=\"Granule Layer (GR)\",]"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {},
464 | "outputs": [],
465 | "source": [
466 | "hdst_aba_enrich=test_loc_enrichment(hdst_aba,sub_col = \"gene\")\n",
467 | "hdst_aba_enrich[,p.adjust:=p.adjust(p.value,method = \"BY\"),]"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "write.table(hdst_aba_enrich,paste0(\"results/aba_deg_enrich\",tag,\".tsv\"),sep=\"\\t\", quote=FALSE,row.names=FALSE)"
477 | ]
478 | }
479 | ],
480 | "metadata": {
481 | "kernelspec": {
482 | "display_name": "R",
483 | "language": "R",
484 | "name": "ir"
485 | },
486 | "language_info": {
487 | "codemirror_mode": "r",
488 | "file_extension": ".r",
489 | "mimetype": "text/x-r-source",
490 | "name": "R",
491 | "pygments_lexer": "r",
492 | "version": "3.5.0"
493 | }
494 | },
495 | "nbformat": 4,
496 | "nbformat_minor": 2
497 | }
498 |
--------------------------------------------------------------------------------
/files.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klarman-cell-observatory/hdst/1dc0578c6b8539bc4ec704ade6a2fe7165321bc5/files.png
--------------------------------------------------------------------------------
/hdst.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/klarman-cell-observatory/hdst/1dc0578c6b8539bc4ec704ade6a2fe7165321bc5/hdst.png
--------------------------------------------------------------------------------
/pre_processing/MOB-00-ABA Gene retrieval via API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Differential search via ABA API"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "from allensdk.api.queries.ontologies_api import OntologiesApi, StructureTree"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Get MOB structure IDs from Ontology Structure Graph"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "oapi = OntologiesApi()\n",
34 | "structure_graph = oapi.get_structures_with_sets([1]) # 1 is the id of the adult mouse structure graph\n",
35 | "\n",
36 | "# This removes some unused fields returned by the query\n",
37 | "structure_graph = StructureTree.clean_structures(structure_graph) \n",
38 | "tree = StructureTree(structure_graph)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "[{'acronym': 'MOBgl',\n",
50 | " 'rgb_triplet': [130, 199, 174],\n",
51 | " 'graph_id': 1,\n",
52 | " 'graph_order': 381,\n",
53 | " 'id': 212,\n",
54 | " 'name': 'Main olfactory bulb, glomerular layer',\n",
55 | " 'structure_id_path': [997, 8, 567, 688, 695, 698, 507, 212],\n",
56 | " 'structure_set_ids': [10, 12]},\n",
57 | " {'acronym': 'MOBgr',\n",
58 | " 'rgb_triplet': [130, 199, 174],\n",
59 | " 'graph_id': 1,\n",
60 | " 'graph_order': 382,\n",
61 | " 'id': 220,\n",
62 | " 'name': 'Main olfactory bulb, granule layer',\n",
63 | " 'structure_id_path': [997, 8, 567, 688, 695, 698, 507, 220],\n",
64 | " 'structure_set_ids': [10, 12]}]"
65 | ]
66 | },
67 | "execution_count": 3,
68 | "metadata": {},
69 | "output_type": "execute_result"
70 | }
71 | ],
72 | "source": [
73 | "mob = tree.get_structures_by_name(['Main olfactory bulb'])\n",
74 | "mob_structures = tree.children([mob[0]['id']])[0]\n",
75 | "\n",
76 | "mob_structures[:2]"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "## Functions for differential gene expression search"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "def retrieve_ABA(url, start_row=0, num_rows=2000):\n",
93 | "    '''Retrieves an ABA query result as CSV page by page and returns all pages as one DataFrame (empty if the first page has no rows)'''\n",
94 | "    pages = []\n",
95 | "    while True:\n",
96 | "        # url carries %(start_row)s / %(num_rows)s placeholders that are filled in for each page\n",
97 | "        page = pd.read_csv(url % {'start_row': start_row, 'num_rows': num_rows})\n",
98 | "        if len(page) == 0:\n",
99 | "            break  # a page without rows marks the end of the result set\n",
100 | "        pages.append(page)\n",
101 | "        start_row += num_rows\n",
102 | "    if not pages:\n",
103 | "        # pd.concat([]) raises ValueError, so hand back the empty first page in the same shape\n",
104 | "        return page.reset_index()\n",
105 | "    return pd.concat(pages).reset_index()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 5,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "from allensdk.api.queries.connected_services import ConnectedServices\n",
115 | "\n",
116 | "def build_differential_search_url(target_structures, contrast_structures, threshold=1):\n",
117 | "    '''Builds the CSV URL of an ABA mouse_differential search (target vs. contrast structures) with paging placeholders left unfilled'''\n",
118 | "    # For parameters see: http://help.brain-map.org/display/api/Connected+Services+and+Pipes#ConnectedServicesandPipes-service::mouse_differential\n",
119 | "    query_args = {'set': 'mouse_coronal',\n",
120 | "                  'structures2': target_structures,\n",
121 | "                  'structures1': contrast_structures,\n",
122 | "                  'threshold2': [threshold, 50],\n",
123 | "                  'threshold1': [0, 50],\n",
124 | "                  # both paging values stay as %-placeholders; retrieve_ABA fills them in for every page\n",
125 | "                  'start_row': '%(start_row)s',\n",
126 | "                  'num_rows': '%(num_rows)s'}\n",
127 | "    json_url = ConnectedServices().build_url('mouse_differential', kwargs=query_args)\n",
128 | "    return json_url.replace('query.json', 'query.csv')"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "## Test if URL construction and download works"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 6,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "name": "stdout",
145 | "output_type": "stream",
146 | "text": [
147 | "http://api.brain-map.org/api/v2/data/query.csv?q=service::mouse_differential[num_rows$eq%(num_rows)s][set$eqmouse_coronal][start_row$eq%(start_row)s][structures1$eq220,228,236,244][structures2$eq212][threshold1$eq0,50][threshold2$eq1,50]\n"
148 | ]
149 | },
150 | {
151 | "data": {
152 | "text/html": [
153 | "\n",
154 | "\n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " | \n",
171 | " index | \n",
172 | " id | \n",
173 | " name | \n",
174 | " gene-id | \n",
175 | " gene-symbol | \n",
176 | " gene-name | \n",
177 | " entrez-id | \n",
178 | " chromosome | \n",
179 | " plane-of-section | \n",
180 | " specimen-id | \n",
181 | " fold-change | \n",
182 | " target-sum | \n",
183 | " contrast-sum | \n",
184 | " num-target-samples | \n",
185 | " num-contrast-samples | \n",
186 | "
\n",
187 | " \n",
188 | " \n",
189 | " \n",
190 | " 0 | \n",
191 | " 0 | \n",
192 | " 73520993 | \n",
193 | " RP_051101_02_G12 | \n",
194 | " 88550 | \n",
195 | " Kctd12 | \n",
196 | " potassium channel tetramerisation domain conta... | \n",
197 | " 239217 | \n",
198 | " 14 | \n",
199 | " coronal | \n",
200 | " NaN | \n",
201 | " 4.234 | \n",
202 | " 963.874 | \n",
203 | " 1020.211 | \n",
204 | " 83 | \n",
205 | " 372 | \n",
206 | "
\n",
207 | " \n",
208 | " 1 | \n",
209 | " 1 | \n",
210 | " 74357573 | \n",
211 | " RP_050915_03_C03 | \n",
212 | " 88861 | \n",
213 | " Tspan18 | \n",
214 | " tetraspanin 18 | \n",
215 | " 241556 | \n",
216 | " 2 | \n",
217 | " coronal | \n",
218 | " NaN | \n",
219 | " 4.183 | \n",
220 | " 142.255 | \n",
221 | " 156.017 | \n",
222 | " 80 | \n",
223 | " 367 | \n",
224 | "
\n",
225 | " \n",
226 | " 2 | \n",
227 | " 2 | \n",
228 | " 72008121 | \n",
229 | " RP_051017_01_B10 | \n",
230 | " 14038 | \n",
231 | " Fmo1 | \n",
232 | " flavin containing monooxygenase 1 | \n",
233 | " 14261 | \n",
234 | " 1 | \n",
235 | " coronal | \n",
236 | " NaN | \n",
237 | " 4.041 | \n",
238 | " 211.137 | \n",
239 | " 234.167 | \n",
240 | " 83 | \n",
241 | " 372 | \n",
242 | "
\n",
243 | " \n",
244 | " 3 | \n",
245 | " 3 | \n",
246 | " 74512017 | \n",
247 | " RP_060220_03_D03 | \n",
248 | " 83946 | \n",
249 | " Phldb2 | \n",
250 | " pleckstrin homology-like domain, family B, mem... | \n",
251 | " 208177 | \n",
252 | " 16 | \n",
253 | " coronal | \n",
254 | " NaN | \n",
255 | " 3.971 | \n",
256 | " 222.445 | \n",
257 | " 251.422 | \n",
258 | " 82 | \n",
259 | " 368 | \n",
260 | "
\n",
261 | " \n",
262 | " 4 | \n",
263 | " 4 | \n",
264 | " 73929578 | \n",
265 | " RP_050927_03_H11 | \n",
266 | " 21585 | \n",
267 | " Thbs2 | \n",
268 | " thrombospondin 2 | \n",
269 | " 21826 | \n",
270 | " 17 | \n",
271 | " coronal | \n",
272 | " NaN | \n",
273 | " 3.922 | \n",
274 | " 175.864 | \n",
275 | " 200.959 | \n",
276 | " 83 | \n",
277 | " 372 | \n",
278 | "
\n",
279 | " \n",
280 | "
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " index id name gene-id gene-symbol \\\n",
285 | "0 0 73520993 RP_051101_02_G12 88550 Kctd12 \n",
286 | "1 1 74357573 RP_050915_03_C03 88861 Tspan18 \n",
287 | "2 2 72008121 RP_051017_01_B10 14038 Fmo1 \n",
288 | "3 3 74512017 RP_060220_03_D03 83946 Phldb2 \n",
289 | "4 4 73929578 RP_050927_03_H11 21585 Thbs2 \n",
290 | "\n",
291 | " gene-name entrez-id chromosome \\\n",
292 | "0 potassium channel tetramerisation domain conta... 239217 14 \n",
293 | "1 tetraspanin 18 241556 2 \n",
294 | "2 flavin containing monooxygenase 1 14261 1 \n",
295 | "3 pleckstrin homology-like domain, family B, mem... 208177 16 \n",
296 | "4 thrombospondin 2 21826 17 \n",
297 | "\n",
298 | " plane-of-section specimen-id fold-change target-sum contrast-sum \\\n",
299 | "0 coronal NaN 4.234 963.874 1020.211 \n",
300 | "1 coronal NaN 4.183 142.255 156.017 \n",
301 | "2 coronal NaN 4.041 211.137 234.167 \n",
302 | "3 coronal NaN 3.971 222.445 251.422 \n",
303 | "4 coronal NaN 3.922 175.864 200.959 \n",
304 | "\n",
305 | " num-target-samples num-contrast-samples \n",
306 | "0 83 372 \n",
307 | "1 80 367 \n",
308 | "2 83 372 \n",
309 | "3 82 368 \n",
310 | "4 83 372 "
311 | ]
312 | },
313 | "execution_count": 6,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "mob_url = build_differential_search_url(mob_structures[0]['id'], [x['id'] for x in mob_structures[1:]])\n",
320 | "print(mob_url)\n",
321 | "mob_df = retrieve_ABA(mob_url)\n",
322 | "mob_df.head()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 7,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "http://mouse.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,structure_sets[id$eq2],rma::options[only$eq'id'],pipe::list[xstructures$eq'id'],service::differential_rows[set$eq'P56coronal'][domain1$eq'220,228,236,244'][domain1_threshold$eq'0,50'][domain2$eq'212'][domain2_threshold$eq'1,50'][start_row$eq%(start_row)s][num_rows$eq%(num_rows)s]\n"
335 | ]
336 | },
337 | {
338 | "data": {
339 | "text/html": [
340 | "\n",
341 | "\n",
354 | "
\n",
355 | " \n",
356 | " \n",
357 | " | \n",
358 | " index | \n",
359 | " id | \n",
360 | " name | \n",
361 | " gene-id | \n",
362 | " gene-symbol | \n",
363 | " gene-name | \n",
364 | " entrez-id | \n",
365 | " chromosome | \n",
366 | " plane-of-section | \n",
367 | " specimen-id | \n",
368 | " fold-change | \n",
369 | " target-sum | \n",
370 | " contrast-sum | \n",
371 | " num-target-samples | \n",
372 | " num-contrast-samples | \n",
373 | "
\n",
374 | " \n",
375 | " \n",
376 | " \n",
377 | " 0 | \n",
378 | " 0 | \n",
379 | " 73520993 | \n",
380 | " RP_051101_02_G12 | \n",
381 | " 88550 | \n",
382 | " Kctd12 | \n",
383 | " potassium channel tetramerisation domain conta... | \n",
384 | " 239217 | \n",
385 | " 14 | \n",
386 | " coronal | \n",
387 | " NaN | \n",
388 | " 4.234 | \n",
389 | " 963.874 | \n",
390 | " 1020.211 | \n",
391 | " 83 | \n",
392 | " 372 | \n",
393 | "
\n",
394 | " \n",
395 | " 1 | \n",
396 | " 1 | \n",
397 | " 74357573 | \n",
398 | " RP_050915_03_C03 | \n",
399 | " 88861 | \n",
400 | " Tspan18 | \n",
401 | " tetraspanin 18 | \n",
402 | " 241556 | \n",
403 | " 2 | \n",
404 | " coronal | \n",
405 | " NaN | \n",
406 | " 4.183 | \n",
407 | " 142.255 | \n",
408 | " 156.017 | \n",
409 | " 80 | \n",
410 | " 367 | \n",
411 | "
\n",
412 | " \n",
413 | " 2 | \n",
414 | " 2 | \n",
415 | " 72008121 | \n",
416 | " RP_051017_01_B10 | \n",
417 | " 14038 | \n",
418 | " Fmo1 | \n",
419 | " flavin containing monooxygenase 1 | \n",
420 | " 14261 | \n",
421 | " 1 | \n",
422 | " coronal | \n",
423 | " NaN | \n",
424 | " 4.041 | \n",
425 | " 211.137 | \n",
426 | " 234.167 | \n",
427 | " 83 | \n",
428 | " 372 | \n",
429 | "
\n",
430 | " \n",
431 | " 3 | \n",
432 | " 3 | \n",
433 | " 74512017 | \n",
434 | " RP_060220_03_D03 | \n",
435 | " 83946 | \n",
436 | " Phldb2 | \n",
437 | " pleckstrin homology-like domain, family B, mem... | \n",
438 | " 208177 | \n",
439 | " 16 | \n",
440 | " coronal | \n",
441 | " NaN | \n",
442 | " 3.971 | \n",
443 | " 222.445 | \n",
444 | " 251.422 | \n",
445 | " 82 | \n",
446 | " 368 | \n",
447 | "
\n",
448 | " \n",
449 | " 4 | \n",
450 | " 4 | \n",
451 | " 73929578 | \n",
452 | " RP_050927_03_H11 | \n",
453 | " 21585 | \n",
454 | " Thbs2 | \n",
455 | " thrombospondin 2 | \n",
456 | " 21826 | \n",
457 | " 17 | \n",
458 | " coronal | \n",
459 | " NaN | \n",
460 | " 3.922 | \n",
461 | " 175.864 | \n",
462 | " 200.959 | \n",
463 | " 83 | \n",
464 | " 372 | \n",
465 | "
\n",
466 | " \n",
467 | "
\n",
468 | "
"
469 | ],
470 | "text/plain": [
471 | " index id name gene-id gene-symbol \\\n",
472 | "0 0 73520993 RP_051101_02_G12 88550 Kctd12 \n",
473 | "1 1 74357573 RP_050915_03_C03 88861 Tspan18 \n",
474 | "2 2 72008121 RP_051017_01_B10 14038 Fmo1 \n",
475 | "3 3 74512017 RP_060220_03_D03 83946 Phldb2 \n",
476 | "4 4 73929578 RP_050927_03_H11 21585 Thbs2 \n",
477 | "\n",
478 | " gene-name entrez-id chromosome \\\n",
479 | "0 potassium channel tetramerisation domain conta... 239217 14 \n",
480 | "1 tetraspanin 18 241556 2 \n",
481 | "2 flavin containing monooxygenase 1 14261 1 \n",
482 | "3 pleckstrin homology-like domain, family B, mem... 208177 16 \n",
483 | "4 thrombospondin 2 21826 17 \n",
484 | "\n",
485 | " plane-of-section specimen-id fold-change target-sum contrast-sum \\\n",
486 | "0 coronal NaN 4.234 963.874 1020.211 \n",
487 | "1 coronal NaN 4.183 142.255 156.017 \n",
488 | "2 coronal NaN 4.041 211.137 234.167 \n",
489 | "3 coronal NaN 3.971 222.445 251.422 \n",
490 | "4 coronal NaN 3.922 175.864 200.959 \n",
491 | "\n",
492 | " num-target-samples num-contrast-samples \n",
493 | "0 83 372 \n",
494 | "1 80 367 \n",
495 | "2 83 372 \n",
496 | "3 82 368 \n",
497 | "4 83 372 "
498 | ]
499 | },
500 | "execution_count": 7,
501 | "metadata": {},
502 | "output_type": "execute_result"
503 | }
504 | ],
505 | "source": [
506 | "from urllib.parse import unquote\n",
507 | "\n",
508 | "# hand-crafted URL from ABA website\n",
509 | "test_url = 'http://mouse.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,structure_sets%5Bid$eq2%5D,rma::options%5Bonly$eq%27id%27%5D,pipe::list%5Bxstructures$eq%27id%27%5D,service::differential_rows%5Bset$eq%27P56coronal%27%5D%5Bdomain1$eq%27220,228,236,244%27%5D%5Bdomain1_threshold$eq%270,50%27%5D%5Bdomain2$eq%27212%27%5D%5Bdomain2_threshold$eq%271,50%27%5D%5Bstart_row$eq%(start_row)s%5D%5Bnum_rows$eq%(num_rows)s%5D'\n",
510 | "test_url = unquote(test_url)\n",
511 | "print(test_url)\n",
512 | "\n",
513 | "mob_manual = retrieve_ABA(test_url)\n",
514 | "mob_manual.head()"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 8,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "assert mob_df.equals(mob_manual)"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "## Perform one-vs-all differential searches for MOB regions"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 9,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "mob_dfs = {}\n",
540 | "\n",
541 | "for region in mob_structures:\n",
542 | " name = region['acronym']\n",
543 | " _id = region['id']\n",
544 | " url = build_differential_search_url(_id, \n",
545 | " [x['id'] for x in mob_structures if x['id'] != _id])\n",
546 | " mob_dfs[name] = retrieve_ABA(url)"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 10,
552 | "metadata": {},
553 | "outputs": [],
554 | "source": [
555 | "for k, v in mob_dfs.items():\n",
556 | " v.to_csv(k + '.csv', index=False)"
557 | ]
558 | }
559 | ],
560 | "metadata": {
561 | "kernelspec": {
562 | "display_name": "Python 3",
563 | "language": "python",
564 | "name": "python3"
565 | },
566 | "language_info": {
567 | "codemirror_mode": {
568 | "name": "ipython",
569 | "version": 3
570 | },
571 | "file_extension": ".py",
572 | "mimetype": "text/x-python",
573 | "name": "python",
574 | "nbconvert_exporter": "python",
575 | "pygments_lexer": "ipython3",
576 | "version": "3.6.6"
577 | }
578 | },
579 | "nbformat": 4,
580 | "nbformat_minor": 2
581 | }
582 |
--------------------------------------------------------------------------------
/pre_processing/pre-processing_external.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "library(data.table)"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "setwd(project_directory) # set wd to project directory containing all the sub folder"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Allen Brain Atlas (ABA) data"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "#### ABA layer specific genes"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "ABA_genes=readLines(\"ext_data/aba_mob-genes_raw.txt\")\n",
42 | "head(ABA_genes)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "ABA_genes_dt=rbindlist(lapply(ABA_genes,function(x){\n",
52 | " spl=unlist(strsplit(x,\",\"))\n",
53 | " g=spl[-c(1)];n=spl[1]\n",
54 | " dt=data.table(layer=n,gene=g)\n",
55 | " return(dt)}\n",
56 | " )\n",
57 | " )\n",
58 | "head(ABA_genes_dt)\n",
59 | "write.table(ABA_genes_dt,\"ext_data/aba_mob-genes.txt\",sep=\"\\t\",quote = FALSE,row.names=FALSE)"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "#### ABA differential genes"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "ABA_diff_files=list.files(path = \"ext_data/\",\"aba_MOB.*.csv\",full.names = TRUE)\n",
76 | "head(ABA_diff_files)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "ABA_diff_dt=data.table()\n",
86 | "for (diff_file in ABA_diff_files){\n",
87 | " tab=fread(diff_file)\n",
88 | " tab[,layer:=unlist(strsplit(diff_file,\"MOB|\\\\.\"))[2]]\n",
89 | " ABA_diff_dt=rbindlist(list(ABA_diff_dt,tab))\n",
90 | "}\n",
91 | "setnames(ABA_diff_dt,\"gene-symbol\",\"gene\")\n",
92 | "head(ABA_diff_dt)\n",
93 | "write.table(ABA_diff_dt,\"ext_data/aba_MOB_diff.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### Mouse olfactory bulb (MOB) single-cell RNA-seq data (Linnarsson)"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "Calculating the normalized expression values (relative frequencies) for each relevant cell type for the MOB data"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 19,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "li_mean=fread(\"ext_data/li_mean_expr.tsv\")"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 20,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/html": [
127 | "27998"
128 | ],
129 | "text/latex": [
130 | "27998"
131 | ],
132 | "text/markdown": [
133 | "27998"
134 | ],
135 | "text/plain": [
136 | "[1] 27998"
137 | ]
138 | },
139 | "metadata": {},
140 | "output_type": "display_data"
141 | }
142 | ],
143 | "source": [
144 | "nrow(li_mean)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 21,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "data": {
154 | "text/html": [
155 | "56"
156 | ],
157 | "text/latex": [
158 | "56"
159 | ],
160 | "text/markdown": [
161 | "56"
162 | ],
163 | "text/plain": [
164 | "[1] 56"
165 | ]
166 | },
167 | "metadata": {},
168 | "output_type": "display_data"
169 | }
170 | ],
171 | "source": [
172 | "#there are duplicated gene names in the mean expression matrix, but we don't know why, so we leave them in\n",
173 | "dupl_genes=unique(li_mean$V1[duplicated(li_mean$V1)])\n",
174 | "length(dupl_genes)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 24,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "li_norm=melt(li_mean[,c(grep(\"^OB\",names(li_mean)),grep(\"Neuron\",names(li_mean),invert = TRUE)),with=FALSE],id.vars = \"V1\",variable.name = \"ClusterName\")"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 25,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "li_norm[,norm:=value/sum(value),by=\"ClusterName\"]\n",
193 | "li_norm[,log_norm:=log(norm),]\n",
194 | "setnames(li_norm,\"V1\",\"gene\")"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 28,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/html": [
205 | "63"
206 | ],
207 | "text/latex": [
208 | "63"
209 | ],
210 | "text/markdown": [
211 | "63"
212 | ],
213 | "text/plain": [
214 | "[1] 63"
215 | ]
216 | },
217 | "metadata": {},
218 | "output_type": "display_data"
219 | }
220 | ],
221 | "source": [
222 | "N_ct=length(unique(li_norm$ClusterName))\n",
223 | "N_ct"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 29,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/html": [
234 | "5848"
235 | ],
236 | "text/latex": [
237 | "5848"
238 | ],
239 | "text/markdown": [
240 | "5848"
241 | ],
242 | "text/plain": [
243 | "[1] 5848"
244 | ]
245 | },
246 | "metadata": {},
247 | "output_type": "display_data"
248 | }
249 | ],
250 | "source": [
251 | "#some genes have 0 expression for all of the cell types.\n",
252 | "li_norm[,N_zero_ct:=length(unique(ClusterName[norm==0])),by=gene]\n",
253 | "length(unique(li_norm[N_zero_ct==N_ct]$gene))"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 31,
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "data": {
263 | "text/html": [
264 | "22085"
265 | ],
266 | "text/latex": [
267 | "22085"
268 | ],
269 | "text/markdown": [
270 | "22085"
271 | ],
272 | "text/plain": [
273 | "[1] 22085"
274 | ]
275 | },
276 | "metadata": {},
277 | "output_type": "display_data"
278 | }
279 | ],
280 | "source": [
281 | "#number of genes that are expressed in at least one cell type\n",
282 | "Ngenes=length(unique(li_norm[N_zero_ct!=N_ct]$gene))\n",
283 | "Ngenes"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 13,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "write.table(li_norm,\"results/li_norm.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE) #data as published"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### Triple-negative breast cancer (TNBC) single-cell RNA-seq data \n",
300 | "https://www.nature.com/articles/s41467-018-06052-0#Sec24 \n",
301 | "GSE118389: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE118389"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "Calculating the normalized expression values (relative frequencies) for each relevant cell type for the breast cancer data"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 3,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "name": "stderr",
318 | "output_type": "stream",
319 | "text": [
320 | "Warning message in fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_counts_rsem.txt\")):\n",
321 | "\"Detected 1534 column names but the data has 1535 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.\""
322 | ]
323 | },
324 | {
325 | "data": {
326 | "text/html": [
327 | "\n",
328 | "V1 | PT089_P1_A01 | PT089_P1_A02 | PT089_P1_A03 | PT089_P1_A04 | PT089_P1_A05 | PT089_P1_A06 | PT089_P1_A07 | PT089_P1_A08 | PT089_P1_A09 | ... | PT039_P10_H03_S279 | PT039_P10_H04_S280 | PT039_P10_H05_S281 | PT039_P10_H06_S282 | PT039_P10_H07_S283 | PT039_P10_H08_S284 | PT039_P10_H09_S285 | PT039_P10_H10_S286 | PT039_P10_H11_S287 | PT039_P10_H12_S288 |
\n",
329 | "\n",
330 | "\tZXDC | 1.76 | 5.06 | 4.86 | 2.57 | 7.48 | 7.26 | 12.10 | 2.61 | 7.59 | ... | 0.00 | 0 | 0.00 | 0 | 1275.78 | 2.03 | 0.00 | 0.00 | 0.00 | 85.87 |
\n",
331 | "\tZYG11A | 4.73 | 111.84 | 1.26 | 0.00 | 1.42 | 7.26 | 12.36 | 4.72 | 13.45 | ... | 0.00 | 0 | 0.00 | 0 | 1.02 | 2.18 | 3.85 | 0.00 | 0.00 | 6.15 |
\n",
332 | "\tZYG11B | 7.86 | 2.14 | 4.43 | 1.77 | 2.43 | 0.00 | 7.56 | 2.24 | 6.63 | ... | 3.35 | 0 | 1.91 | 0 | 4.12 | 1.74 | 1.02 | 1.12 | 3.72 | 23.32 |
\n",
333 | "\tZYX | 0.00 | 0.00 | 1.00 | 0.00 | 937.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 3.00 | 143.00 | 0.00 | 0.00 | 85.00 |
\n",
334 | "\tZZEF1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 6.00 | 0.00 | 0.00 | 27.00 |
\n",
335 | "\tZZZ3 | 0.00 | 0.00 | 0.00 | 2.00 | 0.00 | 2006.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 23.00 |
\n",
336 | "\n",
337 | "
\n"
338 | ],
339 | "text/latex": [
340 | "\\begin{tabular}{r|lllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllllll}\n",
341 | " V1 & PT089\\_P1\\_A01 & PT089\\_P1\\_A02 & PT089\\_P1\\_A03 & PT089\\_P1\\_A04 & PT089\\_P1\\_A05 & PT089\\_P1\\_A06 & PT089\\_P1\\_A07 & PT089\\_P1\\_A08 & PT089\\_P1\\_A09 & ... & PT039\\_P10\\_H03\\_S279 & PT039\\_P10\\_H04\\_S280 & PT039\\_P10\\_H05\\_S281 & PT039\\_P10\\_H06\\_S282 & PT039\\_P10\\_H07\\_S283 & PT039\\_P10\\_H08\\_S284 & PT039\\_P10\\_H09\\_S285 & PT039\\_P10\\_H10\\_S286 & PT039\\_P10\\_H11\\_S287 & PT039\\_P10\\_H12\\_S288\\\\\n",
342 | "\\hline\n",
343 | "\t ZXDC & 1.76 & 5.06 & 4.86 & 2.57 & 7.48 & 7.26 & 12.10 & 2.61 & 7.59 & ... & 0.00 & 0 & 0.00 & 0 & 1275.78 & 2.03 & 0.00 & 0.00 & 0.00 & 85.87 \\\\\n",
344 | "\t ZYG11A & 4.73 & 111.84 & 1.26 & 0.00 & 1.42 & 7.26 & 12.36 & 4.72 & 13.45 & ... & 0.00 & 0 & 0.00 & 0 & 1.02 & 2.18 & 3.85 & 0.00 & 0.00 & 6.15 \\\\\n",
345 | "\t ZYG11B & 7.86 & 2.14 & 4.43 & 1.77 & 2.43 & 0.00 & 7.56 & 2.24 & 6.63 & ... & 3.35 & 0 & 1.91 & 0 & 4.12 & 1.74 & 1.02 & 1.12 & 3.72 & 23.32 \\\\\n",
346 | "\t ZYX & 0.00 & 0.00 & 1.00 & 0.00 & 937.00 & 0.00 & 0.00 & 0.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 3.00 & 143.00 & 0.00 & 0.00 & 85.00 \\\\\n",
347 | "\t ZZEF1 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 7.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 0.00 & 6.00 & 0.00 & 0.00 & 27.00 \\\\\n",
348 | "\t ZZZ3 & 0.00 & 0.00 & 0.00 & 2.00 & 0.00 & 2006.00 & 0.00 & 0.00 & 0.00 & ... & 0.00 & 0 & 0.00 & 0 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 23.00 \\\\\n",
349 | "\\end{tabular}\n"
350 | ],
351 | "text/markdown": [
352 | "\n",
353 | "| V1 | PT089_P1_A01 | PT089_P1_A02 | PT089_P1_A03 | PT089_P1_A04 | PT089_P1_A05 | PT089_P1_A06 | PT089_P1_A07 | PT089_P1_A08 | PT089_P1_A09 | ... | PT039_P10_H03_S279 | PT039_P10_H04_S280 | PT039_P10_H05_S281 | PT039_P10_H06_S282 | PT039_P10_H07_S283 | PT039_P10_H08_S284 | PT039_P10_H09_S285 | PT039_P10_H10_S286 | PT039_P10_H11_S287 | PT039_P10_H12_S288 |\n",
354 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
355 | "| ZXDC | 1.76 | 5.06 | 4.86 | 2.57 | 7.48 | 7.26 | 12.10 | 2.61 | 7.59 | ... | 0.00 | 0 | 0.00 | 0 | 1275.78 | 2.03 | 0.00 | 0.00 | 0.00 | 85.87 |\n",
356 | "| ZYG11A | 4.73 | 111.84 | 1.26 | 0.00 | 1.42 | 7.26 | 12.36 | 4.72 | 13.45 | ... | 0.00 | 0 | 0.00 | 0 | 1.02 | 2.18 | 3.85 | 0.00 | 0.00 | 6.15 |\n",
357 | "| ZYG11B | 7.86 | 2.14 | 4.43 | 1.77 | 2.43 | 0.00 | 7.56 | 2.24 | 6.63 | ... | 3.35 | 0 | 1.91 | 0 | 4.12 | 1.74 | 1.02 | 1.12 | 3.72 | 23.32 |\n",
358 | "| ZYX | 0.00 | 0.00 | 1.00 | 0.00 | 937.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 3.00 | 143.00 | 0.00 | 0.00 | 85.00 |\n",
359 | "| ZZEF1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 6.00 | 0.00 | 0.00 | 27.00 |\n",
360 | "| ZZZ3 | 0.00 | 0.00 | 0.00 | 2.00 | 0.00 | 2006.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 23.00 |\n",
361 | "\n"
362 | ],
363 | "text/plain": [
364 | " V1 PT089_P1_A01 PT089_P1_A02 PT089_P1_A03 PT089_P1_A04 PT089_P1_A05\n",
365 | "1 ZXDC 1.76 5.06 4.86 2.57 7.48 \n",
366 | "2 ZYG11A 4.73 111.84 1.26 0.00 1.42 \n",
367 | "3 ZYG11B 7.86 2.14 4.43 1.77 2.43 \n",
368 | "4 ZYX 0.00 0.00 1.00 0.00 937.00 \n",
369 | "5 ZZEF1 0.00 0.00 0.00 0.00 0.00 \n",
370 | "6 ZZZ3 0.00 0.00 0.00 2.00 0.00 \n",
371 | " PT089_P1_A06 PT089_P1_A07 PT089_P1_A08 PT089_P1_A09 ... PT039_P10_H03_S279\n",
372 | "1 7.26 12.10 2.61 7.59 ... 0.00 \n",
373 | "2 7.26 12.36 4.72 13.45 ... 0.00 \n",
374 | "3 0.00 7.56 2.24 6.63 ... 3.35 \n",
375 | "4 0.00 0.00 0.00 0.00 ... 0.00 \n",
376 | "5 0.00 0.00 7.00 0.00 ... 0.00 \n",
377 | "6 2006.00 0.00 0.00 0.00 ... 0.00 \n",
378 | " PT039_P10_H04_S280 PT039_P10_H05_S281 PT039_P10_H06_S282 PT039_P10_H07_S283\n",
379 | "1 0 0.00 0 1275.78 \n",
380 | "2 0 0.00 0 1.02 \n",
381 | "3 0 1.91 0 4.12 \n",
382 | "4 0 0.00 0 0.00 \n",
383 | "5 0 0.00 0 0.00 \n",
384 | "6 0 0.00 0 0.00 \n",
385 | " PT039_P10_H08_S284 PT039_P10_H09_S285 PT039_P10_H10_S286 PT039_P10_H11_S287\n",
386 | "1 2.03 0.00 0.00 0.00 \n",
387 | "2 2.18 3.85 0.00 0.00 \n",
388 | "3 1.74 1.02 1.12 3.72 \n",
389 | "4 3.00 143.00 0.00 0.00 \n",
390 | "5 0.00 6.00 0.00 0.00 \n",
391 | "6 0.00 0.00 0.00 0.00 \n",
392 | " PT039_P10_H12_S288\n",
393 | "1 85.87 \n",
394 | "2 6.15 \n",
395 | "3 23.32 \n",
396 | "4 85.00 \n",
397 | "5 27.00 \n",
398 | "6 23.00 "
399 | ]
400 | },
401 | "metadata": {},
402 | "output_type": "display_data"
403 | },
404 | {
405 | "data": {
406 | "text/html": [
407 | "\n",
408 | "\t- 21785
\n",
409 | "\t- 1535
\n",
410 | "
\n"
411 | ],
412 | "text/latex": [
413 | "\\begin{enumerate*}\n",
414 | "\\item 21785\n",
415 | "\\item 1535\n",
416 | "\\end{enumerate*}\n"
417 | ],
418 | "text/markdown": [
419 | "1. 21785\n",
420 | "2. 1535\n",
421 | "\n",
422 | "\n"
423 | ],
424 | "text/plain": [
425 | "[1] 21785 1535"
426 | ]
427 | },
428 | "metadata": {},
429 | "output_type": "display_data"
430 | },
431 | {
432 | "data": {
433 | "text/html": [
434 | "0"
435 | ],
436 | "text/latex": [
437 | "0"
438 | ],
439 | "text/markdown": [
440 | "0"
441 | ],
442 | "text/plain": [
443 | "[1] 0"
444 | ]
445 | },
446 | "metadata": {},
447 | "output_type": "display_data"
448 | }
449 | ],
450 | "source": [
451 | "#data\n",
452 | "TNBC_data=fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_counts_rsem.txt\"))\n",
453 | "tail(TNBC_data)\n",
454 | "dim(TNBC_data)\n",
455 | "sum(duplicated(TNBC_data$V1)) #check for duplicated gene names - there are none"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 4,
461 | "metadata": {},
462 | "outputs": [
463 | {
464 | "data": {
465 | "text/html": [
466 | "\n",
467 | "V1 | V2 |
\n",
468 | "\n",
469 | "\tPT089_P1_A01 | epithelial |
\n",
470 | "\tPT089_P1_A02 | epithelial |
\n",
471 | "\tPT089_P1_A03 | epithelial |
\n",
472 | "\tPT089_P1_A04 | macrophage |
\n",
473 | "\tPT089_P1_A05 | macrophage |
\n",
474 | "\tPT089_P1_A06 | epithelial |
\n",
475 | "\n",
476 | "
\n"
477 | ],
478 | "text/latex": [
479 | "\\begin{tabular}{r|ll}\n",
480 | " V1 & V2\\\\\n",
481 | "\\hline\n",
482 | "\t PT089\\_P1\\_A01 & epithelial \\\\\n",
483 | "\t PT089\\_P1\\_A02 & epithelial \\\\\n",
484 | "\t PT089\\_P1\\_A03 & epithelial \\\\\n",
485 | "\t PT089\\_P1\\_A04 & macrophage \\\\\n",
486 | "\t PT089\\_P1\\_A05 & macrophage \\\\\n",
487 | "\t PT089\\_P1\\_A06 & epithelial \\\\\n",
488 | "\\end{tabular}\n"
489 | ],
490 | "text/markdown": [
491 | "\n",
492 | "| V1 | V2 |\n",
493 | "|---|---|\n",
494 | "| PT089_P1_A01 | epithelial |\n",
495 | "| PT089_P1_A02 | epithelial |\n",
496 | "| PT089_P1_A03 | epithelial |\n",
497 | "| PT089_P1_A04 | macrophage |\n",
498 | "| PT089_P1_A05 | macrophage |\n",
499 | "| PT089_P1_A06 | epithelial |\n",
500 | "\n"
501 | ],
502 | "text/plain": [
503 | " V1 V2 \n",
504 | "1 PT089_P1_A01 epithelial\n",
505 | "2 PT089_P1_A02 epithelial\n",
506 | "3 PT089_P1_A03 epithelial\n",
507 | "4 PT089_P1_A04 macrophage\n",
508 | "5 PT089_P1_A05 macrophage\n",
509 | "6 PT089_P1_A06 epithelial"
510 | ]
511 | },
512 | "metadata": {},
513 | "output_type": "display_data"
514 | },
515 | {
516 | "data": {
517 | "text/html": [
518 | "\n",
519 | "\t- 1112
\n",
520 | "\t- 2
\n",
521 | "
\n"
522 | ],
523 | "text/latex": [
524 | "\\begin{enumerate*}\n",
525 | "\\item 1112\n",
526 | "\\item 2\n",
527 | "\\end{enumerate*}\n"
528 | ],
529 | "text/markdown": [
530 | "1. 1112\n",
531 | "2. 2\n",
532 | "\n",
533 | "\n"
534 | ],
535 | "text/plain": [
536 | "[1] 1112 2"
537 | ]
538 | },
539 | "metadata": {},
540 | "output_type": "display_data"
541 | }
542 | ],
543 | "source": [
544 | "#annotation\n",
545 | "TNBC_annot=fread(file.path(\"ext_data/GSE118389_tnbc/GSE118389_cell_annot.tsv\"))\n",
546 | "TNBC_annot[,V1:=as.character(V1),]\n",
547 | "TNBC_annot[,V2:=as.character(V2),]\n",
548 | "head(TNBC_annot)\n",
549 | "dim(TNBC_annot)"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 5,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "name": "stderr",
559 | "output_type": "stream",
560 | "text": [
561 | "Warning message in melt.data.table(TNBC_data, id.vars = \"V1\"):\n",
562 | "\"'measure.vars' [PT089_P1_A01, PT089_P1_A02, PT089_P1_A03, PT089_P1_A04, ...] are not all of the same type. By order of hierarchy, the molten data value column will be of type 'double'. All measure variables not of type 'double' will be coerced too. Check DETAILS in ?melt.data.table for more on coercion.\""
563 | ]
564 | },
565 | {
566 | "data": {
567 | "text/html": [
568 | "\n",
569 | "V1 | variable | value |
\n",
570 | "\n",
571 | "\tA1BG | PT089_P1_A01 | 0.00 |
\n",
572 | "\tA1BG-AS1 | PT089_P1_A01 | 0.00 |
\n",
573 | "\tA1CF | PT089_P1_A01 | 0.00 |
\n",
574 | "\tA2M | PT089_P1_A01 | 0.00 |
\n",
575 | "\tA2M-AS1 | PT089_P1_A01 | 0.00 |
\n",
576 | "\tA2ML1 | PT089_P1_A01 | 1.08 |
\n",
577 | "\n",
578 | "
\n"
579 | ],
580 | "text/latex": [
581 | "\\begin{tabular}{r|lll}\n",
582 | " V1 & variable & value\\\\\n",
583 | "\\hline\n",
584 | "\t A1BG & PT089\\_P1\\_A01 & 0.00 \\\\\n",
585 | "\t A1BG-AS1 & PT089\\_P1\\_A01 & 0.00 \\\\\n",
586 | "\t A1CF & PT089\\_P1\\_A01 & 0.00 \\\\\n",
587 | "\t A2M & PT089\\_P1\\_A01 & 0.00 \\\\\n",
588 | "\t A2M-AS1 & PT089\\_P1\\_A01 & 0.00 \\\\\n",
589 | "\t A2ML1 & PT089\\_P1\\_A01 & 1.08 \\\\\n",
590 | "\\end{tabular}\n"
591 | ],
592 | "text/markdown": [
593 | "\n",
594 | "| V1 | variable | value |\n",
595 | "|---|---|---|\n",
596 | "| A1BG | PT089_P1_A01 | 0.00 |\n",
597 | "| A1BG-AS1 | PT089_P1_A01 | 0.00 |\n",
598 | "| A1CF | PT089_P1_A01 | 0.00 |\n",
599 | "| A2M | PT089_P1_A01 | 0.00 |\n",
600 | "| A2M-AS1 | PT089_P1_A01 | 0.00 |\n",
601 | "| A2ML1 | PT089_P1_A01 | 1.08 |\n",
602 | "\n"
603 | ],
604 | "text/plain": [
605 | " V1 variable value\n",
606 | "1 A1BG PT089_P1_A01 0.00 \n",
607 | "2 A1BG-AS1 PT089_P1_A01 0.00 \n",
608 | "3 A1CF PT089_P1_A01 0.00 \n",
609 | "4 A2M PT089_P1_A01 0.00 \n",
610 | "5 A2M-AS1 PT089_P1_A01 0.00 \n",
611 | "6 A2ML1 PT089_P1_A01 1.08 "
612 | ]
613 | },
614 | "metadata": {},
615 | "output_type": "display_data"
616 | }
617 | ],
618 | "source": [
619 | "TNBC_data_long=melt(TNBC_data,id.vars = \"V1\")\n",
620 | "head(TNBC_data_long)"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 6,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "TNBC_data_long=merge(TNBC_data_long,TNBC_annot,by.x = \"variable\",by.y=\"V1\")"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 7,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "setnames(TNBC_data_long,\"V2\",\"ClusterName\")"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 8,
644 | "metadata": {},
645 | "outputs": [
646 | {
647 | "data": {
648 | "text/html": [
649 | "\n",
650 | "V1 | ClusterName | value | N |
\n",
651 | "\n",
652 | "\tA1BG | epithelial | 3.25911290 | 868 |
\n",
653 | "\tA1BG-AS1 | epithelial | 4.61316820 | 868 |
\n",
654 | "\tA1CF | epithelial | 0.06591014 | 868 |
\n",
655 | "\tA2M | epithelial | 633.41102535 | 868 |
\n",
656 | "\tA2M-AS1 | epithelial | 1.97836406 | 868 |
\n",
657 | "\tA2ML1 | epithelial | 20.12418203 | 868 |
\n",
658 | "\n",
659 | "
\n"
660 | ],
661 | "text/latex": [
662 | "\\begin{tabular}{r|llll}\n",
663 | " V1 & ClusterName & value & N\\\\\n",
664 | "\\hline\n",
665 | "\t A1BG & epithelial & 3.25911290 & 868 \\\\\n",
666 | "\t A1BG-AS1 & epithelial & 4.61316820 & 868 \\\\\n",
667 | "\t A1CF & epithelial & 0.06591014 & 868 \\\\\n",
668 | "\t A2M & epithelial & 633.41102535 & 868 \\\\\n",
669 | "\t A2M-AS1 & epithelial & 1.97836406 & 868 \\\\\n",
670 | "\t A2ML1 & epithelial & 20.12418203 & 868 \\\\\n",
671 | "\\end{tabular}\n"
672 | ],
673 | "text/markdown": [
674 | "\n",
675 | "| V1 | ClusterName | value | N |\n",
676 | "|---|---|---|---|\n",
677 | "| A1BG | epithelial | 3.25911290 | 868 |\n",
678 | "| A1BG-AS1 | epithelial | 4.61316820 | 868 |\n",
679 | "| A1CF | epithelial | 0.06591014 | 868 |\n",
680 | "| A2M | epithelial | 633.41102535 | 868 |\n",
681 | "| A2M-AS1 | epithelial | 1.97836406 | 868 |\n",
682 | "| A2ML1 | epithelial | 20.12418203 | 868 |\n",
683 | "\n"
684 | ],
685 | "text/plain": [
686 | " V1 ClusterName value N \n",
687 | "1 A1BG epithelial 3.25911290 868\n",
688 | "2 A1BG-AS1 epithelial 4.61316820 868\n",
689 | "3 A1CF epithelial 0.06591014 868\n",
690 | "4 A2M epithelial 633.41102535 868\n",
691 | "5 A2M-AS1 epithelial 1.97836406 868\n",
692 | "6 A2ML1 epithelial 20.12418203 868"
693 | ]
694 | },
695 | "metadata": {},
696 | "output_type": "display_data"
697 | }
698 | ],
699 | "source": [
700 | "tnbc_norm=TNBC_data_long[,.(value=mean(value),N=.N),by=c(\"V1\",\"ClusterName\")]\n",
701 | "head(tnbc_norm)"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": 11,
707 | "metadata": {},
708 | "outputs": [
709 | {
710 | "data": {
711 | "text/html": [
712 | "\n",
713 | "gene | ClusterName | value | N | norm | log_norm |
\n",
714 | "\n",
715 | "\tZXDC | Tcell | 20.228302 | 53 | 3.017484e-05 | -10.408502 |
\n",
716 | "\tZYG11A | Tcell | 3.791887 | 53 | 5.656409e-06 | -12.082721 |
\n",
717 | "\tZYG11B | Tcell | 20.612264 | 53 | 3.074760e-05 | -10.389699 |
\n",
718 | "\tZYX | Tcell | 124.452830 | 53 | 1.856480e-04 | -8.591658 |
\n",
719 | "\tZZEF1 | Tcell | 91.452830 | 53 | 1.364214e-04 | -8.899762 |
\n",
720 | "\tZZZ3 | Tcell | 55.037736 | 53 | 8.210054e-05 | -9.407566 |
\n",
721 | "\n",
722 | "
\n"
723 | ],
724 | "text/latex": [
725 | "\\begin{tabular}{r|llllll}\n",
726 | " gene & ClusterName & value & N & norm & log\\_norm\\\\\n",
727 | "\\hline\n",
728 | "\t ZXDC & Tcell & 20.228302 & 53 & 3.017484e-05 & -10.408502 \\\\\n",
729 | "\t ZYG11A & Tcell & 3.791887 & 53 & 5.656409e-06 & -12.082721 \\\\\n",
730 | "\t ZYG11B & Tcell & 20.612264 & 53 & 3.074760e-05 & -10.389699 \\\\\n",
731 | "\t ZYX & Tcell & 124.452830 & 53 & 1.856480e-04 & -8.591658 \\\\\n",
732 | "\t ZZEF1 & Tcell & 91.452830 & 53 & 1.364214e-04 & -8.899762 \\\\\n",
733 | "\t ZZZ3 & Tcell & 55.037736 & 53 & 8.210054e-05 & -9.407566 \\\\\n",
734 | "\\end{tabular}\n"
735 | ],
736 | "text/markdown": [
737 | "\n",
738 | "| gene | ClusterName | value | N | norm | log_norm |\n",
739 | "|---|---|---|---|---|---|\n",
740 | "| ZXDC | Tcell | 20.228302 | 53 | 3.017484e-05 | -10.408502 |\n",
741 | "| ZYG11A | Tcell | 3.791887 | 53 | 5.656409e-06 | -12.082721 |\n",
742 | "| ZYG11B | Tcell | 20.612264 | 53 | 3.074760e-05 | -10.389699 |\n",
743 | "| ZYX | Tcell | 124.452830 | 53 | 1.856480e-04 | -8.591658 |\n",
744 | "| ZZEF1 | Tcell | 91.452830 | 53 | 1.364214e-04 | -8.899762 |\n",
745 | "| ZZZ3 | Tcell | 55.037736 | 53 | 8.210054e-05 | -9.407566 |\n",
746 | "\n"
747 | ],
748 | "text/plain": [
749 | " gene ClusterName value N norm log_norm \n",
750 | "1 ZXDC Tcell 20.228302 53 3.017484e-05 -10.408502\n",
751 | "2 ZYG11A Tcell 3.791887 53 5.656409e-06 -12.082721\n",
752 | "3 ZYG11B Tcell 20.612264 53 3.074760e-05 -10.389699\n",
753 | "4 ZYX Tcell 124.452830 53 1.856480e-04 -8.591658\n",
754 | "5 ZZEF1 Tcell 91.452830 53 1.364214e-04 -8.899762\n",
755 | "6 ZZZ3 Tcell 55.037736 53 8.210054e-05 -9.407566"
756 | ]
757 | },
758 | "metadata": {},
759 | "output_type": "display_data"
760 | }
761 | ],
762 | "source": [
763 | "tnbc_norm[,norm:=value/sum(value),by=\"ClusterName\"]\n",
764 | "tnbc_norm[,log_norm:=log(norm),]\n",
765 | "setnames(tnbc_norm,\"V1\",\"gene\")\n",
766 | "tail(tnbc_norm)"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": 12,
772 | "metadata": {},
773 | "outputs": [
774 | {
775 | "data": {
776 | "text/html": [
777 | "6"
778 | ],
779 | "text/latex": [
780 | "6"
781 | ],
782 | "text/markdown": [
783 | "6"
784 | ],
785 | "text/plain": [
786 | "[1] 6"
787 | ]
788 | },
789 | "metadata": {},
790 | "output_type": "display_data"
791 | }
792 | ],
793 | "source": [
794 | "N_ct=length(unique(tnbc_norm$ClusterName))\n",
795 | "N_ct"
796 | ]
797 | },
798 | {
799 | "cell_type": "code",
800 | "execution_count": 13,
801 | "metadata": {},
802 | "outputs": [],
803 | "source": [
804 | "tnbc_norm[,N_zero_ct:=length(unique(ClusterName[norm==0])),by=gene]"
805 | ]
806 | },
807 | {
808 | "cell_type": "code",
809 | "execution_count": 14,
810 | "metadata": {},
811 | "outputs": [
812 | {
813 | "data": {
814 | "text/html": [
815 | "19595"
816 | ],
817 | "text/latex": [
818 | "19595"
819 | ],
820 | "text/markdown": [
821 | "19595"
822 | ],
823 | "text/plain": [
824 | "[1] 19595"
825 | ]
826 | },
827 | "metadata": {},
828 | "output_type": "display_data"
829 | }
830 | ],
831 | "source": [
832 | "Ngenes=length(unique(tnbc_norm[N_zero_ct= 530){\n",
49 | " y=y-529\n",
50 | " }\n",
51 | " return(paste0(x,\"x\",y))\n",
52 | " }\n",
53 | " if (file.exists(paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"))){\n",
54 | " dat_long_filt_annot_red=fread(paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"))\n",
55 | " pl=ggplot(dat_long_filt_annot_red[sample(1:nrow(dat_long_filt_annot_red),40000)],aes(y=spot_px_y,x=spot_px_x))+geom_point(size=0.5)+coord_fixed()\n",
56 | " png(paste0(dir,\"/\",sample,\"_filtered_corrected.png\"),height=500,width=700)\n",
57 | " print(pl)\n",
58 | " dev.off()\n",
59 | " return(dat_long_filt_annot_red)\n",
60 | " }\n",
61 | " \n",
62 | " dat=fread(paste0(dir,\"/\",sample,\"_filtered.tsv.gz\"))\n",
63 | " sel_bc=fread(paste0(dir,\"/\",sub(\"_unmodgtf|_modgtf\",\"\",sample),\"_barcodes_under_tissue.tsv\"))\n",
64 | " if (any(grepl(\"ENS\",dat$V1))){\n",
65 | " message(\"Running in transpose mode\")\n",
66 | " chunks=unique(c(seq(from=2,to=ncol(dat),by=1000),ncol(dat)))\n",
67 | " steps=as.data.table(cbind(from=chunks[-length(chunks)],to=chunks[-1]-1))\n",
68 | " steps[nrow(steps),to:=to+1,]\n",
69 | " \n",
70 | " dat_long=data.table()\n",
71 | " for (i in 1:nrow(steps)){\n",
72 | " cat(paste0(i,\" \"))\n",
73 | " from=steps[i]$from\n",
74 | " to=steps[i]$to\n",
75 | " dat_long_tmp=melt(dat[,c(1,from:to),with=FALSE],id.vars=c(\"V1\"),variable.name = \"bc_old\")\n",
76 | " dat_long_tmp[,bc:=adjust.y(bc_old[1]),by=bc_old]\n",
77 | " dat_long=rbindlist(list(dat_long,dat_long_tmp[value>0]))\n",
78 | " }\n",
79 | " setnames(dat_long,c(\"V1\",\"value\"),c(\"ensGV\",\"count\"))\n",
80 | " setcolorder(dat_long,c(\"bc_old\",\"bc\",\"ensGV\",\"count\"))\n",
81 | "\n",
82 | " }else{\n",
83 | " dat[,V2:=adjust.y(V1),by=1:nrow(dat)]\n",
84 | " chunks=unique(c(seq(from=1,to=nrow(dat),by=1000),nrow(dat)))\n",
85 | " steps=as.data.table(cbind(from=chunks[-length(chunks)],to=chunks[-1]-1))\n",
86 | " steps[nrow(steps),to:=to+1,]\n",
87 | " \n",
88 | " dat_long=data.table()\n",
89 | " for (i in 1:nrow(steps)){\n",
90 | " cat(paste0(i,\" \"))\n",
91 | " from=steps[i]$from\n",
92 | " to=steps[i]$to\n",
93 | " dat_long_tmp=melt(dat[from:to,],id.vars=c(\"V1\",\"V2\"))\n",
94 | " dat_long=rbindlist(list(dat_long,dat_long_tmp[value>0]))\n",
95 | " }\n",
96 | " setnames(dat_long,names(dat_long),c(\"bc_old\",\"bc\",\"ensGV\",\"count\"))\n",
97 | " }\n",
98 | " \n",
99 | " dat_long_filt=dat_long[!grepl(\"\\\\+\",ensGV)]\n",
100 | " \n",
101 | " dat_long_filt[,ensG:=unlist(strsplit(as.character(ensGV),\"\\\\.\"))[1],by=1:nrow(dat_long_filt)]\n",
102 | " dat_long_filt[,x:=as.numeric(unlist(strsplit(as.character(bc),\"x\"))[1]),by=1:nrow(dat_long_filt)]\n",
103 | " dat_long_filt[,y:=as.numeric(unlist(strsplit(as.character(bc),\"x\"))[2]),by=1:nrow(dat_long_filt)]\n",
104 | " \n",
105 | " dat_long_filt_annot=merge(dat_long_filt,genes,by=\"ensG\",all.x=TRUE)\n",
106 | " dat_long_filt_annot_red=merge(dat_long_filt_annot,sel_bc,by.x=c(\"x\",\"y\"),by.y=c(\"spot_x\",\"spot_y\"))\n",
107 | " \n",
108 | " write.table(dat_long_filt_annot,paste0(dir,\"/\",sample,\"_filtered_red.tsv\"),sep=\"\\t\",row.names=FALSE,quote=FALSE)\n",
109 | " write.table(dat_long_filt_annot_red,paste0(dir,\"/\",sample,\"_filtered_red_ut.tsv\"),sep=\"\\t\",row.names=FALSE,quote=FALSE)\n",
110 | " if (plot == TRUE){\n",
111 | " pl=ggplot(dat_long_filt_annot_red[sample(1:nrow(dat_long_filt_annot_red),40000)],aes(y=spot_px_y,x=spot_px_x))+geom_point(size=0.5)+coord_fixed()\n",
112 | " png(paste0(dir,\"/\",sample,\"_filtered_corrected.png\"),height=500,width=700)\n",
113 | " print(pl)\n",
114 | " dev.off()\n",
115 | " }\n",
116 | " return(dat_long_filt_annot_red)\n",
117 | "}"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "##### MOB data"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "genes=fread(\"ext_data/ensemble_gene_names_V94.txt\")\n",
134 | "setnames(genes,names(genes),c(\"ensGV\",\"ensG\",\"gene\"))"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 22,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "dat=prep_hdst_data(\"MOB\",\"CN13_D2\",genes,TRUE)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 21,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "dat=prep_hdst_data(\"MOB\",\"CN24_D1\",genes,TRUE)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "dat=prep_hdst_data(\"MOB\",\"CN24_E1\",genes,TRUE)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "dat=prep_hdst_data(\"MOB_nc\",\"CN13_D2_unmodgtf\",genes,TRUE)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "dat=prep_hdst_data(\"MOB_nc\",\"CN24_D1_unmodgtf\",genes,TRUE)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "dat=prep_hdst_data(\"MOB_nc\",\"CN24_E1_unmodgtf\",genes,TRUE)"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "##### Breast cancer data"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "genes=fread(\"ext_data/ensemble_gene_names_human_V96.txt\")\n",
205 | "setnames(genes,names(genes),c(\"ensGV\",\"ensG\",\"gene\"))"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "dat=prep_hdst_data(\"BC\",\"CN21_BC24350_E2\",genes,TRUE)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "# dir is given without a trailing slash: prep_hdst_data joins dir and sample with \"/\" itself\n",
224 | "dat=prep_hdst_data(\"BC_nc\",\"CN21_BC24350_E2_unmodgtf\",genes,TRUE)"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "### Process segmented HDST data"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "# for MOB (standard gtf)\n",
240 | "# dat is the table written by prep_hdst_data(\"MOB\",\"CN13_D2\",...), i.e. <dir>/<tag>_filtered_red_ut.tsv\n",
241 | "tag=\"CN13_D2\"\n",
242 | "dir=\"MOB\"\n",
243 | "dat=fread(paste0(dir,\"/\",tag,\"_filtered_red_ut.tsv\"))\n",
244 | "seg=fread(\"MOB/CellID_Spot_Position_CN13_D2_filtered_red_ut.csv\")"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# for MOB (nc gtf), sample CN13_D2\n",
253 | "dat=fread(\"MOB_nc/CN13_D2_unmodgtf_filtered_red_ut.tsv\")\n",
254 | "seg=fread(\"MOB/CellID_Spot_Position_CN13_D2_filtered_red_ut.csv\")\n",
255 | "tag=\"CN13_D2_unmodgtf\"\n",
256 | "dir=\"MOB_nc\""
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 10,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# for MOB (nc gtf), sample CN24_D1\n",
266 | "dat=fread(\"MOB_nc/CN24_D1_unmodgtf_filtered_red_ut.tsv\")\n",
267 | "seg=fread(\"MOB_nc/CellID_Spot_Position_CN24_D1_unmodgtf_filtered_red_ut_flipped.csv\")\n",
268 | "tag=\"CN24_D1_unmodgtf\"\n",
269 | "dir=\"MOB_nc\""
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 14,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "# for MOB (nc gtf), sample CN24_E1\n",
279 | "dat=fread(\"MOB_nc/CN24_E1_unmodgtf_filtered_red_ut.tsv\")\n",
280 | "seg=fread(\"MOB_nc/CellID_Spot_Position_CN24_E1_unmodgtf_filtered_red_ut_flipped.csv\")\n",
281 | "tag=\"CN24_E1_unmodgtf\"\n",
282 | "dir=\"MOB_nc\""
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "# for BC (nc gtf)\n",
292 | "dat=fread(\"BC_nc/CN21_BC24350_E2_unmodgtf_filtered_red_ut.tsv\")\n",
293 | "seg=fread(\"BC/CellID_Spot_Position_CN21_E2_filtered_red_ut_BC_flipped.csv\")\n",
294 | "tag=\"CN21_BC24350_E2_unmodgtf\"\n",
295 | "dir=\"BC_nc\" #previously stored in BC"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "#### Run the cells below after executing exactly one of the input cells above"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 15,
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/html": [
313 | "\n",
314 | "bc | x | y | ensG | bc_old | ensGV.x | count | ensGV.y | gene | spot_px_y | spot_px_x | cell_id | x_centroid | y_centroid | N_bc |
\n",
315 | "\n",
316 | "\t1000x187 | 1000 | 187 | ENSMUSG00000002985 | 1000x716 | ENSMUSG00000002985.16 | 1 | ENSMUSG00000002985.16 | Apoe | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
317 | "\t1000x187 | 1000 | 187 | ENSMUSG00000020193 | 1000x716 | ENSMUSG00000020193.3 | 1 | ENSMUSG00000020193.3 | Zpbp | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
318 | "\t1000x187 | 1000 | 187 | ENSMUSG00000020483 | 1000x716 | ENSMUSG00000020483.14 | 1 | ENSMUSG00000020483.14 | Dynll2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
319 | "\t1000x187 | 1000 | 187 | ENSMUSG00000025907 | 1000x716 | ENSMUSG00000025907.14 | 1 | ENSMUSG00000025907.14 | Rb1cc1 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
320 | "\t1000x187 | 1000 | 187 | ENSMUSG00000029635 | 1000x716 | ENSMUSG00000029635.15 | 1 | ENSMUSG00000029635.15 | Cdk8 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
321 | "\t1000x187 | 1000 | 187 | ENSMUSG00000035202 | 1000x716 | ENSMUSG00000035202.7 | 1 | ENSMUSG00000035202.8 | Lars2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |
\n",
322 | "\n",
323 | "
\n"
324 | ],
325 | "text/latex": [
326 | "\\begin{tabular}{r|lllllllllllllll}\n",
327 | " bc & x & y & ensG & bc\\_old & ensGV.x & count & ensGV.y & gene & spot\\_px\\_y & spot\\_px\\_x & cell\\_id & x\\_centroid & y\\_centroid & N\\_bc\\\\\n",
328 | "\\hline\n",
329 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000002985 & 1000x716 & ENSMUSG00000002985.16 & 1 & ENSMUSG00000002985.16 & Apoe & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
330 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000020193 & 1000x716 & ENSMUSG00000020193.3 & 1 & ENSMUSG00000020193.3 & Zpbp & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
331 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000020483 & 1000x716 & ENSMUSG00000020483.14 & 1 & ENSMUSG00000020483.14 & Dynll2 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
332 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000025907 & 1000x716 & ENSMUSG00000025907.14 & 1 & ENSMUSG00000025907.14 & Rb1cc1 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
333 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000029635 & 1000x716 & ENSMUSG00000029635.15 & 1 & ENSMUSG00000029635.15 & Cdk8 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
334 | "\t 1000x187 & 1000 & 187 & ENSMUSG00000035202 & 1000x716 & ENSMUSG00000035202.7 & 1 & ENSMUSG00000035202.8 & Lars2 & 2295 & 10077 & 25146 & 10067.29 & 5306.808 & 10 \\\\\n",
335 | "\\end{tabular}\n"
336 | ],
337 | "text/markdown": [
338 | "\n",
339 | "| bc | x | y | ensG | bc_old | ensGV.x | count | ensGV.y | gene | spot_px_y | spot_px_x | cell_id | x_centroid | y_centroid | N_bc |\n",
340 | "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n",
341 | "| 1000x187 | 1000 | 187 | ENSMUSG00000002985 | 1000x716 | ENSMUSG00000002985.16 | 1 | ENSMUSG00000002985.16 | Apoe | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
342 | "| 1000x187 | 1000 | 187 | ENSMUSG00000020193 | 1000x716 | ENSMUSG00000020193.3 | 1 | ENSMUSG00000020193.3 | Zpbp | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
343 | "| 1000x187 | 1000 | 187 | ENSMUSG00000020483 | 1000x716 | ENSMUSG00000020483.14 | 1 | ENSMUSG00000020483.14 | Dynll2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
344 | "| 1000x187 | 1000 | 187 | ENSMUSG00000025907 | 1000x716 | ENSMUSG00000025907.14 | 1 | ENSMUSG00000025907.14 | Rb1cc1 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
345 | "| 1000x187 | 1000 | 187 | ENSMUSG00000029635 | 1000x716 | ENSMUSG00000029635.15 | 1 | ENSMUSG00000029635.15 | Cdk8 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
346 | "| 1000x187 | 1000 | 187 | ENSMUSG00000035202 | 1000x716 | ENSMUSG00000035202.7 | 1 | ENSMUSG00000035202.8 | Lars2 | 2295 | 10077 | 25146 | 10067.29 | 5306.808 | 10 |\n",
347 | "\n"
348 | ],
349 | "text/plain": [
350 | " bc x y ensG bc_old ensGV.x count\n",
351 | "1 1000x187 1000 187 ENSMUSG00000002985 1000x716 ENSMUSG00000002985.16 1 \n",
352 | "2 1000x187 1000 187 ENSMUSG00000020193 1000x716 ENSMUSG00000020193.3 1 \n",
353 | "3 1000x187 1000 187 ENSMUSG00000020483 1000x716 ENSMUSG00000020483.14 1 \n",
354 | "4 1000x187 1000 187 ENSMUSG00000025907 1000x716 ENSMUSG00000025907.14 1 \n",
355 | "5 1000x187 1000 187 ENSMUSG00000029635 1000x716 ENSMUSG00000029635.15 1 \n",
356 | "6 1000x187 1000 187 ENSMUSG00000035202 1000x716 ENSMUSG00000035202.7 1 \n",
357 | " ensGV.y gene spot_px_y spot_px_x cell_id x_centroid\n",
358 | "1 ENSMUSG00000002985.16 Apoe 2295 10077 25146 10067.29 \n",
359 | "2 ENSMUSG00000020193.3 Zpbp 2295 10077 25146 10067.29 \n",
360 | "3 ENSMUSG00000020483.14 Dynll2 2295 10077 25146 10067.29 \n",
361 | "4 ENSMUSG00000025907.14 Rb1cc1 2295 10077 25146 10067.29 \n",
362 | "5 ENSMUSG00000029635.15 Cdk8 2295 10077 25146 10067.29 \n",
363 | "6 ENSMUSG00000035202.8 Lars2 2295 10077 25146 10067.29 \n",
364 | " y_centroid N_bc\n",
365 | "1 5306.808 10 \n",
366 | "2 5306.808 10 \n",
367 | "3 5306.808 10 \n",
368 | "4 5306.808 10 \n",
369 | "5 5306.808 10 \n",
370 | "6 5306.808 10 "
371 | ]
372 | },
373 | "metadata": {},
374 | "output_type": "display_data"
375 | },
376 | {
377 | "data": {
378 | "text/html": [
379 | "856994"
380 | ],
381 | "text/latex": [
382 | "856994"
383 | ],
384 | "text/markdown": [
385 | "856994"
386 | ],
387 | "text/plain": [
388 | "[1] 856994"
389 | ]
390 | },
391 | "metadata": {},
392 | "output_type": "display_data"
393 | }
394 | ],
395 | "source": [
396 | "# attach the segmentation result to every count row; rows of seg with cell_id 0 are dropped before the join\n",
397 | "seg_assigned=seg[cell_id!=0]\n",
398 | "dat_seg_pre=merge(dat,seg_assigned,by=\"bc\")\n",
399 | "# give every cell_id a single x/y (those of its first row) and store its number of distinct barcodes in N_bc\n",
400 | "dat_seg_pre[,`:=`(x=x[1],y=y[1],N_bc=uniqueN(bc)),by=\"cell_id\"]\n",
401 | "head(dat_seg_pre)\n",
402 | "nrow(dat_seg_pre)"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 16,
405 | "metadata": {},
406 | "outputs": [
407 | {
408 | "data": {
409 | "text/html": [
410 | "\n",
411 | "cell_id | x_centroid | y_centroid | N_bc | gene | x | y | bc | count | spot_px_y | spot_px_x |
\n",
412 | "\n",
413 | "\t25146 | 10067.29 | 5306.808 | 10 | Apoe | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |
\n",
414 | "\t25146 | 10067.29 | 5306.808 | 10 | Zpbp | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |
\n",
415 | "\t25146 | 10067.29 | 5306.808 | 10 | Dynll2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |
\n",
416 | "\t25146 | 10067.29 | 5306.808 | 10 | Rb1cc1 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |
\n",
417 | "\t25146 | 10067.29 | 5306.808 | 10 | Cdk8 | 1000 | 187 | 1000x187 996x186 999x189 | 3 | 2295 | 10077 |
\n",
418 | "\t25146 | 10067.29 | 5306.808 | 10 | Lars2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |
\n",
419 | "\n",
420 | "
\n"
421 | ],
422 | "text/latex": [
423 | "\\begin{tabular}{r|lllllllllll}\n",
424 | " cell\\_id & x\\_centroid & y\\_centroid & N\\_bc & gene & x & y & bc & count & spot\\_px\\_y & spot\\_px\\_x\\\\\n",
425 | "\\hline\n",
426 | "\t 25146 & 10067.29 & 5306.808 & 10 & Apoe & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n",
427 | "\t 25146 & 10067.29 & 5306.808 & 10 & Zpbp & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n",
428 | "\t 25146 & 10067.29 & 5306.808 & 10 & Dynll2 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n",
429 | "\t 25146 & 10067.29 & 5306.808 & 10 & Rb1cc1 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n",
430 | "\t 25146 & 10067.29 & 5306.808 & 10 & Cdk8 & 1000 & 187 & 1000x187 996x186 999x189 & 3 & 2295 & 10077 \\\\\n",
431 | "\t 25146 & 10067.29 & 5306.808 & 10 & Lars2 & 1000 & 187 & 1000x187 & 1 & 2295 & 10077 \\\\\n",
432 | "\\end{tabular}\n"
433 | ],
434 | "text/markdown": [
435 | "\n",
436 | "| cell_id | x_centroid | y_centroid | N_bc | gene | x | y | bc | count | spot_px_y | spot_px_x |\n",
437 | "|---|---|---|---|---|---|---|---|---|---|---|\n",
438 | "| 25146 | 10067.29 | 5306.808 | 10 | Apoe | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n",
439 | "| 25146 | 10067.29 | 5306.808 | 10 | Zpbp | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n",
440 | "| 25146 | 10067.29 | 5306.808 | 10 | Dynll2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n",
441 | "| 25146 | 10067.29 | 5306.808 | 10 | Rb1cc1 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n",
442 | "| 25146 | 10067.29 | 5306.808 | 10 | Cdk8 | 1000 | 187 | 1000x187 996x186 999x189 | 3 | 2295 | 10077 |\n",
443 | "| 25146 | 10067.29 | 5306.808 | 10 | Lars2 | 1000 | 187 | 1000x187 | 1 | 2295 | 10077 |\n",
444 | "\n"
445 | ],
446 | "text/plain": [
447 | " cell_id x_centroid y_centroid N_bc gene x y bc \n",
448 | "1 25146 10067.29 5306.808 10 Apoe 1000 187 1000x187 \n",
449 | "2 25146 10067.29 5306.808 10 Zpbp 1000 187 1000x187 \n",
450 | "3 25146 10067.29 5306.808 10 Dynll2 1000 187 1000x187 \n",
451 | "4 25146 10067.29 5306.808 10 Rb1cc1 1000 187 1000x187 \n",
452 | "5 25146 10067.29 5306.808 10 Cdk8 1000 187 1000x187 996x186 999x189\n",
453 | "6 25146 10067.29 5306.808 10 Lars2 1000 187 1000x187 \n",
454 | " count spot_px_y spot_px_x\n",
455 | "1 1 2295 10077 \n",
456 | "2 1 2295 10077 \n",
457 | "3 1 2295 10077 \n",
458 | "4 1 2295 10077 \n",
459 | "5 3 2295 10077 \n",
460 | "6 1 2295 10077 "
461 | ]
462 | },
463 | "metadata": {},
464 | "output_type": "display_data"
465 | },
466 | {
467 | "data": {
468 | "text/html": [
469 | "22229"
470 | ],
471 | "text/latex": [
472 | "22229"
473 | ],
474 | "text/markdown": [
475 | "22229"
476 | ],
477 | "text/plain": [
478 | "[1] 22229"
479 | ]
480 | },
481 | "metadata": {},
482 | "output_type": "display_data"
483 | },
484 | {
485 | "data": {
486 | "text/html": [
487 | "684443"
488 | ],
489 | "text/latex": [
490 | "684443"
491 | ],
492 | "text/markdown": [
493 | "684443"
494 | ],
495 | "text/plain": [
496 | "[1] 684443"
497 | ]
498 | },
499 | "metadata": {},
500 | "output_type": "display_data"
501 | },
502 | {
503 | "data": {
504 | "text/plain": [
505 | "\n",
506 | " 1 \n",
507 | "22229 "
508 | ]
509 | },
510 | "metadata": {},
511 | "output_type": "display_data"
512 | },
513 | {
514 | "data": {
515 | "text/plain": [
516 | "\n",
517 | " 1 \n",
518 | "22229 "
519 | ]
520 | },
521 | "metadata": {},
522 | "output_type": "display_data"
523 | }
524 | ],
525 | "source": [
526 | "# one row per cell and gene: join the contributing barcodes, add up their counts, keep the first pixel position\n",
527 | "cell_gene_keys=c(\"cell_id\",\"x_centroid\",\"y_centroid\",\"N_bc\",\"gene\",\"x\",\"y\")\n",
528 | "dat_seg=dat_seg_pre[,list(bc=paste(bc,collapse=\" \"),count=sum(count),spot_px_y=spot_px_y[1],spot_px_x=spot_px_x[1]),by=cell_gene_keys]\n",
529 | "head(dat_seg)\n",
530 | "uniqueN(dat_seg$cell_id)\n",
531 | "nrow(dat_seg)\n",
532 | "# sanity check: every x/y coordinate belongs to exactly one cell id\n",
533 | "table(dat_seg[,uniqueN(cell_id),by=c(\"x\",\"y\")]$V1)\n",
534 | "# sanity check: every cell id has exactly one x/y coordinate\n",
535 | "table(dat_seg[,uniqueN(paste0(x,\"_\",y)),by=\"cell_id\"]$V1)"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 17,
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "write.table(dat_seg,paste0(dir,\"/\",tag,\"_filtered_red_ut_segmented.tsv\"),sep=\"\\t\",quote=FALSE,row.names = FALSE)"
541 | ]
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | "### Process binned HDST data"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 2,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": [
556 | "#for MOB (standard gtf)\n",
557 | "bin_sizes=c(\"5x\",\"10x\",\"20x\",\"38x\",\"38x-thin\")\n",
558 | "dir=\"MOB_binned\""
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "#for MOB (nc gtf)\n",
568 | "bin_sizes=c(\"5x\")\n",
569 | "dir=\"MOB_binned_nc\""
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 3,
575 | "metadata": {},
576 | "outputs": [],
577 | "source": [
578 | "#for MOB (nc gtf) new sample\n",
579 | "bin_sizes=c(\"5x\")\n",
580 | "dir=\"MOB_binned_nc/E1\""
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 7,
586 | "metadata": {},
587 | "outputs": [],
588 | "source": [
589 | "#for MOB (nc gtf) new sample\n",
590 | "bin_sizes=c(\"5x\")\n",
591 | "dir=\"MOB_binned_nc/D1\""
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 2,
597 | "metadata": {},
598 | "outputs": [],
599 | "source": [
600 | "#for BC (standard gtf)\n",
601 | "bin_sizes=c(\"5x\")\n",
602 | "dir=\"BC_binned\""
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": 2,
608 | "metadata": {},
609 | "outputs": [],
610 | "source": [
611 | "#for BC (nc gtf)\n",
612 | "bin_sizes=c(\"5x\")\n",
613 | "dir=\"BC_binned_nc\""
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 3,
619 | "metadata": {},
620 | "outputs": [],
621 | "source": [
622 | "#for BC (nc gtf)\n",
623 | "bin_sizes=c(\"5x\")\n",
624 | "dir=\"BC_binned_nc/C1\""
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 7,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": [
633 | "#for BC (nc gtf)\n",
634 | "bin_sizes=c(\"5x\")\n",
635 | "dir=\"BC_binned_nc/D1\""
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 8,
641 | "metadata": {},
642 | "outputs": [
643 | {
644 | "name": "stdout",
645 | "output_type": "stream",
646 | "text": [
647 | "[1] \"BC_binned_nc/D1/hdst-breast-cancer-D1-lowres-5x.csv.gz\"\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "# Read the binned count matrix of every entry of bin_sizes from dir and stack them into one long table\n",
653 | "# (columns x, y, gene, count, bin, bc); only entries with count > 0 are kept.\n",
654 | "long_tables=list()\n",
655 | "for (bin_size in bin_sizes){\n",
656 | "    # the dot in front of csv is escaped so that it is matched literally\n",
657 | "    dat_file=list.files(path = dir,pattern = paste0(\"hdst.*lowres-\",bin_size,\"\\\\.csv.*\"),full.names = TRUE)\n",
658 | "    bins_file=list.files(path = dir,pattern = paste0(\"hdst.*lowres-\",bin_size,\"-bins\\\\.csv.*\"),full.names = TRUE)\n",
659 | "    # fread needs a single path, so fail with a readable message if the patterns match no or several files\n",
660 | "    if (length(dat_file)!=1 || length(bins_file)!=1){\n",
661 | "        stop(\"Expected exactly one count file and one bins file for bin size \",bin_size,\" in \",dir,\", found \",length(dat_file),\" and \",length(bins_file))\n",
662 | "    }\n",
663 | "    print(dat_file)\n",
664 | "    dat=fread(dat_file)\n",
665 | "    bins=fread(bins_file)\n",
666 | "    # the bins file supplies the x/y coordinate of every row of the count matrix\n",
667 | "    dat[,x:=bins$x,]\n",
668 | "    dat[,y:=bins$y,]\n",
669 | "    dat_long=melt(dat,id.vars = c(\"x\",\"y\"),variable.name = \"gene\",value.name = \"count\")\n",
670 | "    dat_long=dat_long[count>0]\n",
671 | "    # the loop variable is not called bin, so it cannot be confused with the column created here\n",
672 | "    dat_long[,bin:=bin_size,]\n",
673 | "    dat_long[,bc:=paste0(x,\"_\",y),]\n",
674 | "    long_tables[[length(long_tables)+1]]=dat_long\n",
675 | "}\n",
676 | "# bind once at the end instead of copying the growing table in every iteration\n",
677 | "dat_all_bin=rbindlist(long_tables)"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 9,
672 | "metadata": {},
673 | "outputs": [
674 | {
675 | "data": {
676 | "text/html": [
677 | "\n",
678 | "x | y | gene | count | bin | bc |
\n",
679 | "\n",
680 | "\t241 | 67 | TSPAN6 | 1 | 5x | 241_67 |
\n",
681 | "\t 76 | 134 | DPM1 | 1 | 5x | 76_134 |
\n",
682 | "\t 94 | 80 | DPM1 | 1 | 5x | 94_80 |
\n",
683 | "\t142 | 155 | DPM1 | 1 | 5x | 142_155 |
\n",
684 | "\t144 | 156 | DPM1 | 1 | 5x | 144_156 |
\n",
685 | "\t185 | 16 | DPM1 | 1 | 5x | 185_16 |
\n",
686 | "\n",
687 | "
\n"
688 | ],
689 | "text/latex": [
690 | "\\begin{tabular}{r|llllll}\n",
691 | " x & y & gene & count & bin & bc\\\\\n",
692 | "\\hline\n",
693 | "\t 241 & 67 & TSPAN6 & 1 & 5x & 241\\_67 \\\\\n",
694 | "\t 76 & 134 & DPM1 & 1 & 5x & 76\\_134 \\\\\n",
695 | "\t 94 & 80 & DPM1 & 1 & 5x & 94\\_80 \\\\\n",
696 | "\t 142 & 155 & DPM1 & 1 & 5x & 142\\_155\\\\\n",
697 | "\t 144 & 156 & DPM1 & 1 & 5x & 144\\_156\\\\\n",
698 | "\t 185 & 16 & DPM1 & 1 & 5x & 185\\_16 \\\\\n",
699 | "\\end{tabular}\n"
700 | ],
701 | "text/markdown": [
702 | "\n",
703 | "| x | y | gene | count | bin | bc |\n",
704 | "|---|---|---|---|---|---|\n",
705 | "| 241 | 67 | TSPAN6 | 1 | 5x | 241_67 |\n",
706 | "| 76 | 134 | DPM1 | 1 | 5x | 76_134 |\n",
707 | "| 94 | 80 | DPM1 | 1 | 5x | 94_80 |\n",
708 | "| 142 | 155 | DPM1 | 1 | 5x | 142_155 |\n",
709 | "| 144 | 156 | DPM1 | 1 | 5x | 144_156 |\n",
710 | "| 185 | 16 | DPM1 | 1 | 5x | 185_16 |\n",
711 | "\n"
712 | ],
713 | "text/plain": [
714 | " x y gene count bin bc \n",
715 | "1 241 67 TSPAN6 1 5x 241_67 \n",
716 | "2 76 134 DPM1 1 5x 76_134 \n",
717 | "3 94 80 DPM1 1 5x 94_80 \n",
718 | "4 142 155 DPM1 1 5x 142_155\n",
719 | "5 144 156 DPM1 1 5x 144_156\n",
720 | "6 185 16 DPM1 1 5x 185_16 "
721 | ]
722 | },
723 | "metadata": {},
724 | "output_type": "display_data"
725 | }
726 | ],
727 | "source": [
728 | "head(dat_all_bin)"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": 10,
734 | "metadata": {},
735 | "outputs": [],
736 | "source": [
737 | "write.table(dat_all_bin,paste0(dir,\"/hdst-lowres.tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)"
738 | ]
739 | },
740 | {
741 | "cell_type": "markdown",
742 | "metadata": {},
743 | "source": [
744 | "### Process standard ST"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 90,
750 | "metadata": {},
751 | "outputs": [],
752 | "source": [
753 | "# Convert a wide ST count matrix into a long table of non-zero counts.\n",
754 | "#\n",
755 | "# mat: data.table whose column V1 holds the spot name in the form <x>x<y>;\n",
756 | "#      all other columns are melted into gene/count pairs.\n",
757 | "# Returns a data.table with the columns x, y (character, as split from V1), gene and count,\n",
758 | "# restricted to rows with count > 0.\n",
759 | "prep_st=function(mat){\n",
760 | "    # split all spot names in one vectorised call instead of one data.table group per row\n",
761 | "    coords=tstrsplit(mat$V1,\"x\",fixed=TRUE,keep=1:2)\n",
762 | "    # work on a copy without V1 so that the table of the caller is not changed by reference\n",
763 | "    wide=mat[,-c(\"V1\"),]\n",
764 | "    wide[,c(\"x\",\"y\"):=coords]\n",
765 | "    mat_long=melt(wide,id.vars = c(\"x\",\"y\"),variable.name = \"gene\",value.name = \"count\")\n",
766 | "    return(mat_long[count>0])\n",
767 | "}"
759 | ]
760 | },
761 | {
762 | "cell_type": "code",
763 | "execution_count": 91,
764 | "metadata": {},
765 | "outputs": [
766 | {
767 | "name": "stderr",
768 | "output_type": "stream",
769 | "text": [
770 | "Warning message in fread(\"MOB_STST/Rep4_MOB1x.csv\"):\n",
771 | "\"Detected 15941 column names but the data has 15942 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.\""
772 | ]
773 | }
774 | ],
775 | "source": [
776 | "st_dat=fread(\"MOB_STST/Rep4_MOB1x.csv\")"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 92,
782 | "metadata": {},
783 | "outputs": [],
784 | "source": [
785 | "st_long=prep_st(st_dat)"
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": 94,
791 | "metadata": {},
792 | "outputs": [
793 | {
794 | "data": {
795 | "text/html": [
796 | "\n",
797 | "x | y | gene | count |
\n",
798 | "\n",
799 | "\t16.105 | 29.003 | Mx1 | 1 |
\n",
800 | "\t16.105 | 29.003 | Cenpa | 1 |
\n",
801 | "\t16.105 | 29.003 | Snora17 | 1 |
\n",
802 | "\t16.984 | 29.022 | Nlrp5 | 1 |
\n",
803 | "\t16.984 | 29.022 | Sned1 | 1 |
\n",
804 | "\t16.984 | 29.022 | Gm933 | 1 |
\n",
805 | "\n",
806 | "
\n"
807 | ],
808 | "text/latex": [
809 | "\\begin{tabular}{r|llll}\n",
810 | " x & y & gene & count\\\\\n",
811 | "\\hline\n",
812 | "\t 16.105 & 29.003 & Mx1 & 1 \\\\\n",
813 | "\t 16.105 & 29.003 & Cenpa & 1 \\\\\n",
814 | "\t 16.105 & 29.003 & Snora17 & 1 \\\\\n",
815 | "\t 16.984 & 29.022 & Nlrp5 & 1 \\\\\n",
816 | "\t 16.984 & 29.022 & Sned1 & 1 \\\\\n",
817 | "\t 16.984 & 29.022 & Gm933 & 1 \\\\\n",
818 | "\\end{tabular}\n"
819 | ],
820 | "text/markdown": [
821 | "\n",
822 | "x | y | gene | count | \n",
823 | "|---|---|---|---|---|---|\n",
824 | "| 16.105 | 29.003 | Mx1 | 1 | \n",
825 | "| 16.105 | 29.003 | Cenpa | 1 | \n",
826 | "| 16.105 | 29.003 | Snora17 | 1 | \n",
827 | "| 16.984 | 29.022 | Nlrp5 | 1 | \n",
828 | "| 16.984 | 29.022 | Sned1 | 1 | \n",
829 | "| 16.984 | 29.022 | Gm933 | 1 | \n",
830 | "\n",
831 | "\n"
832 | ],
833 | "text/plain": [
834 | " x y gene count\n",
835 | "1 16.105 29.003 Mx1 1 \n",
836 | "2 16.105 29.003 Cenpa 1 \n",
837 | "3 16.105 29.003 Snora17 1 \n",
838 | "4 16.984 29.022 Nlrp5 1 \n",
839 | "5 16.984 29.022 Sned1 1 \n",
840 | "6 16.984 29.022 Gm933 1 "
841 | ]
842 | },
843 | "metadata": {},
844 | "output_type": "display_data"
845 | }
846 | ],
847 | "source": [
848 | "tail(st_long)"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 95,
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "write.table(st_long,\"MOB_STST/Rep4_MOB1x_long.tsv\",sep=\"\\t\",quote=FALSE,row.names=FALSE)"
858 | ]
859 | },
860 | {
861 | "cell_type": "markdown",
862 | "metadata": {},
863 | "source": [
864 | "### Annotate barcodes with anatomic features"
865 | ]
866 | },
867 | {
868 | "cell_type": "code",
869 | "execution_count": null,
870 | "metadata": {},
871 | "outputs": [],
872 | "source": [
873 | "library(data.table)\n",
874 | "library(raster)\n",
875 | "library(rgeos)\n",
876 | "library(ggplot2)"
877 | ]
878 | },
879 | {
880 | "cell_type": "code",
881 | "execution_count": null,
882 | "metadata": {},
883 | "outputs": [],
884 | "source": [
885 | "setwd(data_dir) #set wd to directory containing the polygon file as well as the barcodes under tissue file"
886 | ]
887 | },
888 | {
889 | "cell_type": "code",
890 | "execution_count": null,
891 | "metadata": {},
892 | "outputs": [],
893 | "source": [
894 | "# Annotate the barcodes of one sample with the polygon (anatomic feature) they fall into.\n",
895 | "#\n",
896 | "# Input files, relative to the working directory:\n",
897 | "#   <sample>_annotations.txt           polygons; vertices are read from column x_y (a column named polygon is\n",
898 | "#                                      renamed to x_y) as space separated x,y pairs, the label from column value\n",
899 | "#                                      (empty labels become Unknown)\n",
900 | "#   <sample>_barcodes_under_tissue.tsv barcodes with the columns spot_px_x and spot_px_y\n",
901 | "#\n",
902 | "# sample:   file name prefix of the sample\n",
903 | "# plot:     if TRUE, save <sample>_bc_annot.png with the barcodes coloured by annotation\n",
904 | "#           (the polygons themselves are always drawn with plot(pols))\n",
905 | "# max_dist: a barcode outside all polygons gets its nearest polygon only if the distance is not above max_dist\n",
906 | "#           (Inf: always use the nearest polygon)\n",
907 | "# n_chunks: if not NULL, the overlay is done on consecutive row ranges of about nrow/n_chunks barcodes\n",
908 | "# flip:     if TRUE, spot_px_y is mirrored (max+min-y) before the overlay; the returned table keeps the\n",
909 | "#           coordinates as read from the file\n",
910 | "#\n",
911 | "# Writes <sample>_barcodes_under_tissue_annot.tsv and returns that table (input columns plus point.ID,\n",
912 | "# poly.ID and poly). Barcodes that could not be annotated get the poly.ID missing.\n",
913 | "annotate_bc=function(sample,plot=TRUE,max_dist=Inf,n_chunks=NULL,flip=TRUE){\n",
914 | "    #read data\n",
915 | "    poly=fread(paste0(sample,\"_annotations.txt\"))\n",
916 | "    print(nrow(poly))\n",
917 | "    bc=fread(paste0(sample,\"_barcodes_under_tissue.tsv\"))\n",
918 | "    poly[value==\"\",value:=\"Unknown\"]\n",
919 | "    if (any(\"polygon\"%in%names(poly))){setnames(poly,\"polygon\",\"x_y\")}\n",
920 | "\n",
921 | "    #convert polygons table into list (one x/y matrix per polygon, named by its annotation)\n",
922 | "    #NOTE(review): apply() simplifies to an array when all results have the same length - confirm the\n",
923 | "    #behaviour for a single polygon or for polygons with identical vertex counts\n",
924 | "    poly_long=poly[,.(x_y=strsplit(x_y,\" \"),value),by=1:nrow(poly)]\n",
925 | "    poly_list=apply(poly_long,1,function(x){r=lapply(strsplit(unlist(x[\"x_y\"]),\",\"),as.numeric);r=lapply(r,function(x){if(length(x)==1){x=c(x,NA)};return(x)});r=t(as.data.table(r));colnames(r)=c(\"x\",\"y\");return(na.omit(r))})\n",
926 | "    names(poly_list)=poly_long$value\n",
927 | "\n",
928 | "    #make list of polygon objects\n",
929 | "    sp=rapply(poly_list, Polygon,hole=FALSE, how = \"replace\")\n",
930 | "    sp=lapply(1:length(sp), function(i) {Polygons(sp[i], as.character(i))})\n",
931 | "\n",
932 | "    #make spatial polygons object\n",
933 | "    pols=SpatialPolygons(sp)\n",
934 | "    plot(pols)\n",
935 | "\n",
936 | "    #prepare HD_ST coordinates (need to be mirrored at y-axis in some samples)\n",
937 | "    if (flip==TRUE){\n",
938 | "        print(\"Flipping y axis.\")\n",
939 | "        flipped_bc=bc[,.(spot_px_x=spot_px_x,spot_px_y=-(spot_px_y-min(spot_px_y))+(max(spot_px_y))),]\n",
940 | "    }else{\n",
941 | "        print(\"Not flipping y axis.\")\n",
942 | "        flipped_bc=bc[,.(spot_px_x=spot_px_x,spot_px_y=spot_px_y),]\n",
943 | "    }\n",
944 | "\n",
945 | "    #overlap polygons and HD_ST coordinates\n",
946 | "    nr=nrow(flipped_bc)\n",
947 | "    print(nr)\n",
948 | "\n",
949 | "    if (!is.null(n_chunks)){\n",
950 | "        #at least one row per chunk, otherwise start:end would run backwards when nr < n_chunks\n",
951 | "        chunks_size=max(1,floor(nr/n_chunks))\n",
952 | "        e=data.table()\n",
953 | "        for (i in 0:n_chunks){\n",
954 | "            start=i*chunks_size+1\n",
955 | "            end=ifelse(i==n_chunks,nr,(i+1)*chunks_size) #the last chunk takes the remainder\n",
956 | "            if (start > nr){break}\n",
957 | "            print(paste0(\"Processing chunk \",start,\" to \",end))\n",
958 | "            e1 = as.data.table(extract(pols, flipped_bc[start:end]))\n",
959 | "            #point.ID counts within the chunk, shift it to the row number in the full table\n",
960 | "            e1[,point.ID:=point.ID+(start-1),]\n",
961 | "            e=rbindlist(list(e,e1))\n",
962 | "        }\n",
963 | "    }else{\n",
964 | "        e = as.data.table(extract(pols, flipped_bc))\n",
965 | "    }\n",
966 | "\n",
967 | "    #keep the polygon index in poly and replace poly.ID by the annotation label\n",
968 | "    e[,poly:=poly.ID,]\n",
969 | "    e[,poly.ID:=names(poly_list)[poly],]\n",
970 | "\n",
971 | "    #find nearest polygons for HD_ST coordinates that don't fall into a polygon\n",
972 | "    print(\"Now assigning missing.\")\n",
973 | "    missing_id=e[is.na(poly.ID)]$point.ID\n",
974 | "    if (length(missing_id)>0){\n",
975 | "        missing=flipped_bc[missing_id,]\n",
976 | "        missing[,point.ID:=missing_id,]\n",
977 | "        sp_pts=SpatialPoints(missing[,c(\"spot_px_x\", \"spot_px_y\"),])\n",
978 | "        dist=gDistance(sp_pts,pols,byid = TRUE)\n",
979 | "        if(max_dist!=Inf){\n",
980 | "            #distances above max_dist are blanked so that which.min cannot pick them\n",
981 | "            dist=apply(dist,c(1,2),function(x){ifelse(x>max_dist,NA,x)})\n",
982 | "            missing[,np:=apply(dist,2,function(x){res=which.min(x);ifelse(length(res)>0,res,NA)})]\n",
983 | "            missing=missing[!is.na(np)]\n",
984 | "        }else{\n",
985 | "            missing[,np:=apply(dist,2,which.min)]\n",
986 | "        }\n",
987 | "        missing[,poly:=np,]\n",
988 | "        missing[,poly.ID:=names(poly_list)[np],]\n",
989 | "\n",
990 | "        #combine annotations from primary and secondary assignment\n",
991 | "        e_complete=rbindlist(list(e[!is.na(poly.ID)],missing[,c(\"point.ID\",\"poly.ID\",\"poly\"),]),use.names = TRUE)\n",
992 | "    }else{\n",
993 | "        #every barcode already lies inside a polygon: skip SpatialPoints/gDistance on zero points\n",
994 | "        e_complete=e[!is.na(poly.ID)]\n",
995 | "    }\n",
996 | "\n",
997 | "    #merge with original HD_ST coordinates file\n",
998 | "    bc[,point.ID:=1:nrow(bc),]\n",
999 | "    bc_annot=unique(merge(bc,e_complete,by=\"point.ID\",all.x=TRUE))\n",
1000 | "    bc_annot[is.na(poly.ID),poly.ID:=\"missing\",]\n",
1001 | "    bc_annot=bc_annot[!duplicated(point.ID)] #just take the first entry if a barcode is assigned to several annotations\n",
1002 | "\n",
1003 | "    if (plot == TRUE){\n",
1004 | "        pl=ggplot(bc_annot,aes(y=spot_px_y,x=spot_px_x,col=poly.ID))+geom_point(size=0.5)+coord_fixed()\n",
1005 | "        png(paste0(sample,\"_bc_annot.png\"),height=500,width=700)\n",
1006 | "        print(pl)\n",
1007 | "        dev.off()\n",
1008 | "    }\n",
1009 | "    write.table(bc_annot,paste0(sample,\"_barcodes_under_tissue_annot.tsv\"),sep=\"\\t\",quote=FALSE,row.names=FALSE)\n",
1010 | "    return(bc_annot)\n",
1011 | "}"
983 | ]
984 | },
985 | {
986 | "cell_type": "markdown",
987 | "metadata": {},
988 | "source": [
989 | "##### MOB"
990 | ]
991 | },
992 | {
993 | "cell_type": "code",
994 | "execution_count": null,
995 | "metadata": {},
996 | "outputs": [],
997 | "source": [
998 | "bc=annotate_bc(\"CN13_D2\",TRUE)"
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "markdown",
1003 | "metadata": {},
1004 | "source": [
1005 | "##### Breast cancer"
1006 | ]
1007 | },
1008 | {
1009 | "cell_type": "code",
1010 | "execution_count": null,
1011 | "metadata": {},
1012 | "outputs": [],
1013 | "source": [
1014 | "bc=annotate_bc(\"CN21_BC24350_E2\",TRUE,max_dist = 5,n_chunks = 10,flip = FALSE)"
1015 | ]
1016 | }
1017 | ],
1018 | "metadata": {
1019 | "kernelspec": {
1020 | "display_name": "R",
1021 | "language": "R",
1022 | "name": "ir"
1023 | },
1024 | "language_info": {
1025 | "codemirror_mode": "r",
1026 | "file_extension": ".r",
1027 | "mimetype": "text/x-r-source",
1028 | "name": "R",
1029 | "pygments_lexer": "r",
1030 | "version": "3.5.0"
1031 | }
1032 | },
1033 | "nbformat": 4,
1034 | "nbformat_minor": 2
1035 | }
1036 |
--------------------------------------------------------------------------------
/segmentation/HD_ST_Master.m:
--------------------------------------------------------------------------------
1 | function [] = HD_ST_Master(st_spot_table_file,st_sc_mask_file,csv_output_file,flip_y)
2 | %HD_ST_MASTER Assign every barcode of a spot table to a region of a mask image.
3 | %   HD_ST_Master(st_spot_table_file,st_sc_mask_file,csv_output_file) reads
4 | %   the tab-delimited spot table, looks up the mask value at the rounded
5 | %   pixel position of the first occurrence of every barcode and writes one
6 | %   row per barcode to csv_output_file with the columns bc, cell_id,
7 | %   x_centroid and y_centroid. Barcodes with mask value 0 keep cell_id 0
8 | %   and empty centroid fields.
9 | %
10 | %   HD_ST_Master(...,flip_y) controls whether spot_px_y is mirrored
11 | %   (max+min-y) before the lookup. The default is true, which is what the
12 | %   function always did before this argument existed.
13 | %
14 | %   Inputs
15 | %     st_spot_table_file  tab-delimited table with the columns bc,
16 | %                         spot_px_x and spot_px_y
17 | %     st_sc_mask_file     image readable by imread; a pixel value v > 0 is
18 | %                         used as index into the regions of regionprops
19 | %     csv_output_file     path of the CSV file to write
20 | %     flip_y              optional logical, default true
21 | %
22 | %   Three QC figures are opened: assigned centroids, all mask centroids and
23 | %   all barcode positions.
24 |
25 | % No "clear all" here: inside a function it clears the function workspace,
26 | % including the input arguments, so the file names would be undefined.
27 | if nargin < 4 || isempty(flip_y)
28 |     flip_y = true;
29 | end
30 |
31 | %% Load tsv spot table
32 | st_spot_table = readtable(st_spot_table_file,'FileType','text','Delimiter','\t');
33 |
34 | %% Load mask
35 | st_sc_mask = imread(st_sc_mask_file);
36 | % If the mask image itself is stored upside down, flip it here:
37 | % st_sc_mask = flipud(st_sc_mask);
38 |
39 | % Extract x and y from regionprops (preallocated, so both vectors exist
40 | % even when the mask contains no region)
41 | mask_centroid = regionprops(st_sc_mask,'centroid');
42 | n_regions = numel(mask_centroid);
43 | x_centroid = nan(1,n_regions);
44 | y_centroid = nan(1,n_regions);
45 | for i=1:n_regions
46 |     x_centroid(i) = mask_centroid(i).Centroid(1);
47 |     y_centroid(i) = mask_centroid(i).Centroid(2);
48 | end
49 |
50 | %% Extract all CellID's for the spot location
51 | % Mirror the spot y coordinates to align them with the mask (only if flipped)
52 | if flip_y
53 |     st_spot_table.spot_px_y=-(st_spot_table.spot_px_y-min(st_spot_table.spot_px_y))+(max(st_spot_table.spot_px_y));
54 | end
55 |
56 | % First occurrence of every barcode, in file order
57 | [~,unique_value_location] = unique(st_spot_table.bc,'stable');
58 | unique_x = round(st_spot_table.spot_px_x(unique_value_location));
59 | unique_y = round(st_spot_table.spot_px_y(unique_value_location));
60 | n_spots = numel(unique_value_location);
61 |
62 | % Columns: bc, cell_id, x_centroid, y_centroid. All four columns exist
63 | % from the start, so cell2table also works when no barcode hits a region.
64 | unique_bc = cell(n_spots,4);
65 | unique_bc(:,1) = cellstr(st_spot_table.bc(unique_value_location));
66 |
67 | % Extract the overlap with the mask
68 | n_rows = size(st_sc_mask,1);
69 | n_cols = size(st_sc_mask,2);
70 | no_cell = zeros(1,1,'like',st_sc_mask);
71 | n_outside = 0;
72 | for i=1:n_spots
73 |     row = unique_y(i);
74 |     col = unique_x(i);
75 |     % written as a negated "inside" test so that NaN counts as outside
76 |     if ~(row >= 1 && row <= n_rows && col >= 1 && col <= n_cols)
77 |         unique_bc{i,2} = no_cell;
78 |         n_outside = n_outside + 1;
79 |         continue
80 |     end
81 |     cell_id = st_sc_mask(row,col);
82 |     unique_bc{i,2} = cell_id;
83 |     if cell_id ~= 0
84 |         unique_bc{i,3} = x_centroid(1,cell_id);
85 |         unique_bc{i,4} = y_centroid(1,cell_id);
86 |     end
87 | end
88 | if n_outside > 0
89 |     warning('HD_ST_Master:spotOutsideMask',...
90 |         '%d barcodes lie outside the mask image and were given cell_id 0.',n_outside);
91 | end
92 |
93 | export_table = cell2table(unique_bc,'VariableNames',...
94 |     {'bc','cell_id','x_centroid','y_centroid'});
95 |
96 | %% Plot for QC
97 | % Plot centroids assigned to the unique barcodes
98 | figure()
99 | scatter(cell2mat(unique_bc(:,3)),cell2mat(unique_bc(:,4)));
100 | % Plot mask centroids
101 | figure()
102 | scatter(x_centroid,y_centroid);
103 | % Plot barcode beads
104 | figure()
105 | scatter(st_spot_table.spot_px_x,st_spot_table.spot_px_y);
106 |
107 | %% Export CSV
108 | writetable(export_table,csv_output_file);
109 |
110 | end
111 |
112 |
--------------------------------------------------------------------------------