├── .github
└── workflows
│ └── python-publish.yml
├── CHANGELOG
├── LICENSE
├── README.rst
├── c
└── utils.cpp
├── doc
├── Makefile
├── conf.py
├── images
│ ├── de_novo.png
│ ├── de_novo_celltype.png
│ ├── diagplot_centroid_2.png
│ ├── diagplot_centroid_30.png
│ ├── diagplot_centroid_5.png
│ ├── diagplot_centroid_8.png
│ ├── domain_composition.png
│ ├── domain_composition_all.png
│ ├── domains.png
│ ├── domains_individual.png
│ ├── final.png
│ ├── guided.png
│ ├── kernel_bw.png
│ ├── local_max_threshold_gene.png
│ ├── local_max_threshold_knn.png
│ ├── local_max_threshold_knn2.png
│ ├── local_max_threshold_knn3.png
│ ├── local_max_threshold_total.png
│ ├── mask.png
│ ├── maxima.png
│ ├── segmented_celltype_map.png
│ ├── tsne.png
│ ├── tsne_final.png
│ ├── tsne_merged.png
│ └── tsne_removed.png
├── index.rst
├── ssam.rst
├── userguide.rst
└── userguide
│ ├── 01-tldr.rst
│ ├── 02-installation.rst
│ ├── 03-data.rst
│ ├── 04-kde.rst
│ ├── 05-kernel_shape.rst
│ ├── 06-kernel_bandwidth.rst
│ ├── 07-input_mask.rst
│ ├── 08-guided.rst
│ ├── 09-celltype_map_thresh_g.rst
│ ├── 10-de_novo.rst
│ ├── 11-max_filtering.rst
│ ├── 12-clustering.rst
│ ├── 13-diagnostic.rst
│ ├── 14-cluster_annotation.rst
│ ├── 15-celltype_map_thresh_d.rst
│ ├── 16-visualisation.rst
│ ├── 17-domain.rst
│ ├── 18-composition.rst
│ ├── 19-experimental.rst
│ ├── 20-aaec.rst
│ └── 21-segment_celltype_map.rst
├── requirements.txt
├── setup.py
└── ssam
└── __init__.py
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | release:
8 | types: [created]
9 | workflow_dispatch:
10 |
11 | jobs:
12 | deploy:
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: '3.x'
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install setuptools wheel twine numpy
25 | - name: Build and publish
26 | env:
27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 | run: |
30 | python setup.py sdist
31 | twine upload dist/*
32 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | 2019-10-13
2 | v 1.0.0b - Initial release
3 | 2019-10-19
4 | v 1.0.1 - Added documentation, corrected the default parameters of the methods
5 | 2021-04-16
6 | v 1.0.2 - Added more documentation (read the docs), minor bug fixes
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Notice
2 | ======
3 |
4 | This repository is no longer maintained. Further development of SSAM will be continued in the https://github.com/pnucolab/ssam repository.
5 |
6 | SSAM (Spot-based Spatial cell-type Analysis by Multidimensional mRNA density estimation)
7 | ========================================================================================
8 |
9 | Author: Jeongbin Park (jeongbin.park@charite.de)\ :sup:`1,2` and Wonyl Choi (wonyl@bu.edu)\ :sup:`3`
10 |
11 | :sup:`1`\ Digital Health Center, Berlin Institute of Health (BIH) and Charité – Universitätsmedizin, Berlin, Germany; :sup:`2`\ Faculty of Biosciences, Heidelberg University, Heidelberg, Germany; :sup:`3`\ Department of Computer Science, Boston University, Boston, the United States of America
12 |
13 | (Not referring to this :laughing:: https://en.wikipedia.org/wiki/Ssam)
14 |
15 | This project was done under supervision of Dr. Naveed Ishaque (naveed.ishaque@charite.de) and Prof. Roland Eils (roland.eils@charite.de), and in collaboration with the SpaceTx consortium and the Human Cell Atlas project.
16 |
17 | Please also check our example Jupyter notebooks here: https://github.com/eilslabs/ssam_example
18 |
19 | Prerequisites
20 | =============
21 |
22 | Currently, SSAM has only been tested with Python 3 in a Linux environment. In addition to this package, SSAM requires a local R installation with the pre-installed packages ``feather`` and ``sctransform``. For details, please follow the instructions here: https://ssam.readthedocs.io/en/release/userguide/01-tldr.html#installation
23 |
24 | Install
25 | =======
26 |
27 | https://ssam.readthedocs.io/en/release/userguide/01-tldr.html#installation
28 |
29 | Documentation
30 | =============
31 |
32 | https://ssam.readthedocs.io/
33 |
34 | Citations
35 | =========
36 |
37 | Jeongbin Park, Wonyl Choi, Sebastian Tiesmeyer, Brian Long, Lars E. Borm, Emma Garren, Thuc Nghi Nguyen, Bosiljka Tasic, Simone Codeluppi, Tobias Graf, Matthias Schlesner, Oliver Stegle, Roland Eils & Naveed Ishaque. "`Cell segmentation-free inference of cell types from in situ transcriptomics data <https://www.nature.com/articles/s41467-021-23807-4>`_" *Nature Communications* **12**, 3545 (2021).
38 |
39 | License
40 | =======
41 |
42 | Copyright (C) 2018 Jeongbin Park and Wonyl Choi
43 |
44 | This program is free software: you can redistribute it and/or modify
45 | it under the terms of the GNU Affero General Public License as published
46 | by the Free Software Foundation, either version 3 of the License, or
47 | (at your option) any later version.
48 |
49 | This program is distributed in the hope that it will be useful,
50 | but WITHOUT ANY WARRANTY; without even the implied warranty of
51 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52 | GNU Affero General Public License for more details.
53 |
54 | You should have received a copy of the GNU Affero General Public License
55 | along with this program. If not, see https://www.gnu.org/licenses/.
56 |
--------------------------------------------------------------------------------
/c/utils.cpp:
--------------------------------------------------------------------------------
1 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | #if defined(_OPENMP)
8 | #include
9 | #else
10 | typedef int omp_int_t;
11 | inline omp_int_t omp_get_thread_num() { return 0;}
12 | inline omp_int_t omp_get_max_threads() { return 1;}
13 | #endif
14 |
15 | #include
16 | #include "numpy/npy_math.h"
17 | #include "numpy/arrayobject.h"
18 |
19 | #define I2D(X, Y, YL) ((X) * (YL) + (Y))
20 | #define I3D(X, Y, Z, YL, ZL) (((X) * (YL) * (ZL)) + ((Y) * (ZL)) + (Z))
21 |
// 2D grid coordinate (row x, column y); element type of the BFS queue
// used by flood_fill for 2D vector fields.
struct pos2d {
    long x;
    long y;
};
26 |
// 3D grid coordinate; element type of the BFS queue used by flood_fill
// for 3D vector fields.
struct pos3d {
    long x;
    long y;
    long z;
};
32 |
// Isotropic Gaussian kernel, exp(-||(x, y, z)||^2 / 2).
// Deliberately unnormalized: the 1/(2*pi)^(3/2) factor is omitted, so the
// caller is responsible for any rescaling of the KDE result.
static double gauss_kernel(double x, double y, double z) {
    double sq_norm = x * x + y * y + z * z;
    return exp(sq_norm * -0.5);
}
36 |
37 | static void kde(double bandwidth, double *x, double *y, double *z, double* query_x, double* query_y, double *query_z, double *rtn, unsigned int num_points, unsigned int num_querys, double (*kernel)(double, double, double), double maxdist, int ncores) {
38 | unsigned int i, j;
39 | double d;
40 | #pragma omp parallel for num_threads(ncores) private(i, j, d)
41 | for (i=0; i(kwlist), &h, &arg1, &arg2, &arg3, &arg4, &arg5, &arg6, &kernel, &ncores)) return NULL;
111 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL;
112 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
113 | if ((arr3 = (PyArrayObject*)PyArray_FROM_OTF(arg3, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
114 | if ((arr4 = (PyArrayObject*)PyArray_FROM_OTF(arg4, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
115 | if ((arr5 = (PyArrayObject*)PyArray_FROM_OTF(arg5, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
116 | if ((arr6 = (PyArrayObject*)PyArray_FROM_OTF(arg6, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
117 |
118 | if (PyArray_NDIM(arr1) != 1 || PyArray_NDIM(arr2) != 1 || PyArray_NDIM(arr3) != 1 ||
119 | PyArray_NDIM(arr4) != 1 || PyArray_NDIM(arr5) != 1 || PyArray_NDIM(arr6) != 1)
120 | {
121 | goto fail;
122 | }
123 |
124 | npts = PyArray_DIMS(arr1)[0];
125 | nqrys = PyArray_DIMS(arr4)[0];
126 | nqrys_npy = nqrys;
127 |
128 | oarr = (PyArrayObject*)PyArray_ZEROS(1, &nqrys_npy, NPY_DOUBLE, NPY_CORDER);
129 |
130 | x = (double *)PyArray_DATA(arr1);
131 | y = (double *)PyArray_DATA(arr2);
132 | z = (double *)PyArray_DATA(arr3);
133 | qx = (double *)PyArray_DATA(arr4);
134 | qy = (double *)PyArray_DATA(arr5);
135 | qz = (double *)PyArray_DATA(arr6);
136 | rtn = (double *)PyArray_DATA(oarr);
137 |
138 | maxdist_gauss = sqrt(2) * h * log((double)(1000000 * npts));
139 | kde(h, x, y, z, qx, qy, qz, rtn, npts, nqrys, gauss_kernel, maxdist_gauss, ncores);
140 |
141 | Py_DECREF(arr1);
142 | Py_DECREF(arr2);
143 | Py_DECREF(arr3);
144 | Py_DECREF(arr4);
145 | Py_DECREF(arr5);
146 | Py_DECREF(arr6);
147 |
148 | return (PyObject *) oarr;
149 |
150 | fail:
151 | Py_XDECREF(arr1);
152 | Py_XDECREF(arr2);
153 | Py_XDECREF(arr3);
154 | Py_XDECREF(arr4);
155 | Py_XDECREF(arr5);
156 | Py_XDECREF(arr6);
157 | return NULL;
158 | }
159 |
160 | static PyObject *flood_fill(PyObject *self, PyObject *args, PyObject *kwargs) {
161 | PyObject *arg1 = NULL;
162 | PyObject *arg2 = NULL;
163 | PyObject* filled_poslist = NULL;
164 | PyArrayObject *arr1 = NULL;
165 | PyArrayObject *arr2 = NULL;
166 | long nvec, nd, ngene = 0;
167 | long *pos, x, y, z, cnt;
168 | double r = 0.6, *vf;
169 | npy_intp *dimsp;
170 | int min_pixels = 10, max_pixels=2000;
171 | int i;
172 | bool *mask;
173 |
174 | static const char *kwlist[] = { "pos", "vf", "r", "min_pixels", "max_pixels", NULL };
175 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|dii", const_cast(kwlist), &arg1, &arg2, &r, &min_pixels, &max_pixels)) return NULL;
176 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_LONG, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL;
177 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
178 | if (PyArray_NDIM(arr1) != 1) goto fail;
179 | nd = PyArray_NDIM(arr2);
180 | dimsp = PyArray_DIMS(arr2);
181 | nvec = 1;
182 | for (i=0; i queue2d;
193 | queue2d.push(pos2d());
194 | queue2d.back().x = pos[0];
195 | queue2d.back().y = pos[1];
196 | while (queue2d.size() > 0) {
197 | x = queue2d.front().x;
198 | y = queue2d.front().y;
199 | PyObject *t = PyTuple_New(2);
200 | PyTuple_SetItem(t, 0, PyLong_FromLong(x));
201 | PyTuple_SetItem(t, 1, PyLong_FromLong(y));
202 | cnt += 1;
203 | if (cnt > max_pixels)
204 | break;
205 | PyList_Append(filled_poslist, t);
206 | queue2d.pop();
207 | if (x < dimsp[0] - 1 && mask[I2D(x + 1, y, dimsp[1])] == false &&
208 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene),
209 | vf + (I2D(x + 1, y, dimsp[1]) * ngene), ngene) > r) {
210 | mask[I2D(x + 1, y, dimsp[1])] = true;
211 | queue2d.push(pos2d());
212 | queue2d.back().x = x + 1;
213 | queue2d.back().y = y;
214 | }
215 | if (x > 1 && mask[I2D(x - 1, y, dimsp[1])] == false &&
216 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene),
217 | vf + (I2D(x - 1, y, dimsp[1]) * ngene), ngene) > r) {
218 | mask[I2D(x - 1, y, dimsp[1])] = true;
219 | queue2d.push(pos2d());
220 | queue2d.back().x = x - 1;
221 | queue2d.back().y = y;
222 | }
223 | if (y < dimsp[1] - 1 && mask[I2D(x, y + 1, dimsp[1])] == false &&
224 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene),
225 | vf + (I2D(x, y + 1, dimsp[1]) * ngene), ngene) > r) {
226 | mask[I2D(x, y + 1, dimsp[1])] = true;
227 | queue2d.push(pos2d());
228 | queue2d.back().x = x;
229 | queue2d.back().y = y + 1;
230 | }
231 | if (y > 1 && mask[I2D(x, y - 1, dimsp[1])] == false &&
232 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene),
233 | vf + (I2D(x, y - 1, dimsp[1]) * ngene), ngene) > r) {
234 | mask[I2D(x, y - 1, dimsp[1])] = true;
235 | queue2d.push(pos2d());
236 | queue2d.back().x = x;
237 | queue2d.back().y = y - 1;
238 | }
239 | }
240 | } else if (nd == 4) {
241 | // 3D
242 | std::queue queue3d;
243 | queue3d.push(pos3d());
244 | queue3d.back().x = pos[0];
245 | queue3d.back().y = pos[1];
246 | queue3d.back().z = pos[2];
247 | while (queue3d.size() > 0) {
248 | x = queue3d.front().x;
249 | y = queue3d.front().y;
250 | z = queue3d.front().z;
251 | PyObject *t = PyTuple_New(3);
252 | PyTuple_SetItem(t, 0, PyLong_FromLong(x));
253 | PyTuple_SetItem(t, 1, PyLong_FromLong(y));
254 | PyTuple_SetItem(t, 2, PyLong_FromLong(z));
255 | PyList_Append(filled_poslist, t);
256 | cnt += 1;
257 | if (cnt > max_pixels)
258 | break;
259 | queue3d.pop();
260 | if (x < dimsp[0] - 1 && mask[I3D(x + 1, y, z, dimsp[1], dimsp[2])] == false &&
261 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
262 | vf + I3D(x + 1, y, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
263 | mask[I3D(x + 1, y, z, dimsp[1], dimsp[2])] = true;
264 | queue3d.push(pos3d());
265 | queue3d.back().x = x + 1;
266 | queue3d.back().y = y;
267 | queue3d.back().z = z;
268 | }
269 | if (x > 1 && mask[I3D(x - 1, y, z, dimsp[1], dimsp[2])] == false &&
270 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
271 | vf + I3D(x - 1, y, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
272 | mask[I3D(x - 1, y, z, dimsp[1], dimsp[2])] = true;
273 | queue3d.push(pos3d());
274 | queue3d.back().x = x - 1;
275 | queue3d.back().y = y;
276 | queue3d.back().z = z;
277 | }
278 | if (y < dimsp[1] - 1 && mask[I3D(x, y + 1, z, dimsp[1], dimsp[2])] == false &&
279 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
280 | vf + I3D(x, y + 1, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
281 | mask[I3D(x, y + 1, z, dimsp[1], dimsp[2])] = true;
282 | queue3d.push(pos3d());
283 | queue3d.back().x = x;
284 | queue3d.back().y = y + 1;
285 | queue3d.back().z = z;
286 | }
287 | if (y > 1 && mask[I3D(x, y - 1, z, dimsp[1], dimsp[2])] == false &&
288 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
289 | vf + I3D(x, y - 1, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
290 | mask[I3D(x, y - 1, z, dimsp[1], dimsp[2])] = true;
291 | queue3d.push(pos3d());
292 | queue3d.back().x = x;
293 | queue3d.back().y = y - 1;
294 | queue3d.back().z = z;
295 | }
296 | if (z < dimsp[2] - 1 && mask[I3D(x, y, z + 1, dimsp[1], dimsp[2])] == false &&
297 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
298 | vf + I3D(x, y, z + 1, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
299 | mask[I3D(x, y, z, dimsp[1], dimsp[2])] = true;
300 | queue3d.push(pos3d());
301 | queue3d.back().x = x;
302 | queue3d.back().y = y;
303 | queue3d.back().z = z + 1;
304 | }
305 | if (z > 1 && mask[I3D(x, y, z - 1, dimsp[1], dimsp[2])] == false &&
306 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene,
307 | vf + I3D(x, y, z - 1, dimsp[1], dimsp[2]) * ngene, ngene) > r) {
308 | mask[I3D(x, y, z - 1, dimsp[1], dimsp[2])] = true;
309 | queue3d.push(pos3d());
310 | queue3d.back().x = x;
311 | queue3d.back().y = y;
312 | queue3d.back().z = z - 1;
313 | }
314 | }
315 | }
316 | free((void*)mask);
317 | Py_DECREF(arr1);
318 | Py_DECREF(arr2);
319 | if (cnt > max_pixels || cnt < min_pixels)
320 | PyList_SetSlice(filled_poslist, 0, PyList_Size(filled_poslist), NULL);
321 | return (PyObject *) filled_poslist;
322 |
323 | fail:
324 | Py_XDECREF(arr1);
325 | Py_XDECREF(arr2);
326 | return NULL;
327 | }
328 |
329 | static PyObject *calc_corrmap(PyObject *self, PyObject *args, PyObject *kwargs) {
330 | PyObject *arg1 = NULL;
331 | PyArrayObject *arr1 = NULL;
332 | PyArrayObject *oarr = NULL;
333 | long i, x, y, z, dx, dy, dz;
334 | long nvec, nd, ngene = 0;
335 | double *vecs, *corrmap;
336 | npy_intp *dimsp;
337 | int ncores = omp_get_max_threads();
338 | int csize = 1;
339 | double *tmpvec;
340 |
341 | static const char *kwlist[] = { "vf", "ncores", "size", NULL };
342 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|ii", const_cast(kwlist), &arg1, &ncores, &csize)) return NULL;
343 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL;
344 | nd = PyArray_NDIM(arr1);
345 | if (nd != 3 && nd != 4) goto fail; // only 2D or 3D array is expected
346 | dimsp = PyArray_DIMS(arr1);
347 | oarr = (PyArrayObject*)PyArray_ZEROS(nd - 1, dimsp, NPY_DOUBLE, NPY_CORDER);
348 | ngene = dimsp[nd-1];
349 | corrmap = (double *)PyArray_DATA(oarr);
350 | vecs = (double *)PyArray_DATA(arr1);
351 | nvec = 1;
352 | for (i=0; i(kwlist), &arg1, &ncores, &csize)) return NULL;
435 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL;
436 | nd = PyArray_NDIM(arr1);
437 | if (nd != 3 && nd != 4) goto fail; // only 2D or 3D array is expected
438 | dimsp = PyArray_DIMS(arr1);
439 | for (i=0; i(kwlist), &arg1, &arg2, &ncores)) return NULL;
513 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL;
514 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail;
515 | if (PyArray_NDIM(arr1) != 1) goto fail;
516 | nd = PyArray_NDIM(arr2);
517 | if((ngene = *PyArray_DIMS(arr1)) != PyArray_DIMS(arr2)[nd-1]) goto fail;
518 |
519 | dimsp = PyArray_DIMS(arr2);
520 | oarr = (PyArrayObject*)PyArray_ZEROS(nd - 1, dimsp, NPY_DOUBLE, NPY_CORDER);
521 |
522 | nvec = 1;
523 | for (i=0; i= 3
588 | static struct PyModuleDef moduledef = {
589 | PyModuleDef_HEAD_INIT,
590 | "analysis_utils",
591 | NULL,
592 | -1,
593 | module_methods
594 | };
595 | #endif
596 |
/* Extension entry point. On Python 3 creates the module from `moduledef`;
 * on Python 2 registers `module_methods` under the name "utils".
 * import_array() initializes the NumPy C API and must run before any
 * PyArray_* call elsewhere in this file. */
PyMODINIT_FUNC
PyInit_utils(void)
{
#if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef);
#else
Py_InitModule("utils", module_methods);
#endif
import_array();
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
# Minimal makefile for Sphinx documentation
#
# NOTE: recipe lines below must begin with a literal TAB character.

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = SSAM
SOURCEDIR     = .
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# SSAM documentation build configuration file, created by
# sphinx-quickstart on Thu Nov 22 11:41:04 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.mathjax',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'SSAM'
copyright = '2018, Jeongbin Park'
author = 'Jeongbin Park'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.0.1'
# The full version, including alpha/beta/rc tags.
release = '1.0.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# NOTE(review): newer Sphinx versions warn on language=None and fall back to
# 'en'; consider setting language = 'en' explicitly.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme_path = ["_themes", ]

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
#html_sidebars = {
#    '**': [
#        'relations.html',  # needs 'show_related': True theme option to display
#        'searchbox.html',
#    ]
#}


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'SSAMdoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'SSAM.tex', 'SSAM Documentation',
     'Jeongbin Park', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'ssam', 'SSAM Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'SSAM', 'SSAM Documentation',
     author, 'SSAM', 'One line description of project.',
     'Miscellaneous'),
]




# Example configuration for intersphinx: refer to the Python standard library.
# Fixed: the legacy "non-named" form {'<url>': None} is deprecated (and
# rejected by modern Sphinx); intersphinx now expects named entries mapping
# an identifier to a (target URL, inventory) pair.
intersphinx_mapping = {'python': ('https://docs.python.org/', None)}

# Mock the compiled C extension so autodoc can import ssam without building it.
autodoc_mock_imports = ["ssam.utils"]
--------------------------------------------------------------------------------
/doc/images/de_novo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/de_novo.png
--------------------------------------------------------------------------------
/doc/images/de_novo_celltype.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/de_novo_celltype.png
--------------------------------------------------------------------------------
/doc/images/diagplot_centroid_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_2.png
--------------------------------------------------------------------------------
/doc/images/diagplot_centroid_30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_30.png
--------------------------------------------------------------------------------
/doc/images/diagplot_centroid_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_5.png
--------------------------------------------------------------------------------
/doc/images/diagplot_centroid_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_8.png
--------------------------------------------------------------------------------
/doc/images/domain_composition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domain_composition.png
--------------------------------------------------------------------------------
/doc/images/domain_composition_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domain_composition_all.png
--------------------------------------------------------------------------------
/doc/images/domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domains.png
--------------------------------------------------------------------------------
/doc/images/domains_individual.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domains_individual.png
--------------------------------------------------------------------------------
/doc/images/final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/final.png
--------------------------------------------------------------------------------
/doc/images/guided.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/guided.png
--------------------------------------------------------------------------------
/doc/images/kernel_bw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/kernel_bw.png
--------------------------------------------------------------------------------
/doc/images/local_max_threshold_gene.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_gene.png
--------------------------------------------------------------------------------
/doc/images/local_max_threshold_knn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn.png
--------------------------------------------------------------------------------
/doc/images/local_max_threshold_knn2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn2.png
--------------------------------------------------------------------------------
/doc/images/local_max_threshold_knn3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn3.png
--------------------------------------------------------------------------------
/doc/images/local_max_threshold_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_total.png
--------------------------------------------------------------------------------
/doc/images/mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/mask.png
--------------------------------------------------------------------------------
/doc/images/maxima.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/maxima.png
--------------------------------------------------------------------------------
/doc/images/segmented_celltype_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/segmented_celltype_map.png
--------------------------------------------------------------------------------
/doc/images/tsne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne.png
--------------------------------------------------------------------------------
/doc/images/tsne_final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_final.png
--------------------------------------------------------------------------------
/doc/images/tsne_merged.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_merged.png
--------------------------------------------------------------------------------
/doc/images/tsne_removed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_removed.png
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 |
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 | :caption: Contents
7 |
8 | userguide
9 | ssam
10 |
11 |
12 | Indices and tables
13 | ==================
14 |
15 | * :ref:`genindex`
16 |
--------------------------------------------------------------------------------
/doc/ssam.rst:
--------------------------------------------------------------------------------
1 | Module contents
2 | ---------------
3 |
4 | .. automodule:: ssam
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/userguide.rst:
--------------------------------------------------------------------------------
1 | Spatial gene expression analysis with SSAM
2 | ------------------------------------------
3 |
4 | .. toctree::
5 | :glob:
6 |
7 | userguide/*
8 |
--------------------------------------------------------------------------------
/doc/userguide/01-tldr.rst:
--------------------------------------------------------------------------------
1 | quick start / tldr page
2 | =======================
3 |
4 | This tl;dr guide is for you if you already know what happens in a SSAM
5 | analysis or if you don’t care.
6 |
7 | For everyone else we recommend using the full
8 | `userguide <../userguide.html>`__.
9 |
10 | Installation
11 | ------------
12 |
13 | Setup a ``conda`` environment:
14 |
15 | ::
16 |
17 | conda create -n ssam python=3.6
18 | conda activate ssam
19 | conda install gxx_linux-64 numpy pip R=3.6 pyarrow=0.15.1
20 |
21 | Do this in ``R``:
22 |
23 | ::
24 |
25 | install.packages("sctransform")
26 | install.packages("feather")
27 |
28 | Install SSAM via ``pip``:
29 |
30 | ::
31 |
32 | pip install ssam
33 |
34 | Data download
35 | -------------
36 |
37 | ::
38 |
39 | curl "https://zenodo.org/record/3478502/files/supplemental_data_ssam_2019.zip?download=1" -o zenodo.zip
40 | unzip zenodo.zip
41 |
42 | Data preparation
43 | ----------------
44 |
45 | All following steps in ``python``:
46 |
47 | ::
48 |
49 | import numpy as np
50 | import pandas as pd
51 | import matplotlib.pyplot as plt
52 | import ssam
53 |
54 | df = pd.read_csv(
55 | "zenodo/multiplexed_smFISH/raw_data/smFISH_MCT_CZI_Panel_0_spot_table.csv",
56 | usecols=['x', 'y', 'z', 'target'])
57 |
58 | um_per_pixel = 0.1
59 |
60 | df.x = (df.x - df.x.min()) * um_per_pixel + 10
61 | df.y = (df.y - df.y.min()) * um_per_pixel + 10
62 | df.z = (df.z - df.z.min()) * um_per_pixel + 10
63 | width = df.x.max() - df.x.min() + 10
64 | height = df.y.max() - df.y.min() + 10
65 |
66 | grouped = df.groupby('target').agg(list)
67 | genes = list(grouped.index)
68 | coord_list = []
69 | for target, coords in grouped.iterrows():
70 | coord_list.append(np.array(list(zip(*coords))))
71 |
72 | Create SSAM dataset and vector field
73 | ------------------------------------
74 |
75 | ::
76 |
77 | ds = ssam.SSAMDataset(genes, coord_list, width, height)
78 | analysis = ssam.SSAMAnalysis(
79 | ds,
80 | ncores=10, # used for kde step
81 | save_dir="kde/",
82 | verbose=True)
83 |
84 | analysis.run_kde(bandwidth=2.5, use_mmap=False)
85 |
86 | analysis.find_localmax(
87 | search_size=3,
88 | min_norm=0.2,
89 | min_expression=0.027
90 | )
91 |
92 | analysis.normalize_vectors_sctransform()
93 |
94 | Creating the *de novo* cell map
95 | -------------------------------
96 |
97 | ::
98 |
99 | analysis.cluster_vectors(
100 | min_cluster_size=0,
101 | pca_dims=22,
102 | resolution=0.15,
103 | metric='correlation')
104 |
105 | # post-filtering parameter for cell-type map
106 | filter_method = "local"
107 | filter_params = {
108 | "block_size": 151,
109 | "method": "mean",
110 | "mode": "constant",
111 | "offset": 0.2
112 | }
113 |
114 | analysis.map_celltypes()
115 |     analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50)
116 |
117 | .. figure:: ../images/de_novo.png
118 | :alt: Visualisation of cell type map.
119 |
120 | Visualisation of cell type map.
121 |
122 | Creating the tissue domain map
123 | ------------------------------
124 |
125 | ::
126 |
127 | analysis.bin_celltypemaps(step=10, radius=100)
128 | analysis.find_domains(n_clusters=20, merge_remote=True, merge_thres=0.7, norm_thres=1500)
129 |
130 | plt.figure(figsize=[5, 5])
131 |     ds.plot_domains(rotate=1)
132 |
133 | .. figure:: ../images/domains.png
134 |    :alt: Visualisation of final domain map exhibiting clearly separated
135 | domains.
136 |
137 |    Visualisation of final domain map exhibiting clearly separated
138 | domains.
139 |
--------------------------------------------------------------------------------
/doc/userguide/02-installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | A step-by-step guide
5 | --------------------
6 |
7 | The easiest way to prepare a python environment for SSAM is using
8 | `conda `__.
9 | Keeping python projects in isolated environments prevents dependency
10 | version conflicts or conflicts with your OS installation of python which
11 | usually depends on older versions incompatible with current scientific
12 | packages.
13 |
14 | Create your environment:
15 |
16 | ::
17 |
18 | conda create -n ssam python=3.6
19 |
20 | Remember to activate before using it:
21 |
22 | ::
23 |
24 | conda activate ssam
25 |
26 | Now we use conda to install some dependencies into our ssam environment:
27 |
28 | ::
29 |
30 | conda install gxx_linux-64=7.3.0 numpy=1.19.2 pip R=3.6 pyarrow=0.15.1
31 |
32 | Now we can install the R packages ``sctransform`` and ``feather``. Open
33 | R and type:
34 |
35 | ::
36 |
37 | install.packages("sctransform")
38 | install.packages("feather")
39 |
40 | Finally we switch to pip:
41 |
42 | .. raw:: html
43 |
44 |
49 |
50 | ::
51 |
52 | pip install git+https://github.com/HiDiHlabs/ssam.git
53 |
54 | Next we can download and prepare our `data `__.
55 |
56 | SSAM’s source code
57 | ------------------
58 |
59 | In case you want to work with `SSAM’s source
60 | code `__, it is also hosted on github.
61 |
--------------------------------------------------------------------------------
/doc/userguide/03-data.rst:
--------------------------------------------------------------------------------
1 | Data Preparation
2 | ================
3 |
4 | Download VISp data
5 | ------------------
6 |
7 | In this tutorial we work with data of the murine primary visual cortex
8 | (VISp) profiled using multiplexed smFISH. Further details are available
9 | in the SSAM publication (Park et al., 2019).
10 |
11 | First, download the data and unpack it:
12 |
13 | ::
14 |
15 | curl "https://zenodo.org/record/3478502/files/supplemental_data_ssam_2019.zip?download=1" -o zenodo.zip
16 | unzip zenodo.zip
17 |
18 | Load data into python
19 | ---------------------
20 |
21 | Let’s start with loading our python packages:
22 |
23 | ::
24 |
25 | import numpy as np
26 | import pandas as pd
27 | import matplotlib.pyplot as plt
28 | import ssam
29 |
30 | Now we can load the mRNA spot table. Each row describes one mRNA spot
31 | and the columns contain its coordinates and target gene. We load the
32 | required columns into a dataframe:
33 |
34 | ::
35 |
36 | df = pd.read_csv(
37 | "zenodo/multiplexed_smFISH/raw_data/smFISH_MCT_CZI_Panel_0_spot_table.csv",
38 | usecols=['x', 'y', 'z', 'target'])
39 |
40 | If your dataset is organized differently, you will have to reshape it
41 | before continuing with the next steps.
42 |
43 | Transform Data
44 | --------------
45 |
43 | Because SSAM analysis is rooted in a cellular scale we transform the
44 | coordinates from a laboratory system into micrometers. Also we make them
45 | a bit tidier:
46 |
47 | ::
48 |
49 | um_per_pixel = 0.1
50 |
51 | df.x = (df.x - df.x.min()) * um_per_pixel + 10
52 | df.y = (df.y - df.y.min()) * um_per_pixel + 10
53 | df.z = (df.z - df.z.min()) * um_per_pixel + 10
54 |
55 | Prepare data for SSAM
56 | ---------------------
57 |
58 | To create a ``SSAMDataset`` object we need to provide four arguments: -
59 | a list of gene names profiled in the experiment: ``genes`` - a list of
60 | lists that contains the coordinates of each gene: ``coord_list`` - the
61 | ``width`` of the image - the ``height`` of the image
62 |
63 | The width and height are straightforward to infer from the dimensions of
64 | the image:
65 |
66 | ::
67 |
68 | width = df.x.max() - df.x.min() + 10
69 | height = df.y.max() - df.y.min() + 10
70 |
71 | We group the dataframe by gene and create the list of gene names:
72 |
73 | ::
74 |
75 | grouped = df.groupby('target').agg(list)
76 | genes = list(grouped.index)
77 |
78 | And finally the coordinate list:
79 |
80 | ::
81 |
82 | coord_list = []
83 | for target, coords in grouped.iterrows():
84 | coord_list.append(np.array(list(zip(*coords))))
85 |
86 | Create the ``SSAMDataset`` object
87 | ---------------------------------
88 |
89 | With everything in place we can now instantiate the ``SSAMDataset``
90 | object:
91 |
92 | ::
93 |
94 | ds = ssam.SSAMDataset(genes, coord_list, width, height)
95 |
96 | Now we can start the analysis with the `kernel density
97 | estimation `__ step.
98 |
--------------------------------------------------------------------------------
/doc/userguide/04-kde.rst:
--------------------------------------------------------------------------------
1 | Creating the vector field
2 | =========================
3 |
4 | After the data has been loaded, SSAM converts the discrete mRNA
5 | locations into mRNA density (that can be thought of as continuous “gene
6 | expression clouds” over the tissue) through application of `Kernel
7 | Density Estimation `__.
8 |
9 | KDE
10 | ---
11 |
12 | With our ``SSAMDataset`` object ``ds`` we can now initialize a
13 | ``SSAMAnalysis`` object ``analysis``.
14 |
15 | ::
16 |
17 | analysis = ssam.SSAMAnalysis(
18 | ds,
19 | ncores=10, # used for kde step
20 | save_dir="kde/",
21 | verbose=True)
22 |
23 | And calculate a mRNA density estimate with the ``run_kde`` method.
24 | Important considerations here are the `kernel
25 | function `__ and the `kernel
26 | bandwidth `__. As default, we recommend using a
27 | Gaussian kernel with a bandwidth of 2.5:
28 |
29 | ::
30 |
31 | analysis.run_kde(bandwidth=2.5, use_mmap=False)
32 |
33 | Masking
34 | -------
35 |
36 | If you want to perform the analysis on `only a part of your sample you
37 | can use a mask `__. This can restrict what parts of the image
38 | are used for local maxima sampling (the ``input_mask``), or restrict the
39 | cell-type map generation of SSAM to certain regions (the
40 | ``output_mask``). While this is not required for analysis (in fact the
41 | SSAM paper did not apply masks to the osmFISH or MERFISH dataset), here
42 | we define a simple polygon as both the ``input_mask`` and
43 | ``output_mask`` for the VISp region.
44 |
45 | ::
46 |
47 | from matplotlib.path import Path
48 | # manual area annotation
49 | xy = np.array([[1535, 90],
50 | [ 795, 335],
51 | [ 135, 940],
52 | [ 835, 1995],
53 | [1465, 1695],
54 | [2010, 1215]])
55 |
56 | # Extract coordinates from SSAMDataset
57 | x, y = np.meshgrid(np.arange(ds.vf.shape[0]), np.arange(ds.vf.shape[1]))
58 | x, y = x.flatten(), y.flatten()
59 | points = np.vstack((x,y)).T
60 |
61 | path = Path(xy)
62 | input_mask = path.contains_points(points)
63 | input_mask = input_mask.reshape((ds.vf.shape[1], ds.vf.shape[0], 1)).swapaxes(0, 1)
64 | output_mask = input_mask
65 |
66 | We recommend a visual inspection of the mask to make sure it aligns
67 | with the data as you expect it to:
68 |
69 | ::
70 |
71 | from matplotlib.patches import Polygon
72 | from matplotlib.collections import PatchCollection
73 |
74 | patch = Polygon(xy, True)
75 | p = PatchCollection([patch], alpha=0.4)
76 |
77 | plt.figure(figsize=[5, 5])
78 | ds.plot_l1norm(rotate=1, cmap="Greys")
79 | plt.gca().add_collection(p)
80 | plt.axis('off')
81 | plt.savefig('images/mask.png')
82 |
83 | .. figure:: ../images/mask.png
84 | :alt: plot of the mRNA density superimposed with the mask
85 |
86 | plot of the mRNA density superimposed with the mask
87 |
88 | Local maxima search and normalization
89 | -------------------------------------
90 |
91 | In order to reduce the computational burden, we recommend downsampling
92 | the image. While random sampling can be performed, we strongly encourage
93 | downsampling via local maxima selection, followed by `filtering based on
94 | individual and total gene expression `__.
95 |
96 | The local maxima are used to (i) determine the variance stabilisation
97 | parameters for the image, and (ii) be used to determine
98 | `clusters `__ in `de novo analysis `__. In
99 | this section, we will use the local maxima for variance stabilisation.
100 |
101 | Here we apply the ``find_localmax`` function to find the local maxima of
102 | the mRNA density, using a per gene expression threshold of ``0.027`` and
103 | a total gene expression threshold of ``0.2``:
104 |
105 | ::
106 |
107 | analysis.find_localmax(
108 | search_size=3,
109 | min_norm=0.2, # the total gene expression threshold
110 | min_expression=0.027, # the per gene expression threshold
111 | mask=input_mask
112 | )
113 |
114 | Visualization
115 | -------------
116 |
117 | After the local maxima have been identified, they can be visualised. In
118 | cases when many local maxima originate from outside the tissue area a
119 | `k-NN density threshold can be used to filter “stray” local
120 | maxima `__,
121 | however in this example we use an input mask so it is not a problem.
122 |
123 | ::
124 |
125 | plt.figure(figsize=[5, 5])
126 | ds.plot_l1norm(cmap="Greys", rotate=1)
127 | ds.plot_localmax(c="Blue", rotate=1, s=0.1)
128 |
129 | patch = Polygon(xy, facecolor="black", edgecolor="red", linewidth=10, ls="-")
130 | p = PatchCollection([patch], alpha=0.4)
131 | plt.gca().add_collection(p)
132 |
133 | scalebar = ScaleBar(1, 'um') # 1 pixel = 1um
134 | plt.gca().add_artist(scalebar)
135 | plt.tight_layout()
136 | plt.axis('off')
137 | plt.show()
138 |
139 | .. figure:: ../images/maxima.png
140 | :alt: plot found maxima superimposed with the mask
141 |
142 | plot found maxima superimposed with the mask
143 |
144 | Normalization
145 | -------------
146 |
147 | Once the local maxima have been identified, we can use them for
148 | calculating the variance stabilisation parameters using ``sctransform``.
149 | If you receive an error here, make sure that you have installed the R
150 | packages in the `installation `__ step
151 |
152 | This part of the analysis ends with the normalization of the mRNA
153 | density and the local-maximum vectors.
154 |
155 | ::
156 |
157 | analysis.normalize_vectors_sctransform()
158 |
159 | Now we are ready to continue with mapping the cell types in
160 | `guided `__ or `de novo mode `__.
161 |
--------------------------------------------------------------------------------
/doc/userguide/05-kernel_shape.rst:
--------------------------------------------------------------------------------
1 | The shape of the kernel
2 | =======================
3 |
4 | The shape of the kernel is defined by the `kernel
5 | function `__. The
6 | shape of the kernel determines how the mRNA signal is smoothed.
7 |
8 | We adopt the use of the Gaussian kernel due to its popular use in
9 | signal processing, however other kernel functions can be used: - we have
10 | had success in using semi-circle kernels when applied to `ISS data of
11 | the human pancreas `__ -
12 | the `Epanechnikov kernel `__
13 | minimizes AMISE and has therefore been described as optimal
14 |
15 | The following example shows how you can apply a semicircular kernel
16 | instead of a Gaussian.
17 |
18 | ::
19 |
20 | # code to change the shape of the kernel (@sebastiantiesmeyer)
21 |
--------------------------------------------------------------------------------
/doc/userguide/06-kernel_bandwidth.rst:
--------------------------------------------------------------------------------
1 | Kernel bandwidth
2 | ================
3 |
4 | The bandwidth of the kernel controls the amount of smoothing applied.
5 | With a low bandwidth, the smoothing is spread less. With a high bandwidth,
6 | the smoothing is spread more.
7 |
8 | The bandwidth should be set according to 2 factors: - the maximum size
9 | of the bandwidth should not smooth the signals outside of cells. by
10 | default we choose a bandwidth of 2.5 um, as this has a FWTM of ~10um,
11 | which is the average size of cells in the mouse SSp. This worked well
12 | for all examples in the SSAM paper. - the minimum size of the bandwidth
13 | should at least smooth signal to adjacent mRNA. From experience, this is
14 | not an issue for most ISH based techniques, but sequencing based
15 | techniques such as ISS can produce very sparse data and may require
16 | higher bandwidths to smooth signal sufficiently.
17 |
18 | Here is a close-up of the osmFISH mouse SSp dataset which investigates
19 | the effect of adjusting the kernel bandwidth. You can see that with a
20 | bandwidth of 1um the smoothing is sufficient, and with a bandwidth of
21 | 5um it is a little too much. The bandwidth of 2.5um appears to be a good
22 | balance of smoothing adjacent signal, while not smoothing into the
23 | adjacent area or losing sparse cell types.
24 |
25 | |image0|
26 |
27 | .. |image0| image:: ../images/kernel_bw.png
28 |
29 |
--------------------------------------------------------------------------------
/doc/userguide/07-input_mask.rst:
--------------------------------------------------------------------------------
1 | Input masks
2 | ===========
3 |
4 | For some tissue images you may want to restrict analysis to certain
5 | parts of the image. For example, the image may have degradation towards
6 | the edges, you may wish to exclude non tissue areas, or even perhaps
7 | restricting SSAM analysis to previously segmented areas.
8 |
9 | SSAM accepts input masks that are defined as polygons.
10 |
11 | Example for the VISp smFISH dataset:
12 |
13 | ::
14 |
15 | from matplotlib.patches import Polygon
16 | from matplotlib.collections import PatchCollection
17 |
18 | plt.figure(figsize=[5, 5])
19 | ds.plot_l1norm(cmap="Greys", rotate=1)
20 | ds.plot_localmax(c="Blue", rotate=1, s=0.1)
21 |
22 | patch = Polygon(xy, facecolor="black", edgecolor="red", linewidth=10, ls="-")
23 | p = PatchCollection([patch], alpha=0.4)
24 |     plt.gca().add_collection(p)
25 | plt.show()
26 |
27 | |image0|
28 |
29 | After the desired region is selected, a ``mask`` can be created. In this
30 | case we define an ``input_mask`` and ``output_mask`` which restricts all
31 | data processing and reported output to the selected region.
32 |
33 | ::
34 |
35 | from matplotlib.path import Path
36 |
37 | x, y = np.meshgrid(np.arange(ds.vf.shape[0]), np.arange(ds.vf.shape[1]))
38 | x, y = x.flatten(), y.flatten()
39 | points = np.vstack((x,y)).T
40 |
41 | path = Path(xy)
42 | input_mask = path.contains_points(points)
43 | output_mask = input_mask = input_mask.reshape((ds.vf.shape[1], ds.vf.shape[0], 1)).swapaxes(0, 1)
44 |
45 | .. |image0| image:: ../images/mask.png
46 |
47 |
--------------------------------------------------------------------------------
/doc/userguide/08-guided.rst:
--------------------------------------------------------------------------------
1 | SSAM *guided* analysis
2 | ======================
3 |
4 | The main visual output of SSAM is the creation of the cell-type map,
5 | which is created by classifying pixels in the tissue image based on
6 | either predefined or calculated gene expression signatures. When the
7 | gene expression signatures are already known, one can use SSAM in
8 | *guided* mode. When cell type signatures are known beforehand, we
9 | highly recommend running *guided* mode analysis as a quality check.
10 |
11 | Single cell RNA sequencing data
12 | -------------------------------
13 |
14 | We will use scRNA-seq data from `Tasic et al.
15 | 2018 `__ for the guided
16 | analysis. In the paper they identified “shared and distinct
17 | transcriptomic cell types across neocortical areas” in the mouse brain,
18 | also including the mouse VISp (which is our exmaple).
19 |
20 | First we need to load the data:
21 |
22 | ::
23 |
24 | scrna_cl = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/cl.feather")
25 | scrna_cl_df = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/cl_df.feather")
26 | scrna_genes = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/genes.feather")
27 | scrna_counts = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/counts.feather")
28 |
29 | scrna_clusters = scrna_cl['cluster_id']
30 |
31 | scrna_cl_dic = dict(zip(scrna_cl['cell_id'], scrna_cl['cluster_id']))
32 | scrna_cl_metadata_dic = dict(zip(
33 | scrna_cl_df['cluster_id'],
34 | zip(scrna_cl_df['cluster_label'],
35 | scrna_cl_df['cluster_color'], )
36 | ))
37 |
38 | qc_gene_indices = np.sum(scrna_counts > 0, axis=1) > 5
39 | scrna_genes_qc = np.array(scrna_genes)[qc_gene_indices]
40 |
41 | scrna_counts_qc = np.array(scrna_counts).T[:, qc_gene_indices]
42 |
43 | Normalisation
44 | -------------
45 |
46 | Once the data is loaded, we will normalise it using ``run_sctransform``:
47 |
48 | ::
49 |
50 | scrna_data_normalized = np.array(ssam.run_sctransform(scrna_counts_qc)[0])
51 |
52 | Cell-type gene expression signatures
53 | ------------------------------------
54 |
55 | Once the data is normalised, we can calculate the average gene
56 | expression per cell type (the ``centroids``), which can then be used for
57 | classifying pixels in the image
58 |
59 | ::
60 |
61 | selected_genes_idx = [list(scrna_genes_qc).index(g) for g in ds.genes]
62 | scrna_uniq_clusters = np.unique(scrna_clusters)
63 | scrna_centroids = []
64 | for cl in scrna_uniq_clusters:
65 | scrna_centroids.append(np.mean(scrna_data_normalized[:, selected_genes_idx][scrna_clusters == cl], axis=0))
66 |
67 | Generate a *guided* cell-type map
68 | ---------------------------------
69 |
70 | We can now continue to classify pixels in the tissue image using the
71 | cell-type gene expression signatures from the sc-RNAseq data.
72 |
73 | We map the local maxima vectors to the most similar clusters in the
74 | scRNA-seq data using a `correlation threshold of classifying
75 | pixels of ``0.6`` `__
76 |
77 | ::
78 |
79 | analysis.map_celltypes(scrna_centroids) # map the scRNAseq cell type signatures to the tissue image
80 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.3, output_mask=output_mask) # post-filter cell-type map to remove spurious pixels
81 |
82 | plt.figure(figsize=[5, 5]) # initiate the plotting area
83 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False) # SSAM plotting function
84 |
85 | |image0|
86 |
87 | Despite the guided mode producing passable results, we highly recommend
88 | using the `de novo mode for more accurate analysis `__.
89 |
90 | .. |image0| image:: ../images/guided.png
91 |
92 |
--------------------------------------------------------------------------------
/doc/userguide/09-celltype_map_thresh_g.rst:
--------------------------------------------------------------------------------
1 | Thresholding the guided cell-type map
2 | =====================================
3 |
4 | After cell-type signatures are provided, the tissue image can be
5 | classified. The classification of each pixel is based on the Pearson
6 | correlation metric (although an `experimental adversarial autoencoder
7 | based classification method `__ can be applied).
8 |
9 | We found that a minimum correlation threshold (``min_r``) of 0.3 worked
10 | well for guided mode based on single cell RNAseq cell-type signatures,
11 | and 0.6 worked well for *de novo* mode.
12 |
13 | Below we show how the cell-type map changes using correlation thresholds
14 | of ``0.15,0.3,0.45`` using the scRNAseq signatures
15 |
16 | ::
17 |
18 | scrna_uniq_labels = [scrna_cl_metadata_dic[i][0] for i in scrna_uniq_clusters]
19 | scrna_colors = [scrna_cl_metadata_dic[i][1] for i in scrna_uniq_clusters]
20 |
21 | analysis.map_celltypes(scrna_centroids)
22 |
23 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.15, output_mask=output_mask) # post-filter cell-
24 | plt.figure(figsize=[5, 5])
25 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False)
26 |
27 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.3, output_mask=output_mask) # post-filter cell-
28 | plt.figure(figsize=[5, 5])
29 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False)
30 |
31 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.45, output_mask=output_mask) # post-filter cell-
32 | plt.figure(figsize=[5, 5])
33 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False)
34 |
--------------------------------------------------------------------------------
/doc/userguide/10-de_novo.rst:
--------------------------------------------------------------------------------
1 | SSAM *de novo* analysis
2 | =======================
3 |
 4 | While we believe that the `guided mode of SSAM `__ is able
 5 | to generate good cell-type maps rapidly, the *de novo* mode provides much
 6 | more accurate results.
7 |
8 | The steps of the *de novo* analysis are briefly discussed below, with
9 | links to more detailed discussion:
10 |
11 | - `setting cell-type map correlation
12 | threshold `__
13 | - `visualisation of cell-type signatures: heatmap, tSNE,
14 | UMAP `__
15 |
16 | Clustering of expression vectors
17 | --------------------------------
18 |
19 | Once the local maxima have been selected and
20 | `filtered `__, we can perform `clustering
21 | analysis `__. SSAM supports `a number of clustering
22 | methods `__. Here we use the Louvain algorithm with 22
23 | principal components and a resolution of 0.15.
24 |
25 | ::
26 |
27 | analysis.cluster_vectors(
28 | min_cluster_size=0,
29 | pca_dims=22,
30 | resolution=0.15,
31 | metric='correlation')
32 |
33 | Cluster annotation and diagnostics
34 | ----------------------------------
35 |
36 | SSAM provides `diagnostic plots `__ which can be used to
37 | evaluate the quality of clusters, and `facilitates the annotation of
38 | clusters `__.
39 |
40 | Visualising the clusters
41 | ------------------------
42 |
43 | SSAM supports `cluster visualisation via heatmaps, and 2D embedding
44 | (t-SNE and UMAP) `__. Here we give an example of the
45 | t-SNE plot:
46 |
47 | ::
48 |
49 | plt.figure(figsize=[5, 5])
50 | ds.plot_tsne(pca_dims=22, metric="correlation", s=5, run_tsne=True)
51 | plt.savefig('images/tsne.png')
52 |
53 | .. figure:: ../images/tsne.png
54 | :alt: plot of t-SNE embedding of cell types
55 |
56 | plot of t-SNE embedding of cell types
57 |
58 | Cell type map
59 | -------------
60 |
61 | Once the clusters have been evaluated for quality, we can generate the
62 | *de novo* cell-type map. This involves `classifying all the pixels in
63 | the tissue image based on a correlation
64 | threshold `__. For the *de novo* application
65 | ``0.6`` was found to perform well:
66 |
67 | ::
68 |
69 | analysis.map_celltypes()
70 |
71 | filter_params = {
72 | "block_size": 151,
73 | "method": "mean",
74 | "mode": "constant",
75 | "offset": 0.2
76 | }
77 |
78 | analysis.filter_celltypemaps(min_norm="local", filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50, output_mask=output_mask)
79 |
80 | ::
81 |
82 | plt.figure(figsize=[5, 5])
83 | ds.plot_celltypes_map(rotate=1, set_alpha=False)
84 | plt.axis('off')
85 | plt.savefig('images/de_novo.png')
86 |
87 | .. figure:: ../images/de_novo.png
88 | :alt: plot of the de novo generated celltype map
89 |
90 | plot of the de novo generated celltype map
91 |
92 | We can now use our celltype map to infer a map of `tissue
93 | domains `__.
94 |
--------------------------------------------------------------------------------
/doc/userguide/11-max_filtering.rst:
--------------------------------------------------------------------------------
1 | Filtering local maxima
2 | ======================
3 |
4 | As demonstrated in the `SSAM
5 | paper `__, local L1
6 | maxima selection is an effective way of downsampling the entire vector
7 | field for faster computation, and they better represent known gene
8 | expression profiles compared to random downsampling.
9 |
10 | However, local maxima in the vector field can arise from undesirable
11 | locations, e.g. singleton mRNAs. Such less informative local maxima
12 | should therefore be filtered out.
13 |
14 | We recommend applying threshold for individual genes, and for the total
15 | gene expression.
16 |
17 | Per gene expression threshold
18 | -----------------------------
19 |
20 | The per gene threshold should be at least the height of a single
21 | Gaussian curve over an mRNA. This can easily be empirically determined
22 | by visual analysis. In this multiplexed smFISH example, the per gene
23 | expression threshold, ``exp_thres`` is set to 0.027
24 |
25 | ::
26 |
27 | exp_thres = 0.027
28 | viewport = 0.1
29 | gindices = np.arange(len(ds.genes))
30 | np.random.shuffle(gindices)
31 | plt.figure(figsize=[5, 7])
32 | for i, gidx in enumerate(gindices[:6], start=1):
33 | ax = plt.subplot(5, 2, i)
34 | n, bins, patches = ax.hist(ds.vf[..., gidx][np.logical_and(ds.vf[..., gidx] > 0, ds.vf[..., gidx] < viewport)], bins=100, log=True, histtype=u'step')
35 | ax.set_xlim([0, viewport])
36 | ax.set_ylim([n[0], n[-1]])
37 | ax.axvline(exp_thres, c='red', ls='--')
38 | ax.set_title(ds.genes[gidx])
39 | ax.set_xlabel("Expression")
40 | ax.set_ylabel("Count")
41 | plt.tight_layout()
42 | pass
43 |
44 | |image0|
45 |
46 | Total gene expression threshold
47 | -------------------------------
48 |
49 | The total gene threshold should be empirically determined by examining the
50 | curve of total gene expression of local maxima. This isn’t always easy,
51 | and we highly encourage investigating this thoroughly.
52 |
53 | ::
54 |
55 | norm_thres = 0.2
56 | gidx = 0
57 | plt.figure(figsize=[5, 2])
58 | #plt.hist(ds.vf[..., gidx][ds.vf[..., gidx] > 0], bins=100, log=True)
59 | n, _, _ = plt.hist(ds.vf_norm[np.logical_and(ds.vf_norm > 0, ds.vf_norm < 0.3)], bins=100, log=True, histtype='step')
60 | ax = plt.gca()
61 | ax.axvline(norm_thres, c='red', ls='--')
62 | ax.set_xlabel("L1-norm")
63 | ax.set_ylabel("Count")
64 |
65 | plt.xlim([0, 0.3])
66 | plt.ylim([np.min(n), np.max(n) + 100000])
67 | pass
68 |
69 | |image1|
70 |
71 | Filtering “stray” local maxima using k-nearest neighbour density
72 | ================================================================
73 |
74 | If there is mRNA signal originating from outside the tissue area (due to
75 | background noise), it would improve downstream analysis to remove such
76 | vectors. We observed this in the osMFISH data. These “stray” local
77 | maxima tend to be less dense than local maxima from the tissue area:
78 |
79 | |image2|
80 |
81 | Because of this, they can be effectively filtered using their k-nearest
82 | neighbor density, in this example setting the ``threshold`` to 0.002.
83 |
84 | ::
85 |
86 | from sklearn.neighbors import KDTree
87 | X = np.array([ds.local_maxs[0], ds.local_maxs[1]]).T
88 | kdt = KDTree(X, leaf_size=30, metric='euclidean')
89 | rho = 100 / (np.pi * kdt.query(X, k=100)[0][:, 99] ** 2)
90 |
91 | threshold = 0.002
92 |
93 | plt.figure(figsize=[5, 2.5])
94 | plt.hist(rho, bins=100, histtype='step')
95 | plt.axvline(x=threshold, color='r', linestyle='--')
96 |
97 | ax = plt.gca()
98 | ax.set_xlabel("Local KNN density")
99 | ax.set_ylabel("Count")
100 | pass
101 |
102 | |image3|
103 |
104 | … and a quick look at the before and after in the osmFISH dataset
105 |
106 | |image4|
107 |
108 | .. |image0| image:: ../images/local_max_threshold_gene.png
109 | .. |image1| image:: ../images/local_max_threshold_total.png
110 | .. |image2| image:: ../images/local_max_threshold_knn.png
111 | .. |image3| image:: ../images/local_max_threshold_knn2.png
112 | .. |image4| image:: ../images/local_max_threshold_knn3.png
113 |
114 |
--------------------------------------------------------------------------------
/doc/userguide/12-clustering.rst:
--------------------------------------------------------------------------------
1 | Clustering Local L-1 Maxima
2 | ===========================
3 |
4 | In the *de novo* mode analysis, after the local maxima have been
5 | identified from the tissue image, they are clustered.
6 |
7 | The default clustering algorithm is based on `Louvain community
8 | detection `__.
9 | SSAM also supports clustering using ``hdbscan`` and ``optics``.
10 |
11 | It can be initiated by:
12 |
13 | ::
14 |
15 | analysis.cluster_vectors(method="louvain",
16 | pca_dims=-1,
17 | min_cluster_size=2,
18 | max_correlation=1.0,
19 | metric="correlation",
20 | outlier_detection_method='medoid-correlation',
21 | outlier_detection_kwargs={},
22 | random_state=0,
23 | **kwargs)
24 |
25 | … where - ``method`` can be ``louvain``, ``hdbscan``, ``optics``. -
26 | ``pca_dims`` are the number of principal components used for clustering.
27 | - ``min_cluster_size`` is the minimum cluster size. - ``resolution`` is
28 | the resolution for Louvain community detection. - ``prune`` is the
29 | threshold for Jaccard index (weight of SNN network). If it is smaller
30 | than prune, it is set to zero. - ``snn_neighbors`` is the number of
31 | neighbors for SNN network. - ``max_correlation`` is the threshold for
32 | which clusters with higher correlation to this value will be merged. -
33 | ``metric`` is the metric for calculation of distance between vectors in
34 | gene expression space. - ``subclustering`` if set to True, each cluster
35 | will be clustered once again with DBSCAN algorithm to find more
36 | subclusters. - ``dbscan_eps`` is the ``eps`` value for DBSCAN
37 | subclustering. Not used when ‘subclustering’ is set False. -
38 | ``centroid_correction_threshold`` is the threshold for which centroid
39 | will be recalculated with the vectors which have the correlation to the
40 | cluster medoid equal or higher than this value. - ``random_state`` is
41 | the random seed or scikit-learn’s random state object to replicate the
42 | same result
43 |
44 | Removing outliers
45 | -----------------
46 |
47 | The cell type signature is determined as the centroid of the cluster.
48 | This can be affected by outliers, so SSAM supports a number of outlier
49 | removal methods:
50 |
51 | ::
52 |
53 | analysis.remove_outliers(outlier_detection_method='medoid-correlation', outlier_detection_kwargs={}, normalize=True)
54 |
55 | .. where - ``outlier_detection_method`` can be ``medoid-correlation``,
56 | ``robust-covariance``, ``one-class-svm``, ``isolation-forest``,
57 | ``local-outlier-factor`` - ``outlier_detection_kwargs`` are arguments
58 | passed to the outlier detection method
59 |
--------------------------------------------------------------------------------
/doc/userguide/13-diagnostic.rst:
--------------------------------------------------------------------------------
1 | Diagnostic plots
2 | ================
3 |
4 | After unsupervised clustering of gene expression vectors, some clusters
5 | may need to be manually merged or discarded. SSAM supports merging of
6 | clusters based on correlation of gene expression profile, however in
7 | many cases manual inspection is needed to rule out any non-trivial
8 | issues.
9 |
10 | To guide this process, SSAM generates a cluster-wise ‘diagnostic plot’,
11 | which consists of four panels: 1) location of the clustered vectors on
12 | the tissue image, 2) the pixels classified to belong to the cluster
13 | signature (the cluster centroid), 3) the mean expression profile of the
14 | clustered vectors, and 4) the t-SNE or UMAP embedding.
15 |
16 | In the three datasets analyzed the clusters to be merged or removed
17 | often showed a discordance between the location of sampled vectors used
18 | to determine the cluster (panel 1) and the pixels classified to belong
19 | to that cluster (panel 2). In case of overclustering, i.e. when a
20 | cell-type signature is split over 2 clusters, the map typically does not
21 | classify the full shape of the cells but instead only fragments (panel
22 | 2), and having almost the same marker gene expression of another cluster
23 | (panel 3). Such clusters can be merged.
24 |
25 | For dubious clusters that should be removed, we observed that vectors
26 | usually originate from outside the tissue region or from image artifacts
27 | (panel 1), or that the gene expression does not show any clear
28 | expression of marker genes or similarity to expected gene expression
29 | profiles (panel 3).
30 |
31 | The remaining clusters are then annotated by comparing cluster marker
32 | genes to known cell-type markers. Note that in many cases, the identity
33 | of clusters can be easily assigned by comparing the centroids of the
34 | clusters to the known cell-type signatures, e.g., from single cell RNA
35 | sequencing.
36 |
37 | To support rapid annotation of cell types to clusters, SSAM additionally
38 | shows the highest correlating known cell-type signature should this data
39 | be available in panel 3.
40 |
41 | Example 1: a large cluster that can be easily annotated
42 | -------------------------------------------------------
43 |
44 | Local maxima (panel 1), correspond to the same area (panel 2), and
45 | matches known gene expression patterns of *Vip Arhgap36 Hmcn1* cell
46 | types from scRNAseq experiments with high correlation (panel 3)
47 |
48 | |image0|
49 |
50 | Example 2: a large cluster that cannot be easily annotated
51 | ----------------------------------------------------------
52 |
53 | Local maxima (panel 1), correspond to the same area (panel 2). The gene
54 | expression profile has a good correlation to *L2/3 IT VISp Adamts2* cell
55 | types, but are lacking the very high expression of *Pde1a*. In this
56 | particular case, one would need to check other clusters matching this
57 | cell type and perhaps merge them, or perhaps this indicates low
58 | efficiency of the *Pde1a* probe in the experiment.
59 |
60 | |image1|
61 |
62 | Example 3: a small cluster that is good
63 | ---------------------------------------
64 |
65 | Despite only 2 local maxima (panel 1), the classified pixels correspond
66 | to the same area (panel 2), and matches known gene expression patterns
67 | (panel 3). This represents a very rare Sst Chodl cell type.
68 |
69 | |image2|
70 |
71 | Example 4: a small cluster that is questionable
72 | -----------------------------------------------
73 |
74 | Sampled local maxima (panel 1) do not correspond to the classified pixels
75 | (panel 2), and do not clearly match known gene expression patterns
76 | (panel 3).
77 |
78 | |image3|
79 |
80 | .. |image0| image:: ../images/diagplot_centroid_2.png
81 | .. |image1| image:: ../images/diagplot_centroid_5.png
82 | .. |image2| image:: ../images/diagplot_centroid_30.png
83 | .. |image3| image:: ../images/diagplot_centroid_8.png
84 |
85 |
--------------------------------------------------------------------------------
/doc/userguide/14-cluster_annotation.rst:
--------------------------------------------------------------------------------
1 | Cluster annotation
2 | ==================
3 |
4 | In a typical single cell RNAseq experiment, the process of annotating
5 | cell types manually can be laborious and as such, `a number of automated
6 | methods have emerged `__.
7 |
 8 | In a typical *in situ* transcriptomics experiment, the annotation of
 9 | cell types is usually much easier as these assays usually profile
10 | established cell-type markers. Clusters can be annotated easily based on
11 | marker gene expression.
12 |
13 | The `diagnostic plots `__ can be used to compare existing
14 | signatures against those identified *de novo*
15 |
16 | ::
17 |
18 | from scipy.stats import pearsonr, spearmanr
19 |
20 | for idx in range(len(ds.centroids)):
21 | plt.figure(figsize=[50, 15])
22 | ds.plot_diagnostic_plot(idx, known_signatures=[
23 | ("scRNA-seq", scrna_uniq_labels, scrna_centroids, scrna_colors),
24 | ], correlation_methods=[
25 | ("r", pearsonr),
26 | ("rho", spearmanr)
27 | ])
28 | plt.tight_layout()
29 | plt.savefig('diagplots_multiplexed_smFISH/diagplot_centroid_%d.png'%idx)
30 | plt.close()
31 |
32 | This will generate a diagnostic plot for each cluster, which can be used
33 | to assign cluster labels. E.g. the following cluster matches known gene
34 | expression patterns of Vip Arhgap36 Hmcn1 cell types from scRNAseq
35 | experiments with high correlation (panel 3):
36 |
37 | |image0|
38 |
39 | While this is a good example of a cluster that can be easily annotated,
40 | some clusters may represent noise and would need to be removed, and
41 | when overclustering occurs then clusters may have to be merged. The
42 | `diagnostic plots documentation `__ assists the decision-
43 | making process.
44 |
45 | Once each cluster is reviewed, a cell type can be assigned, removed, or
46 | merged. In the following code snippet, we show an elegant way to
47 | annotate, remove, and merge clusters.
48 |
49 | 1) Determine that (i) clusters with a name will be annotated, (ii)
50 | clusters with a “N/A” will be removed, (iii) clusters with the same
51 | name will be merged
52 |
53 | ::
54 |
55 | denovo_labels = [
56 | "N/A",
57 | "VLMC",
58 | "Vip Arhgap36 Hmcn1 / Vip Igfbp4 Map21l1",
59 | "L2/3 IT Rrad",
60 | "N/A",
61 | "L2/3 IT Adamts2",
62 | "Sst Nts / Sst Rxfp1 Eya1",
63 | "Lamp5 Lsp1",
64 | "N/A",
65 | "Sst Crhr2 Efemp1 / Sst Esm1",
66 |
67 | "Pvalb Calb1 Sst / Pvalb Reln Tac1",
68 | "Astro Aqp4",
69 | "L6 IT Penk Fst",
70 | "L4 IT Superficial",
71 | "L5 IT Col27a1",
72 | "L2/3 IT Adamts2",
73 | "OPC",
74 | "Oligo",
75 | "L4 IT Rspo1",
76 | "L5 NP Trhr Met",
77 |
78 | "L5 IT Hsd11b1 Endou",
79 | "Pvalb Th Sst / Pvalb Reln Tac1",
80 | "L6 CT Ctxn3 Brinp3 / L6 CT Gpr139",
81 | "L5 PT Chrna6",
82 | "L5 IT Batf3",
83 | "L5 PT C1ql2 Cdh13",
84 | "L5 PT Krt80",
85 | "L6 IT Penk Col27a1",
86 | "L6 IT Penk Col27a1",
87 | "L6b Crh",
88 |
89 | "Sst Chodl",
90 | ]
91 |
92 | 2) make objects for storing the index of clusters to be annotated,
93 | removed and merged
94 |
95 | ::
96 |
97 | denovo_labels_final = []
98 | exclude_indices = []
99 | merge_indices = []
100 |
101 | 3) iterate over the ``denovo_labels`` object and populate the
102 | ``denovo_labels_final``, ``exclude_indices``, ``merge_indices``
103 | objects
104 |
105 | ::
106 |
107 | for idx, cl in enumerate(denovo_labels):
108 | if cl == 'N/A':
109 | exclude_indices.append(idx)
110 | continue
111 | if cl in denovo_labels_final:
112 | continue
113 | denovo_labels_final.append(cl)
114 |
115 | for cl in np.unique(denovo_labels):
116 | if cl == 'N/A':
117 | continue
118 | mask = [cl == e for e in denovo_labels]
119 | if np.sum(mask) > 1:
120 | merge_indices.append(np.where(mask)[0])
121 |
122 | 4) plot the removed clusters in t-SNE embedding
123 |
124 | ::
125 |
126 | cmap = plt.get_cmap('jet')
127 | jet_colors = cmap(np.array(list(range(len(ds.centroids)))) / (len(ds.centroids) - 1))
128 | tsne_colors = np.zeros_like(jet_colors)
129 | tsne_colors[..., :] = [0.8, 0.8, 0.8, 1]
130 | tsne_colors[exclude_indices] = [0, 0, 0, 1] #jet_colors[exclude_indices]
131 | import matplotlib.patheffects as PathEffects
132 | plt.figure(figsize=[5, 5])
133 | ds.plot_tsne(pca_dims=33, metric="correlation", s=5, run_tsne=False, colors=tsne_colors)
134 | plt.axis('off')
135 |
136 | |image1|
137 |
138 | 5) plot the merged clusters in t-SNE embedding
139 |
140 | ::
141 |
142 | cmap = plt.get_cmap('rainbow')
143 | jet_colors = cmap(np.array(list(range(len(merge_indices)))) / (len(merge_indices) - 1))
144 | plt.figure(figsize=[5, 5])
145 | tsne_colors = np.zeros([len(ds.centroids), 4])
146 | tsne_colors[..., :] = [0.8, 0.8, 0.8, 1]
147 | for idx, mi in enumerate(merge_indices):
148 | tsne_colors[mi] = jet_colors[idx]
149 | ds.plot_tsne(pca_dims=33, metric="correlation", s=5, run_tsne=False, colors=tsne_colors)
150 | plt.axis('off')
151 |
152 | |image2|
153 |
154 | 6) update the ``analysis`` object with the clusters to remove and merge
155 |
156 | ::
157 |
158 | analysis.exclude_and_merge_clusters(exclude_indices, merge_indices, centroid_correction_threshold=0.6)
159 |
160 | .. |image0| image:: ../images/diagplot_centroid_2.png
161 | .. |image1| image:: ../images/tsne_removed.png
162 | .. |image2| image:: ../images/tsne_merged.png
163 |
164 |
--------------------------------------------------------------------------------
/doc/userguide/15-celltype_map_thresh_d.rst:
--------------------------------------------------------------------------------
1 | Thresholding the de-novo cell-type map
2 | ======================================
3 |
4 | After cell-type signatures are calculated, the tissue image can be
5 | classified. The classification of each pixel is based on the Pearson
6 | correlation metric (although an `experimental adversarial autoencoder
7 | based classification method `__ can be applied).
8 |
9 | We found that a minimum correlation threshold (``min_r``) of 0.3 worked
10 | well for guided mode based on single cell RNAseq cell-type signatures,
11 | and 0.6 worked well for *de novo* mode.
12 |
13 | Below we show how the cell-type map changes using correlation thresholds
14 | of ``0.4,0.6,0.8`` for the *de novo* cell-type map.
15 |
16 | ::
17 |
18 | analysis.map_celltypes()
19 |
20 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.4, fill_blobs=True, min_blob_area=50, output_mask=output_mask)
21 | plt.figure(figsize=[5, 5])
22 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False)
23 |
24 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50, output_mask=output_mask)
25 | plt.figure(figsize=[5, 5])
26 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False)
27 |
28 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.8, fill_blobs=True, min_blob_area=50, output_mask=output_mask)
29 | plt.figure(figsize=[5, 5])
30 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False)
31 |
--------------------------------------------------------------------------------
/doc/userguide/16-visualisation.rst:
--------------------------------------------------------------------------------
1 | Visualisation of 2D gene expression embeddings (t-SNE and UMAP)
2 | ===============================================================
3 |
4 | An important part of presenting the summary of the clustering analysis
5 | is 2D visualisation via embedding.
6 |
7 | `UMAP `__ and
8 | `t-SNE `__,
9 | are 2 common dimensionality reduction methods that can be useful for
10 | displaying clustering results.
11 |
12 | Running t-SNE
13 | -------------
14 |
15 | To run the t-SNE on the ``ds`` object:
16 | ``ds.run_tsne(pca_dims=-1,n_iter=5000, perplexity=70, early_exaggeration=10, metric="correlation", exclude_bad_clusters=True, random_state=0, tsne_kwargs={})``
17 |
18 | .. where:
19 |
20 | - ``pca_dims``: Number of PCA dimensions used for the tSNE embedding.
21 | - ``n_iter``: Maximum number of iterations for the tSNE.
22 | - ``perplexity``: The perplexity value of the tSNE (please refer to the
23 | section `How should I set the perplexity in
24 | t-SNE? `__ ).
25 | - ``early_exaggeration``: Early exaggeration parameter for tSNE.
26 | Controls the tightness of the resulting tSNE plot.
27 | - ``metric``: Metric for calculation of distance between vectors in
28 | gene expression space.
29 | - ``exclude_bad_clusters``: If true, the vectors that are excluded by
30 | the clustering algorithm will not be considered for tSNE computation.
31 | - ``random_state``: Random seed or scikit-learn’s random state object
32 | to replicate the same result
33 | - ``tsne_kwargs``: Other keyword parameters for tSNE.
34 |
35 | Running UMAP
36 | ------------
37 |
38 | To run the UMAP on the ``ds`` object:
39 | ``ds.run_umap(pca_dims=-1, metric="correlation", min_dist=0.8, exclude_bad_clusters=True, random_state=0, umap_kwargs={})``
40 |
41 | .. where:
42 |
43 | - ``pca_dims``: Number of PCA dimensions used for the UMAP embedding.
44 | - ``metric``: Metric for calculation of distance between vectors in
45 | gene expression space.
46 | - ``min_dist``: ‘min_dist’ parameter for UMAP.
47 | - ``exclude_bad_clusters``: If true, the vectors that are excluded by
48 | the clustering algorithm will not be considered for UMAP computation.
49 | - ``random_state``: Random seed or scikit-learn’s random state object
50 | to replicate the same result
51 | - ``umap_kwargs``: Other keyword parameters for UMAP.
52 |
53 | Plotting embeddings
54 | -------------------
55 |
56 | Plotting of the t-SNE and UMAP embeddings can be performed by:
57 |
58 | ::
59 |
60 | ds.plot_embedding(method='umap')
61 | ds.plot_embedding(method='tSNE')
62 |
63 | |image0|
64 |
65 | .. |image0| image:: ../images/tsne_final.png
66 |
67 |
--------------------------------------------------------------------------------
/doc/userguide/17-domain.rst:
--------------------------------------------------------------------------------
1 | Identifying tissue domains
2 | ==========================
3 |
4 | Cells are organised into tissues and organs. Spatial gene expression not
5 | only allows the identification of cell types *in situ*, but also allows
6 | investigation of how these cells are organised.
7 |
8 | SSAM facilitates the identification of “tissue domains”, which are
9 | regions in the tissue exhibiting similar local cell type composition.
10 | This is based on circular window sampling with a defined ``radius`` and
11 | ``step``, which is then followed by `agglomerative
12 | clustering `__.
13 |
14 | Perform circular window sampling
15 | --------------------------------
16 |
17 | The first step is to sample cell-type composition in circular sweeping
18 | windows. For this, the size of circular window (``radius``) and the step
19 | between each sampling (``step``) has to be defined. The units here are
20 | in um, which is also equivalent to pixels in this example. The following
21 | performs this sampling using a circular window of 100um, with 10um
22 | steps:
23 |
24 | ::
25 |
26 | analysis.bin_celltypemaps(step=10, radius=100)
27 |
28 | Clustering domain signatures
29 | ----------------------------
30 |
31 | After performing the sampling, we continue with identifying domain
32 | signatures through clustering. This is based on agglomerative clustering
33 | to identify the initial clusters (``n_clusters``) of windows which
34 | include a minimum number of classified pixels (``norm_thres``), followed
35 | cluster merging when the correlation between clusters exceeds a
36 | threshold (``merge_thres``). The merging of clusters can be restricted
37 | to adjacent clusters (``merge_remote=False``), or not restricted to
38 | spatial proximity (``merge_remote=True``).
39 |
40 | ::
41 |
42 | analysis.find_domains(n_clusters=20, merge_remote=True, merge_thres=0.7, norm_thres=1500)
43 |
44 | Visualizing identified domains
45 | ------------------------------
46 |
47 | Once the domains have been identified, they have to be visualised for
48 | evaluation.
49 |
50 | ::
51 |
52 | from matplotlib.colors import ListedColormap
53 | cmap_jet = plt.get_cmap('jet')
54 | num_domains = np.max(ds.inferred_domains_cells) + 1
55 |
56 | fig, axs = plt.subplots(1, num_domains, figsize=(4*num_domains, 4))
57 | for domain_idx in range(num_domains):
58 | ax = axs[domain_idx]
59 | plt.sca(ax)
60 | plt.axis('off')
61 | cmap = ListedColormap([cmap_jet(lbl_idx / num_domains) if domain_idx == lbl_idx else "#cccccc" for lbl_idx in range(num_domains)])
62 | ds.plot_domains(rotate=1, cmap=cmap)
63 | plt.tight_layout()
64 | plt.savefig(f'plots/domains_individual')
65 |
66 | .. figure:: ../images/domains_individual.png
67 | :alt: side by side plot of all tissue domains
68 |
69 | side by side plot of all tissue domains
70 |
71 | Post-processing the identified domains
72 | --------------------------------------
73 |
74 | In certain cases, one may wish to **exclude certain domains**
75 | (``excluded_domain_indices``) as they may originate from tissue
76 | artifacts or contain no information. In our case the third domain (0
77 | based index 2) seems to be an artifact and the fourth one contains no
78 | useful information. The first two domains are obviously part of the same
79 | layer and can therefore be merged.
80 |
81 | Due to possible imaging artifacts such as tiling, some domains might be
82 | split. While it is still possible to tune the ``merge_thres`` in the
83 | clustering step, one can simply perform this as manual post processing.
84 | In the case above, there do not appear to be any domains that require
85 | merging.
86 |
87 | Once the domains to be excluded or merged have been determined, they can
88 | be excluded and merged:
89 |
90 | ::
91 |
92 | excluded_domain_indices = [2,3,7,10]
93 | merged_domain_indices = [[0,1],[9,11]]
94 | analysis.exclude_and_merge_domains(excluded_domain_indices, merged_domain_indices)
95 |
96 | The final plot
97 | --------------
98 |
99 | The individual domains represent the established neocortex layering
100 | patterns found in the mouse brain. We can continue with assigning domain
101 | colours, names, and plotting all of the domains together.
102 |
103 | ::
104 |
105 | plt.figure(figsize=[5, 5])
106 | ds.plot_domains(rotate=1)
107 |
108 | |image0|
109 |
110 | .. |image0| image:: ../images/final.png
111 |
112 |
--------------------------------------------------------------------------------
/doc/userguide/18-composition.rst:
--------------------------------------------------------------------------------
1 | Cell-type composition analysis in tissue domains
2 | ================================================
3 |
4 | After identifying `tissue domains `__ that exhibit specific
5 | cell-type composition properties, it may be desirable to report the
6 | cell-type composition properties of the identified domains.
7 |
8 | In the `SSAM
9 | manuscript `__ we used
10 | this functionality to identify that astrocyte cell-type representation
11 | in neocortex layers was previously under-reported, and identified the
12 | cell-type composition of novel layering patterns in the primary visual
13 | cortex (VISp).
14 |
15 | Performing the cell-type composition analysis
16 | ---------------------------------------------
17 |
18 | The analysis is initiated on the ``analysis`` object:
19 |
20 | ::
21 |
22 | analysis.calc_cell_type_compositions()
23 |
24 | Plotting the composition of each domain
25 | ---------------------------------------
26 |
27 | Once this has completed, you can plot the cell-type composition of the
28 | different layers using the plot function. In the following example, we
29 | plot the 7 identified layers (``domain_index = 0-6``) in the order that
30 | they would appear in the neocortex:
31 |
32 | ::
33 |
34 | # note - this could be wrapped up into a function
35 | for domain_idx in [1, 0, 2, 3, 4, 5, 6]:
36 | plt.figure(figsize=[5, 5])
37 | ds.plot_celltype_composition(domain_idx,
38 | cell_type_colors=denovo_celltype_colors,
39 | cell_type_orders=heatmap_clusters_index[::-1],
40 | label_cutoff=0.03)
41 | plt.title(domain_labels[domain_idx])
42 |
43 | |image0|
44 |
45 | Plotting the composition of the entire tissue
46 | ---------------------------------------------
47 |
48 | It would be worthwhile to compare the cell-type composition within each
49 | domain, and compare this to what is observed over the entire tissue. The
50 | cell-type composition over the entire tissue is stored as the last
51 | domain, in this case the 8th element (``domain_index = 7``):
52 |
53 | ::
54 |
55 | # note - this can be wrapped up into a function
56 | plt.figure(figsize=[5, 5])
57 | ds.plot_celltype_composition(domain_index=7,
58 | cell_type_colors=denovo_celltype_colors,
59 | cell_type_orders=heatmap_clusters_index[::-1],
60 | label_cutoff=0.03)
61 | plt.title('All')
62 |
63 | |image1|
64 |
65 | .. |image0| image:: ../images/domain_composition.png
66 | .. |image1| image:: ../images/domain_composition_all.png
67 |
68 |
--------------------------------------------------------------------------------
/doc/userguide/19-experimental.rst:
--------------------------------------------------------------------------------
1 | Experimental features
2 | =====================
3 |
 4 | We will endeavour to improve the functionality of SSAM by implementing
 5 | novel features. So far, these experimental features only work with the ``develop`` branch of SSAM.
6 |
7 | The current novel features supported by SSAM include:
8 |
9 | - `Adversarial Auto Encoder based classification `__
10 |
11 | - `Segmenting the cell-type map `__
12 |
--------------------------------------------------------------------------------
/doc/userguide/20-aaec.rst:
--------------------------------------------------------------------------------
1 | Cell-type classification using Adversarial Autoencoders
2 | =======================================================
3 |
4 | The default classification algorithm is based on Pearson correlation as
5 | this has been `shown to be effective for automatic classification of
6 | cell types `__ for single
7 | cell RNAseq experiments. This proved to be both highly performant and
8 | accurate also for spatial gene expression data. However, it may be
9 | desirable to explore other classification methods.
10 |
11 | One recent and exciting Deep Learning framework that achieves competitive
12 | results in generative modeling and semi-supervised classification tasks
13 | is the `adversarial autoencoder `__.
14 |
15 | SSAM implements a modified version of adversarial autoencoder classifier
16 | based on the `original
17 | implementation `__
18 | by `Shahar Azulay `__.
19 |
20 | Mapping cell types using an adversarial autoencoder
21 | ---------------------------------------------------
22 |
23 | In order to use the AAEC classification of pixels instead of the Pearson
24 | correlation based method, simply replace ``analysis.map_celltypes()``
25 | with:
26 |
27 | ::
28 |
29 | analysis.map_celltypes_aaec(epochs=1000, seed=0, batch_size=1000, chunk_size=100000, z_dim=10, noise=0)
30 |
31 |
--------------------------------------------------------------------------------
/doc/userguide/21-segment_celltype_map.rst:
--------------------------------------------------------------------------------
1 | Segmenting the SSAM cell type map
2 | =================================
3 |
4 | While we demonstrate the accuracy of SSAM in reconstructing celltype
5 | maps, we understand that many applications in biology require cell
6 | segmentation. As such, the development branch of SSAM supports
7 | segmentation of the celltype map using the ``watershed`` algorithm.
8 |
9 | **This is an experimental feature!**
10 |
11 | The segmentation of the cell type map can be performed by:
12 |
13 | .. code-block:: python
14 |
15 | # Load DAPI image
16 | with open('zenodo/osmFISH/raw_data/im_nuc_small.pickle', 'rb') as f:
17 | dapi = pickle.load(f)
18 | dapi_small = np.hstack([dapi.T[:1640], np.zeros([1640, 12])]).reshape(ds.vf_norm.shape)
19 |
20 | # Threshold DAPI image to create markers
21 | dapi_threshold = filters.threshold_local(dapi_small[..., 0], 35, offset=-0.0002)
22 | dapi_thresh_im = dapi_small[..., 0] > dapi_threshold
23 | dapi_thresh_im = dapi_thresh_im.reshape(ds.vf_norm.shape).astype(np.uint8) * 255
24 |
25 | # Run watershed segmentation of cell-type maps with DAPI as markers
26 | # After running below, the segmentation data will be available as:
27 | # - Segmentations: ds.watershed_segmentations
28 | # - Cell-type map: ds.watershed_celltype_map
29 | analysis.run_watershed(dapi_thresh_im)
30 |
31 | Below we demonstrate the application of the segmentation on the *de
32 | novo* celltype map generated for the mouse SSp osmFISH data.
33 |
34 | |image0|
35 |
36 | .. |image0| image:: ../images/segmented_celltype_map.png
37 |
38 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | pandas
4 | matplotlib
5 | seaborn
6 | scikit-learn
7 | umap-learn
8 | python-louvain
9 | sparse
10 | scikit-image
11 | pyarrow
12 | packaging
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import io
import sys
import setuptools
try:
    import numpy as np
except ImportError:
    print("Please install Numpy first. e.g. pip install numpy")
    sys.exit(1)  # sys.exit instead of the interactive-only exit() builtin

# C extension with OpenMP-parallelized kernels (KDE, correlation maps).
module_utils = setuptools.extension.Extension(
    'ssam.utils',
    sources=["c/utils.cpp"],
    extra_compile_args=["-fopenmp"],
    extra_link_args=["-fopenmp"],
    include_dirs=[np.get_include()],
)

with io.open("README.rst", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="ssam",
    version="1.0.2",
    author="Jeongbin Park",
    author_email="j.park@dkfz-heidelberg.de",
    description="SSAM",
    long_description=long_description,
    # README.rst is reStructuredText, not Markdown; "text/markdown" caused
    # PyPI to render the long description incorrectly.
    long_description_content_type="text/x-rst",
    url="https://github.com/HiDiHlabs/ssam",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
        "Operating System :: POSIX",
    ],
    ext_modules=[module_utils],
    install_requires=[
        "numpy",
        "scipy",
        "pandas",
        "matplotlib",
        "seaborn",
        "scikit-learn",
        "umap-learn",
        "python-louvain",
        "sparse",
        "scikit-image",
        "pyarrow",
        "packaging",
    ]
)
46 |
--------------------------------------------------------------------------------
/ssam/__init__.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
import seaborn as sns
import multiprocessing
import os
import sys
sns.set()
sns.set_style("whitegrid", {'axes.grid' : False})
from functools import reduce
from sklearn.neighbors import KernelDensity
from sklearn import preprocessing
import scipy
from scipy import ndimage
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from multiprocessing import Pool
from contextlib import closing
from tempfile import mkdtemp, TemporaryDirectory
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import KMeans
import community
import networkx as nx
from sklearn.cluster import DBSCAN
import sparse
from skimage import filters
from skimage.morphology import disk
from skimage import measure
from matplotlib.colors import ListedColormap
import pickle
import subprocess
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering
from PIL import Image
from scipy.ndimage import zoom
import pyarrow
import time
from packaging import version

from .utils import corr, calc_ctmap, calc_corrmap, flood_fill, calc_kde
42 |
43 | def _fast_gaussian_kde(args):
44 | # TODO: 1) support sampling distance
45 | # 2) support other kernels
46 | (bandwidth, save_dir, gene_name, shape, locations, sampling_distance) = args
47 |
48 | print('Processing gene %s...'%gene_name)
49 |
50 | maxdist = int(bandwidth * 4)
51 | span = np.linspace(-maxdist,maxdist,maxdist*2+1)
52 | X, Y, Z = np.meshgrid(span,span,span)
53 |
54 | def create_kernel(x, y, z):
55 | X_=(-x+X)/bandwidth
56 | Y_=(-y+Y)/bandwidth
57 | Z_=(-z+Z)/bandwidth
58 | return np.exp(-0.5*(X_**2+Y_**2+Z_**2))
59 |
60 | pd = np.zeros(shape)
61 | for loc in locations:
62 | int_loc = [int(i) for i in loc]
63 | rem_loc = [i%1 for i in loc]
64 |
65 | kernel = create_kernel(*rem_loc)
66 |
67 | pos_start = [i - maxdist for i in int_loc]
68 | pos_end = [i + maxdist + 1 for i in int_loc]
69 |
70 | kernel_pos_start = [abs(i) if i < 0 else 0 for i in pos_start]
71 | kernel_pos_end = [maxdist*2+1 - (i-j) if i > j else maxdist*2+1 for i, j in zip(pos_end, shape)]
72 |
73 | pos_start = [0 if i < 0 else i for i in pos_start]
74 | pos_end = [j if i >= j else i for i, j in zip(pos_end, shape)]
75 |
76 | slices = tuple([slice(i, j) for i, j in zip(pos_start, pos_end)])
77 | kernel_slices = tuple([slice(i, j) for i, j in zip(kernel_pos_start, kernel_pos_end)])
78 | pd[slices] += kernel.swapaxes(0, 1)[kernel_slices]
79 |
80 | pd /= pd.sum()
81 | pd *= len(locations)
82 |
83 | return pd
84 |
def run_sctransform(data, clip_range=None, verbose=True, debug_path=None, plot_model_pars=False, **kwargs):
    """
    Run 'sctransform' R package and returns the normalized matrix and the model parameters.
    Package 'feather' is used for the data exchange between R and Python.

    :param data: N x D ndarray to normalize (N is number of samples, D is number of dimensions).
    :type data: numpy.ndarray
    :param clip_range: Clipping range (lower, upper) applied to the residuals.
        Defaults to (-sqrt(N/30), sqrt(N/30)).
    :type clip_range: tuple(float, float)
    :param verbose: If True, print progress and stream the Rscript output.
    :type verbose: bool
    :param debug_path: If given, use this directory for the intermediate files
        instead of a fresh temporary directory (files are kept, for debugging).
    :type debug_path: str
    :param plot_model_pars: If True, plot the model parameters fitted by vst.
    :type plot_model_pars: bool
    :param kwargs: Any keyword arguments passed to R function `vst`.
    :returns: A 2-tuple, which contains two pandas.dataframe:
        (1) normalized N x D matrix.
        (2) determined model parameters.
    """
    def _log(m):
        if verbose:
            print(m)

    vst_options = ['%s = "%s"'%(k, v) if type(v) is str else '%s = %s'%(k, v) for k, v in kwargs.items()]
    if len(vst_options) == 0:
        vst_opt_str = ''
    else:
        vst_opt_str = ', ' + ', '.join(vst_options)
    with TemporaryDirectory() as tmpdirname:
        if debug_path:
            tmpdirname = debug_path
        ifn, ofn, pfn, rfn = [os.path.join(tmpdirname, e) for e in ["in.feather", "out.feather", "fit_params.feather", "script.R"]]
        _log("Writing temporary files...")
        if isinstance(data, pd.DataFrame):
            df = data
        else:
            df = pd.DataFrame(data, columns=[str(e) for e in range(data.shape[1])])
        # feather v2 files are not readable by the R 'feather' package, so
        # force v1 when a newer pyarrow is installed.
        if version.parse(pyarrow.__version__) >= version.parse("1.0.0"):
            df.to_feather(ifn, version=1)
        else:
            df.to_feather(ifn)
        rcmd = 'library(feather); library(sctransform); mat <- t(as.matrix(read_feather("{0}"))); colnames(mat) <- 1:ncol(mat); res <- vst(mat{1}, return_gene_attr=TRUE, return_cell_attr=TRUE); write_feather(as.data.frame(t(res$y)), "{2}"); write_feather(as.data.frame(res$model_pars_fit), "{3}");'.format(ifn, vst_opt_str, ofn, pfn)
        if plot_model_pars:
            plot_path = os.path.join(tmpdirname, 'model_pars.png')
            rcmd += 'png(file="%s", width=3600, height=1200, res=300); plot_model_pars(res, show_var=TRUE); dev.off();'%plot_path
        # Windows paths: escape backslashes for the R string literals.
        rcmd = rcmd.replace('\\', '\\\\')
        with open(rfn, "w") as f:
            f.write(rcmd)
        _log("Running scTransform via Rscript...")
        proc = subprocess.Popen(["Rscript", rfn], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # Pump the child's combined output until EOF. The previous condition
        # `while not proc.poll()` was unreliable: poll() returns None (falsy)
        # while the child runs, but a clean exit code 0 is falsy too.
        while True:
            c = proc.stdout.read(1)
            if not c:
                break
            if verbose:
                try:
                    sys.stdout.write(c.decode("utf-8"))
                except UnicodeDecodeError:
                    # Ignore bytes that do not form a valid UTF-8 sequence.
                    pass
            time.sleep(0.0001)
        proc.wait()  # reap the child process
        _log("Reading output files...")
        o, p = pd.read_feather(ofn), pd.read_feather(pfn)
        if plot_model_pars:
            try:
                from matplotlib.image import imread
                import matplotlib.pyplot as plt
                img = imread(plot_path)
                dpi = 80
                fig = plt.figure(figsize=(img.shape[1]/dpi, img.shape[0]/dpi), dpi=dpi)
                plt.imshow(img, interpolation='nearest')
                plt.gca().set_axis_off()
                plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
                plt.margins(0, 0)
                plt.gca().xaxis.set_major_locator(plt.NullLocator())
                plt.gca().yaxis.set_major_locator(plt.NullLocator())
                plt.show()
            except Exception:
                print("Warning: plotting failed, perhaps matplotlib is not available?")
    _log("Clipping residuals...")
    if clip_range is None:
        r = np.sqrt(data.shape[0]/30.0)
        clip_range = (-r, r)
    # DataFrame.clip is not in-place; the original discarded its return value,
    # so the residuals were never actually clipped.
    o = o.clip(*clip_range)
    return o, p
161 |
162 |
163 | class SSAMDataset(object):
164 | """
165 | A class to store intial values and results of SSAM analysis.
166 |
167 | :param genes: The genes that will be used for the analysis.
168 | :type genes: list(str)
169 | :param locations: Location of the mRNAs in um, given as a list of
170 | N x D ndarrays (N is number of mRNAs, D is number of dimensions).
171 | :type locations: list(numpy.ndarray)
172 | :param width: Width of the image in um.
173 | :type width: float
174 | :param height: Height of the image in um.
175 | :type height: float
176 | :param depth: Depth of the image in um. Depth == 1 means 2D image.
177 | :type depth: float
178 | """
179 |
180 | def __init__(self, genes, locations, width, height, depth=1):
181 | if depth < 1 or width < 1 or height < 1:
182 | raise ValueError("Invalid image dimension")
183 | self.shape = (width, height, depth)
184 | self.ndim = 2 if depth == 1 else 3
185 | self.genes = list(genes)
186 | self.locations = []
187 | for l in list(locations):
188 | if l.shape[-1] == 3:
189 | self.locations.append(l)
190 | elif l.shape[-1] == 2:
191 | self.locations.append(np.concatenate((l, np.zeros([l.shape[0], 1])), axis=1))
192 | else:
193 | raise ValueError("Invalid mRNA locations")
194 | self.__vf = None
195 | self.__vf_norm = None
196 | self.normalized_vectors = None
197 | self.expanded_vectors = None
198 | self.cluster_labels = None
199 | #self.corr_map = None
200 | self.tsne = None
201 | self.umap = None
202 | self.normalized_vf = None
203 | self.excluded_clusters = None
204 | self.celltype_binned_counts = None
205 |
    @property
    def vf(self):
        """
        Vector field as a numpy.ndarray.

        Backed by the private ``__vf`` attribute; assigning through the setter
        also invalidates the cached L1-norm.
        """
        return self.__vf
212 |
    @vf.setter
    def vf(self, vf):
        # Replacing the vector field invalidates the cached L1-norm so that it
        # is recomputed on the next access of `vf_norm`.
        self.__vf = vf
        self.__vf_norm = None
217 |
218 | @property
219 | def vf_norm(self):
220 | """
221 | `L1-norm `_ of the vector field as a numpy.ndarray.
222 | """
223 |
224 | if self.vf is None:
225 | return None
226 | if self.__vf_norm is None:
227 | self.__vf_norm = np.sum(self.vf, axis=len(self.vf.shape) - 1)
228 | return self.__vf_norm
229 |
230 | def plot_l1norm(self, cmap="viridis", rotate=0, z=None):
231 | """
232 | Plot the `L1-norm `_ of the vector field.
233 |
234 | :param cmap: Colormap used for the plot.
235 | :type cmap: str or matplotlib.colors.Colormap
236 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3.
237 | :type rotate: int
238 | :param z: Z index to slice 3D vector field.
239 | If not given, the slice at the middle will be plotted.
240 | :type z: int
241 | """
242 | if z is None:
243 | z = int(self.vf_norm.shape[2] / 2)
244 | if rotate < 0 or rotate > 3:
245 | raise ValueError("rotate can only be 0, 1, 2, 3")
246 | im = np.array(self.vf_norm, copy=True)
247 | if rotate == 1 or rotate == 3:
248 | im = im.swapaxes(0, 1)
249 | plt.imshow(im[..., z], cmap=cmap)
250 | if rotate == 1:
251 | plt.gca().invert_xaxis()
252 | elif rotate == 2:
253 | plt.gca().invert_xaxis()
254 | plt.gca().invert_yaxis()
255 | elif rotate == 3:
256 | plt.gca().invert_yaxis()
257 |
258 | def plot_localmax(self, c=None, cmap=None, s=1, rotate=0):
259 | """
260 | Scatter plot the local maxima.
261 |
262 | :param c: Color of the scatter dots. Overrides `cmap` parameter.
263 | :type c: str or list(str), or list(float) or list(list(float))
264 | :param cmap: Colormap of the scatter dots.
265 | :type cmap: str or matplotlib.colors.Colormap
266 | :param s: Size of the scatter dots.
267 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3.
268 | :type rotate: int
269 | """
270 | if rotate < 0 or rotate > 3:
271 | raise ValueError("rotate can only be 0, 1, 2, 3")
272 | if rotate == 0 or rotate == 2:
273 | dim0, dim1 = 1, 0
274 | elif rotate == 1 or rotate == 3:
275 | dim0, dim1 = 0, 1
276 | plt.scatter(self.local_maxs[dim0], self.local_maxs[dim1], s=s, c=c, cmap=cmap)
277 | plt.xlim([0, self.vf_norm.shape[dim0]])
278 | plt.ylim([self.vf_norm.shape[dim1], 0])
279 | if rotate == 1:
280 | plt.gca().invert_xaxis()
281 | elif rotate == 2:
282 | plt.gca().invert_xaxis()
283 | plt.gca().invert_yaxis()
284 | elif rotate == 3:
285 | plt.gca().invert_yaxis()
286 |
287 | def __run_pca(self, exclude_bad_clusters, pca_dims, random_state):
288 | if exclude_bad_clusters:
289 | good_vecs = self.normalized_vectors[self.filtered_cluster_labels != -1, :]
290 | else:
291 | good_vecs = self.normalized_vectors
292 | return PCA(n_components=pca_dims, random_state=random_state).fit_transform(good_vecs)
293 |
    def plot_tsne(self, run_tsne=False, pca_dims=10, n_iter=5000, perplexity=70, early_exaggeration=10,
                  metric="correlation", exclude_bad_clusters=True, s=None, random_state=0, colors=[], excluded_color="#00000033", cmap="jet", tsne_kwargs={}):
        """
        Scatter plot the tSNE embedding.

        :param run_tsne: If false, this method tries to load precomputed tSNE result before running tSNE.
        :type run_tsne: bool
        :param pca_dims: Number of PCA dimensions used for the tSNE embedding.
        :type pca_dims: int
        :param n_iter: Maximum number of iterations for the tSNE.
        :type n_iter: int
        :param perplexity: The perplexity value of the tSNE (please refer to the section `How should I set the perplexity in t-SNE?` in this `link `_).
        :type perplexity: float
        :param early_exaggeration: Early exaggeration parameter for tSNE. Controls the tightness of the resulting tSNE plot.
        :type early_exaggeration: float
        :param metric: Metric for calculation of distance between vectors in gene expression space.
        :type metric: str
        :param exclude_bad_clusters: If true, the vectors that are excluded by the clustering algorithm will not be considered for tSNE computation.
        :type exclude_bad_clusters: bool
        :param s: Size of the scatter dots.
        :type s: float
        :param random_state: Random seed or scikit-learn's random state object to replicate the same result
        :type random_state: int or random state object
        :param colors: Color of each clusters.
        :type colors: list(str), list(list(float))
        :param excluded_color: Color of the vectors excluded by the clustering algorithm.
        :type excluded_color: str of list(float)
        :param cmap: Colormap for the clusters.
        :type cmap: str or matplotlib.colors.Colormap
        :param tsne_kwargs: Other keyword parameters for tSNE.
        :type tsne_kwargs: dict
        """
        # NOTE(review): `colors` and `tsne_kwargs` are mutable default
        # arguments; harmless here since they are only read, never mutated.
        if self.filtered_cluster_labels is None:
            exclude_bad_clusters = False
        if run_tsne or self.tsne is None:
            # (Re)compute the embedding; PCA is applied first, then tSNE on the
            # leading `pca_dims` components. The result is cached on `self.tsne`.
            pcs = self.__run_pca(exclude_bad_clusters, pca_dims, random_state)
            self.tsne = TSNE(n_iter=n_iter, perplexity=perplexity, early_exaggeration=early_exaggeration, metric=metric, random_state=random_state, **tsne_kwargs).fit_transform(pcs[:, :pca_dims])
        if self.filtered_cluster_labels is not None:
            cols = self.filtered_cluster_labels[self.filtered_cluster_labels != -1]
        else:
            cols = None
        if len(colors) > 0:
            cmap = ListedColormap(colors)
        if not exclude_bad_clusters and self.filtered_cluster_labels is not None:
            # Draw excluded (-1) vectors first so clustered points render on top.
            plt.scatter(self.tsne[:, 0][self.filtered_cluster_labels == -1], self.tsne[:, 1][self.filtered_cluster_labels == -1], s=s, c=excluded_color)
            plt.scatter(self.tsne[:, 0][self.filtered_cluster_labels != -1], self.tsne[:, 1][self.filtered_cluster_labels != -1], s=s, c=cols, cmap=cmap)
        else:
            plt.scatter(self.tsne[:, 0], self.tsne[:, 1], s=s, c=cols, cmap=cmap)
        return
343 |
    def plot_umap(self, run_umap=False, pca_dims=10, metric="correlation", exclude_bad_clusters=True, s=None, random_state=0, colors=[], excluded_color="#00000033", cmap="jet", umap_kwargs={}):
        """
        Scatter plot the UMAP embedding.

        :param run_umap: If false, this method tries to load precomputed UMAP result before running UMAP.
        :type run_umap: bool
        :param pca_dims: Number of PCA dimensions used for the UMAP embedding.
        :type pca_dims: int
        :param metric: Metric for calculation of distance between vectors in gene expression space.
        :type metric: str
        :param exclude_bad_clusters: If true, the vectors that are excluded by the clustering algorithm will not be considered for UMAP computation.
        :type exclude_bad_clusters: bool
        :param s: Size of the scatter dots.
        :type s: float
        :param random_state: Random seed or scikit-learn's random state object to replicate the same result
        :type random_state: int or random state object
        :param colors: Color of each clusters.
        :type colors: list(str), list(list(float))
        :param excluded_color: Color of the vectors excluded by the clustering algorithm.
        :type excluded_color: str of list(float)
        :param cmap: Colormap for the clusters.
        :type cmap: str or matplotlib.colors.Colormap
        :param umap_kwargs: Other keyword parameters for UMAP.
        :type umap_kwargs: dict
        """
        # NOTE(review): `colors` and `umap_kwargs` are mutable default
        # arguments; harmless here since they are only read, never mutated.
        if self.filtered_cluster_labels is None:
            exclude_bad_clusters = False
        if run_umap or self.umap is None:
            # (Re)compute the embedding; PCA first, then UMAP on the leading
            # `pca_dims` components. The result is cached on `self.umap`.
            pcs = self.__run_pca(exclude_bad_clusters, pca_dims, random_state)
            self.umap = UMAP(metric=metric, random_state=random_state, **umap_kwargs).fit_transform(pcs[:, :pca_dims])
        if self.filtered_cluster_labels is not None:
            cols = self.filtered_cluster_labels[self.filtered_cluster_labels != -1]
        else:
            cols = None
        if len(colors) > 0:
            cmap = ListedColormap(colors)
        if not exclude_bad_clusters and self.filtered_cluster_labels is not None:
            # Draw excluded (-1) vectors first so clustered points render on top.
            plt.scatter(self.umap[:, 0][self.filtered_cluster_labels == -1], self.umap[:, 1][self.filtered_cluster_labels == -1], s=s, c=excluded_color)
            plt.scatter(self.umap[:, 0][self.filtered_cluster_labels != -1], self.umap[:, 1][self.filtered_cluster_labels != -1], s=s, c=cols, cmap=cmap)
        else:
            plt.scatter(self.umap[:, 0], self.umap[:, 1], s=s, c=cols, cmap=cmap)
        return
386 |
    def plot_expanded_mask(self, cmap='Greys'): # TODO
        """
        Plot the expanded area of the vectors (Not fully implemented yet).

        :param cmap: Colormap for the mask.
        """
        # Binary mask display; `self.expanded_mask` is assumed to be computed
        # by an earlier analysis step (not visible in this class's __init__).
        plt.imshow(self.expanded_mask, vmin=0, vmax=1, cmap=cmap)
        return
395 |
    def plot_correlation_map(self, cmap='hot'): # TODO
        """
        Plot the correlations near the vectors in the vector field (Not fully implemented yet).

        :param cmap: Colormap for the image.
        """
        # Narrow value window (0.995-1.0) to emphasize near-perfect correlations.
        plt.imshow(self.corr_map, vmin=0.995, vmax=1.0, cmap=cmap)
        plt.colorbar()
        return
405 |
    def plot_celltypes_map(self, background="black", centroid_indices=[], colors=None, cmap='jet', rotate=0, min_r=0.6, set_alpha=False, z=None):
        """
        Plot the merged cell-type map.

        :param background: Set background color of the cell-type map.
        :type background: str or list(float)
        :param centroid_indices: The centroids which will be in the cell type map. If not given, the cell-type map is drawn with all centroids.
        :type centroid_indices: list(int)
        :param colors: Color of the clusters. Overrides `cmap` parameter.
        :type colors: list(str), list(list(float))
        :param cmap: Colormap for the clusters.
        :type cmap: str or matplotlib.colors.Colormap
        :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3.
        :type rotate: int
        :param min_r: Minimum correlation threshold for the cell-type map.
            This value is only for the plotting, does not affect to the cell-type maps generated by `filter_celltypemaps`.
        :type min_r: float
        :param set_alpha: Set alpha of each pixel based on the correlation.
            Not properly implemented yet, doesn't work properly with the background other than black.
        :type set_alpha: bool
        :param z: Z index to slice 3D cell-type map.
            If not given, the slice at the middle will be used.
        :type z: int
        """
        if z is None:
            z = int(self.shape[2] / 2)
        num_ctmaps = np.max(self.filtered_celltype_maps) + 1

        if len(centroid_indices) == 0:
            centroid_indices = list(range(num_ctmaps))

        if colors is None:
            cmap_internal = plt.get_cmap(cmap)
            colors = cmap_internal([float(i) / (num_ctmaps - 1) for i in range(num_ctmaps)])

        # Cell types not listed in `centroid_indices` are painted with the
        # background color so only the selected ones stand out.
        all_colors = [background if not j in centroid_indices else colors[i] for i, j in enumerate(range(num_ctmaps))]
        cmap_internal = ListedColormap(all_colors)

        celltype_maps_internal = np.array(self.filtered_celltype_maps[..., z], copy=True)
        # -1 marks pixels with no assigned cell type; remap them to index 0 for
        # the colormap lookup, then force them fully transparent afterwards.
        empty_mask = celltype_maps_internal == -1
        celltype_maps_internal[empty_mask] = 0
        sctmap = cmap_internal(celltype_maps_internal)
        sctmap[empty_mask] = (0, 0, 0, 0)

        if set_alpha:
            # Scale per-pixel alpha from min_r up to 1.0 by correlation strength.
            alpha = np.array(self.max_correlations[..., z], copy=True)
            alpha[alpha < 0] = 0 # drop negative correlations
            alpha = min_r + alpha / (np.max(alpha) / (1.0 - min_r))
            sctmap[..., 3] = alpha

        if rotate == 1 or rotate == 3:
            sctmap = sctmap.swapaxes(0, 1)

        plt.gca().set_facecolor(background)
        plt.imshow(sctmap)

        if rotate == 1:
            plt.gca().invert_xaxis()
        elif rotate == 2:
            plt.gca().invert_xaxis()
            plt.gca().invert_yaxis()
        elif rotate == 3:
            plt.gca().invert_yaxis()

        return
471 |
472 | def plot_domains(self, background='white', colors=None, cmap='jet', rotate=0, domain_background=False, background_alpha=0.3, z=None):
473 | """
474 | Plot tissue domains.
475 |
476 | :param background: Background color of the plot.
477 | :type background: str or list(float)
478 | :param colors: Color of the domains. Overrides `cmap` parameter.
479 | :type colors: list(str), list(list(float))
480 | :param cmap: Colormap for the domains.
481 | :type cmap: str or matplotlib.colors.Colormap
482 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3.
483 | :type rotate: int
484 | :param domain_background: Show the area of the inferred domains behind the domain map.
485 | :type domain_background: bool
486 | :param background_alpha: The alpha value of the area of the inferred domains.
487 | :type background_alpha: float
488 | :param z: Z index to slice 3D domain map.
489 | If not given, the slice at the middle will be used.
490 | :type z: int
491 | """
492 | if z is None:
493 | z = int(self.shape[2] / 2)
494 |
495 | inferred_domains = self.inferred_domains[..., z]
496 | inferred_domains_cells = self.inferred_domains_cells[..., z]
497 |
498 | if rotate == 1 or rotate == 3:
499 | inferred_domains = inferred_domains.swapaxes(0, 1)
500 | inferred_domains_cells = inferred_domains_cells.swapaxes(0, 1)
501 |
502 | if colors is None:
503 | cmap_internal = plt.get_cmap(cmap)
504 | colors_domains = cmap_internal(np.linspace(0, 1, np.max(inferred_domains) + 1))
505 | colors_cells = cmap_internal(np.linspace(0, 1, np.max(inferred_domains_cells) + 1))
506 |
507 | colors_domains[:, 3] = background_alpha
508 | if -1 in inferred_domains:
509 | colors_domains = [[0, 0, 0, 0]] + list(colors_domains)
510 | if -1 in inferred_domains_cells:
511 | colors_cells = [[0, 0, 0, 0]] + list(colors_cells)
512 |
513 | plt.gca().set_facecolor(background)
514 | if domain_background:
515 | plt.imshow(inferred_domains, cmap=ListedColormap(colors_domains))
516 | plt.imshow(inferred_domains_cells, cmap=ListedColormap(colors_cells))
517 |
518 | if rotate == 1:
519 | plt.gca().invert_xaxis()
520 | elif rotate == 2:
521 | plt.gca().invert_xaxis()
522 | plt.gca().invert_yaxis()
523 | elif rotate == 3:
524 | plt.gca().invert_yaxis()
525 |
526 | return
527 |
    def plot_diagnostic_plot(self, centroid_index, cluster_name=None, cluster_color=None, cmap=None, rotate=0, z=None, use_embedding="tsne", known_signatures=[], correlation_methods=[]):
        """
        Plot the diagnostic plot. This method requires `plot_tsne` or `plot_umap` was run at least once before.

        :param centroid_index: Index of the centroid for the diagnostic plot.
        :type centroid_index: int
        :param cluster_name: The name of the cluster.
        :type cluster_name: str
        :param cluster_color: The color of the cluster. Overrides `cmap` parameter.
        :type cluster_color: str or list(float)
        :param cmap: The colormap for the clusters. The cluster color is determined using the `centroid_index` th color of the given colormap.
        :type cmap: str or matplotlib.colors.Colormap
        :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3.
        :type rotate: int
        :param z: Z index to slice 3D vector norm and cell-type map plots.
            If not given, the slice at the middle will be used.
        :type z: int
        :param use_embedding: The type of the embedding for the last panel. Possible values are "tsne" or "umap".
        :type use_embedding: str
        :param known_signatures: The list of known signatures, which will be displayed in the 3rd panel. Each signature can be 3-tuple or 4-tuple,
            containing 1) the name of signature, 2) gene labels of the signature, 3) gene expression values of the signature, 4) optionally the color of the signature.
        :type known_signatures: list(tuple)
        :param correlation_methods: The correlation method used to determine max correlation of the centroid to the `known_signatures`. Each method should be 2-tuple,
            containing 1) the name of the correlation, 2) the correlation function (compatible with the correlation methods available in `scipy.stats `_)
        :type correlation_methods: list(tuple)
        """
        if z is None:
            z = int(self.vf_norm.shape[2] / 2)
        # Centroid mean expression (p) and its standard deviation (e).
        p, e = self.centroids[centroid_index], self.centroids_stdev[centroid_index]
        if cluster_name is None:
            cluster_name = "Cluster #%d"%centroid_index

        if cluster_color is None:
            if cmap is None:
                cmap = plt.get_cmap("jet")
            cluster_color = cmap(centroid_index / (len(self.centroids) - 1))

        if len(correlation_methods) == 0:
            # Default to the fast C implementation of Pearson correlation.
            correlation_methods = [("r", corr), ]
        # One row for the centroid itself plus one per (method, signature) pair.
        total_signatures = len(correlation_methods) * len(known_signatures) + 1

        # Panel 1: local maxima of this cluster over the L1-norm image.
        ax = plt.subplot(1, 4, 1)
        mask = self.filtered_cluster_labels == centroid_index
        plt.scatter(self.local_maxs[0][mask], self.local_maxs[1][mask], c=[cluster_color])
        self.plot_l1norm(rotate=rotate, cmap="Greys", z=z)

        # Panel 2: cell-type map with this cluster highlighted in color and all
        # other assigned pixels in light gray.
        ax = plt.subplot(1, 4, 2)
        ctmap = np.zeros([self.filtered_celltype_maps.shape[0], self.filtered_celltype_maps.shape[1], 4])
        ctmap[self.filtered_celltype_maps[..., z] == centroid_index] = to_rgba(cluster_color)
        # NOTE(review): the second condition indexes slice 0 rather than z —
        # looks like it should be [..., z]; confirm before changing.
        ctmap[np.logical_and(self.filtered_celltype_maps[..., z] != centroid_index, self.filtered_celltype_maps[..., 0] > -1)] = [0.9, 0.9, 0.9, 1]
        if rotate == 1 or rotate == 3:
            ctmap = ctmap.swapaxes(0, 1)
        ax.imshow(ctmap)
        if rotate == 1:
            ax.invert_xaxis()
        elif rotate == 2:
            ax.invert_xaxis()
            ax.invert_yaxis()
        elif rotate == 3:
            ax.invert_yaxis()

        # Panel 3 (top row): the centroid's gene expression with error bars.
        ax = plt.subplot(total_signatures, 4, 3)
        ax.bar(self.genes, p, yerr=e)
        ax.set_title(cluster_name)
        plt.xlim([-1, len(self.genes)])
        plt.xticks(rotation=90)

        # Panel 3 (following rows): best-matching known signature for each
        # correlation method, stacked below the centroid bar plot.
        subplot_idx = 0
        for signature in known_signatures:
            sig_title, sig_labels, sig_values = signature[:3]
            sig_colors_defined = False
            if len(signature) == 4:
                sig_colors = signature[3]
                sig_colors_defined = True
            for corr_label, corr_func in correlation_methods:
                corr_results = [corr_func(p, sig_value) for sig_value in sig_values]
                # scipy.stats functions return (statistic, pvalue); keep the statistic.
                corr_results = [e[0] if hasattr(e, "__getitem__") else e for e in corr_results]
                max_corr_idx = np.argmax(corr_results)
                ax = plt.subplot(total_signatures, 4, 7+subplot_idx*4)
                lbl = sig_labels[max_corr_idx]
                if sig_colors_defined:
                    col = sig_colors[max_corr_idx]
                else:
                    col = cluster_color
                ax.bar(self.genes, sig_values[max_corr_idx], color=col)
                ax.set_title("%s in %s (max %s, %.3f)"%(lbl, sig_title, corr_label, corr_results[max_corr_idx]))
                plt.xlim([-1, len(self.genes)])
                plt.xticks(rotation=90)
                subplot_idx += 1

        # Panel 4: embedding scatter with this cluster highlighted.
        if use_embedding == 'tsne':
            embedding = self.tsne
            fig_title = "t-SNE, %d vectors"%sum(self.filtered_cluster_labels == centroid_index)
        elif use_embedding == 'umap':
            embedding = self.umap
            fig_title = "UMAP, %d vectors"%sum(self.filtered_cluster_labels == centroid_index)
        good_vectors = self.filtered_cluster_labels[self.filtered_cluster_labels != -1]
        ax = plt.subplot(1, 4, 4)
        ax.scatter(embedding[:, 0][good_vectors != centroid_index], embedding[:, 1][good_vectors != centroid_index], c=[[0.8, 0.8, 0.8, 1],], s=80)
        ax.scatter(embedding[:, 0][good_vectors == centroid_index], embedding[:, 1][good_vectors == centroid_index], c=[cluster_color], s=80)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_title(fig_title)
632 | def plot_celltype_composition(self, domain_index, cell_type_colors=None, cell_type_cmap='jet', cell_type_orders=None, label_cutoff=0.03, pctdistance=1.15, **kwargs):
633 | """
634 | Plot composition of cell types in each domain.
635 |
636 | :param domain_index: Index of the domain.
637 | :type domain_index: int
638 | :param cell_type_colors: The colors of the cell types. Overrides `cell_type_cmap` parameter.
639 | :type cell_type_colors: str or list(float)
640 | :param cell_type_cmap: The colormap for the cell types.
641 | :type cell_type_cmap: str or matplotlib.colors.Colormap
642 | :param label_cutoff: The minimum cutoff of the labeling of the percentage. From 0 to 1.
643 | :type label_cutoff: float
644 | :param pctdistance: The distance from center of the pie to the labels.
645 | :type pctdistance: float
646 | :param kwargs: More kewward arguments for the matplotlib.pyplot.pie.
647 | """
648 | if cell_type_colors is None:
649 | cmap = plt.get_cmap(cell_type_cmap)
650 | cell_type_colors = cmap(np.arange(0, len(self.centroids)) / (len(self.centroids) - 1))
651 |
652 | if cell_type_orders is not None:
653 | ctcs = np.array(cell_type_colors)[cell_type_orders]
654 | p = self.inferred_domains_compositions[domain_index][cell_type_orders]
655 | else:
656 | ctcs = cell_type_colors
657 | p = self.inferred_domains_compositions[domain_index]
658 | plt.pie(p,
659 | colors=ctcs,
660 | autopct=lambda e: '%.1f %%'%e if e > 3 else '',
661 | pctdistance=pctdistance, **kwargs)
662 |
663 | def plot_spatial_relationships(self, cluster_labels, *args, **kwargs):
664 | """
665 | Plot spatial relationship between cell types, presented as a heatmap.
666 |
667 | :param cluster_labels: x- and y-axis label of the heatmap.
668 | :type cluster_labels: list(str)
669 | :param args: More arguments for the seaborn.heatmap.
670 | :param kwargs: More keyword arguments for the seaborn.heatmap.
671 | """
672 | sns.heatmap(self.spatial_relationships, *args, xticklabels=cluster_labels, yticklabels=cluster_labels, **kwargs)
673 |
674 | def get_celltype_correlation(self, idx):
675 | """
676 | Get correlation values of a cell type map between the given cluster's centroid to the vector field.
677 |
678 | :param idx: Index of a cluster
679 | :type idx: int
680 | :return: Correlation values of a cell type map of the specified cluster's centroid
681 | :rtype: numpy.ndarray
682 | """
683 | rtn = np.zeros_like(self.max_correlations) - 1
684 | rtn[self.celltype_maps == idx] = self.max_correlations[self.celltype_maps == idx]
685 | return rtn
686 |
687 |
688 | class SSAMAnalysis(object):
689 | """
690 | A class to run SSAM analysis.
691 |
692 | :param dataset: A SSAMDataset object.
693 | :type dataset: SSAMDataset
694 | :param ncores: Number of cores for parallel computation. If a negative value is given,
695 | ((# of all available cores on system) - abs(ncores)) cores will be used.
696 | :type ncores: int
697 | :param save_dir: Directory to store intermediate data (e.g. density / vector field).
698 | Any data which already exists will be loaded and reused.
699 | :type save_dir: str
700 | :param verbose: If True, then it prints out messages during the analysis.
701 | :type verbose: bool
702 | """
703 | def __init__(self, dataset, ncores=-1, save_dir="", verbose=False):
704 |
705 | self.dataset = dataset
706 | if not ncores > 0:
707 | ncores += multiprocessing.cpu_count()
708 | if ncores > multiprocessing.cpu_count():
709 | ncores = multiprocessing.cpu_count()
710 | if not ncores > 0:
711 | raise ValueError("Invalid number of cores.")
712 | self.ncores = ncores
713 | self.use_savedir = True
714 | if len(save_dir) == 0:
715 | save_dir = mkdtemp()
716 | self.use_savedir = False
717 | if not os.path.exists(save_dir):
718 | os.makedirs(save_dir)
719 | self.save_dir = save_dir
720 | self.verbose = verbose
721 |
722 | def __m__(self, message):
723 | if self.verbose:
724 | print(message)
725 |
    def run_kde(self, kernel="gaussian", bandwidth=2.5, sampling_distance=1.0, use_mmap=False):
        """
        Run KDE to estimate density of mRNA.

        The resulting vector field (one density map per gene, scaled by the
        number of mRNA spots of that gene) is stored in `self.dataset.vf`.
        Per-gene densities and the final field are cached in `self.save_dir`
        and reused on subsequent calls with the same parameters.

        :param kernel: Kernel for density estimation.
        :type kernel: str
        :param bandwidth: Parameter to adjust width of kernel.
            Set it 2.5 to make FWTM of Gaussian kernel to be ~10um (assume that avg. cell diameter is ~10um).
        :type bandwidth: float
        :param sampling_distance: Grid spacing in um.
        :type sampling_distance: float
        :param use_mmap: Use MMAP to reduce memory usage during analysis.
            Turning on this option can reduce the amount of memory used by SSAM analysis, but also lower the analysis speed.
        :type use_mmap: bool
        """
        # Small helpers for (de)serializing the vector field to the save dir.
        def save_pickle(fn, o):
            with open(fn, "wb") as f:
                return pickle.dump(o, f, protocol=4)
        def load_pickle(fn):
            with open(fn, "rb") as f:
                return pickle.load(f)

        # Grid dimensions of the vector field: one cell per sampling_distance.
        steps = [int(np.ceil(e / sampling_distance)) for e in self.dataset.shape]
        total_steps = np.prod(steps)
        vf_shape = tuple(steps + [len(self.dataset.genes), ])
        # Cache file name encodes sampling distance and bandwidth so runs with
        # different parameters do not clobber each other's results.
        vf_filename = os.path.join(self.save_dir, 'vf_sd%s_bw%s'%(
            ('%f' % sampling_distance).rstrip('0').rstrip('.'),
            ('%f' % bandwidth).rstrip('0').rstrip('.')
        ))
        if (use_mmap and not os.path.exists(vf_filename + '.dat')) or \
                (not use_mmap and not os.path.exists(vf_filename + '.pkl') and not os.path.exists(vf_filename + '.dat')):
            # If VF file doesn't exist, then run KDE
            if use_mmap:
                # Write into a .tmp file first; it is renamed only after the
                # whole field is computed, so partial runs are never reused.
                vf = np.memmap(vf_filename + '.dat.tmp', dtype='double', mode='w+', shape=vf_shape)
            else:
                vf = np.zeros(vf_shape)
            chunksize = min(int(np.ceil(total_steps / self.ncores)), 100000)
            def yield_chunk():
                # Yield (chunksize, 3) arrays of grid coordinates covering the
                # whole field; the last chunk is shrunk to the remainder.
                chunk = np.zeros(shape=[chunksize, len(steps)], dtype=int)
                cnt = 0
                remaining_cnt = total_steps
                for x in range(steps[0]):
                    for y in range(steps[1]):
                        for z in range(steps[2]):
                            chunk[cnt, :] = [x, y, z]
                            cnt += 1
                            if cnt == chunksize:
                                yield chunk
                                remaining_cnt -= cnt
                                cnt = 0
                                chunk = np.zeros(shape=[min(chunksize, remaining_cnt), len(steps)], dtype=int)
                if cnt > 0:
                    yield chunk

            def yield_chunks():
                # Group chunks into batches of `ncores` so each batch can be
                # dispatched to the worker pool in one go.
                chunks = []
                for chunk in yield_chunk():
                    chunks.append(chunk)
                    if len(chunks) == self.ncores:
                        yield chunks
                        chunks = []
                if len(chunks) > 0:
                    yield chunks

            # Pool is created lazily: only the non-Gaussian (sklearn) path
            # needs it; the Gaussian path parallelizes inside calc_kde.
            pool = None
            for gidx, gene_name in enumerate(self.dataset.genes):
                pdf_filename = os.path.join(self.save_dir, 'pdf_sd%s_bw%s_%s.npy'%(
                    ('%f' % sampling_distance).rstrip('0').rstrip('.'),
                    ('%f' % bandwidth).rstrip('0').rstrip('.'),
                    gene_name)
                )
                if os.path.exists(pdf_filename):
                    # Reuse a previously computed per-gene density.
                    self.__m__("Loading %s..."%gene_name)
                    pdf = np.load(pdf_filename)
                else:
                    self.__m__("Running KDE for %s..."%gene_name)
                    pdf = np.zeros(shape=vf_shape[:-1])
                    if kernel != "gaussian":
                        # Generic kernels go through sklearn's KernelDensity,
                        # evaluated in parallel over coordinate chunks.
                        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(self.dataset.locations[gidx])
                        if pool is None:
                            pool = multiprocessing.Pool(self.ncores)
                    else:
                        X, Y, Z = [self.dataset.locations[gidx][:, i] for i in range(3)]
                    for chunks in yield_chunks():
                        if kernel == "gaussian":
                            pdf_chunks = [calc_kde(bandwidth, X, Y, Z, chunk[:, 0], chunk[:, 1], chunk[:, 2], 0, self.ncores) for chunk in chunks]
                        else:
                            pdf_chunks = pool.map(kde.score_samples, [chunk * sampling_distance for chunk in chunks])
                        for pdf_chunk, pos_chunk in zip(pdf_chunks, chunks):
                            if kernel == "gaussian":
                                pdf[pos_chunk[:, 0], pos_chunk[:, 1], pos_chunk[:, 2]] = pdf_chunk
                            else:
                                # score_samples returns log-density; exponentiate.
                                pdf[pos_chunk[:, 0], pos_chunk[:, 1], pos_chunk[:, 2]] = np.exp(pdf_chunk)
                    # Normalize to a probability density, then cache it.
                    pdf /= np.sum(pdf)
                    np.save(pdf_filename, pdf)
                # Scale density by the gene's mRNA count to get expression.
                vf[..., gidx] = pdf * len(self.dataset.locations[gidx])
            if use_mmap:
                vf.flush()
                # Atomically publish the finished field, then reopen read-only.
                os.rename(vf_filename + '.dat.tmp', vf_filename + '.dat')
                vf = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape)
            elif self.use_savedir:
                save_pickle(vf_filename + '.pkl', vf)
        elif not use_mmap:
            # Cached field exists: prefer the pickle; fall back to the mmap
            # file (copied into memory, and re-pickled for next time).
            if os.path.exists(vf_filename + '.pkl'):
                vf = load_pickle(vf_filename + '.pkl')
            else: # == os.path.exists(vf_filename + '.dat'):
                vf_tmp = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape)
                vf = np.array(vf_tmp, copy=True)
                if self.use_savedir:
                    save_pickle(vf_filename + '.pkl', vf)
        elif use_mmap:
            vf = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape)
        self.dataset.vf = vf
        return
840 |
841 | def run_fast_kde(self, kernel='gaussian', bandwidth=2.5, sampling_distance=1.0, re_run=False, use_mmap=False):
842 | """
843 | Run KDE faster than `run_kde` method. This method uses precomputed kernels to estimate density of mRNA.
844 |
845 | :param kernel: Kernel for density estimation. Currently only Gaussian kernel is supported.
846 | :type kernel: str
847 | :param bandwidth: Parameter to adjust width of kernel.
848 | Set it 2.5 to make FWTM of Gaussian kernel to be ~10um (assume that avg. cell diameter is ~10um).
849 | :type bandwidth: float
850 | :param sampling_distance: Grid spacing in um. Currently only 1 um is supported.
851 | :type sampling_distance: float
852 | :param re_run: Recomputes KDE, ignoring all existing precomputed densities in the data directory.
853 | :type re_run: bool
854 | :param use_mmap: Use MMAP to reduce memory usage during analysis. Currently not implemented, this option should be always disabled.
855 | :type use_mmap: bool
856 | """
857 | if kernel != 'gaussian':
858 | raise NotImplementedError('Only Gaussian kernel is supported.')
859 | if sampling_distance != 1.0:
860 | raise NotImplementedError('Sampling distance should be 1.')
861 | if use_mmap:
862 | raise NotImplementedError('MMAP is not supported yet.')
863 |
864 | def save_pickle(fn, o):
865 | with open(fn, "wb") as f:
866 | return pickle.dump(o, f, protocol=4)
867 | def load_pickle(fn):
868 | with open(fn, "rb") as f:
869 | return pickle.load(f)
870 |
871 | vf_filename = os.path.join(self.save_dir, 'vf_sd%s_bw%s.pkl'%(
872 | ('%f' % sampling_distance).rstrip('0').rstrip('.'),
873 | ('%f' % bandwidth).rstrip('0').rstrip('.')
874 | ))
875 |
876 | if os.path.exists(vf_filename) and not re_run:
877 | self.dataset.vf = load_pickle(vf_filename)
878 | return
879 |
880 | self.dataset.vf = np.zeros(self.dataset.shape+(len(self.dataset.genes),))
881 | idcs = np.argsort([len(i) for i in self.dataset.locations])[::-1]
882 | pdf_filenames = [os.path.join(self.save_dir, 'pdf_sd%s_bw%s_%s.npy'%(
883 | ('%f' % sampling_distance).rstrip('0').rstrip('.'),
884 | ('%f' % bandwidth).rstrip('0').rstrip('.'),
885 | self.dataset.genes[gidx])
886 | ) for gidx in idcs]
887 |
888 | if not re_run:
889 | idcs = np.where([not os.path.exists(fn) for fn in pdf_filenames])[0]
890 | for gidx in np.where([os.path.exists(fn) for fn in pdf_filenames])[0]:
891 | print("Loading gene %s..."%self.dataset.genes[gidx])
892 | self.dataset.vf[..., gidx] = np.load(pdf_filenames[gidx])
893 |
894 | if len(idcs) > 0:
895 | with closing(Pool(self.ncores, maxtasksperchild=1)) as p:
896 | res = p.imap(_fast_gaussian_kde,[(bandwidth,
897 | self.save_dir,
898 | self.dataset.genes[gidx],
899 | self.dataset.shape,
900 | self.dataset.locations[gidx],
901 | sampling_distance) for gidx in idcs])
902 | for gidx, pd in zip(idcs, res): # imap returns result in the same order as the input array
903 | self.dataset.vf[..., gidx] = pd
904 | np.save(pdf_filenames[gidx], pd)
905 | p.close()
906 | p.join()
907 | save_pickle(vf_filename, self.dataset.vf)
908 |
909 | def calc_correlation_map(self, corr_size=3):
910 | """
911 | Calculate local correlation map of the vector field.
912 |
913 | :param corr_size: Size of square (or cube) that is used to compute the local correlation values.
914 | This value should be an odd number.
915 | :type corr_size: int
916 | """
917 |
918 | corr_map = calc_corrmap(self.dataset.vf, ncores=self.ncores, size=int(corr_size/2))
919 | self.dataset.corr_map = np.array(corr_map, copy=True)
920 | return
921 |
922 | def find_localmax(self, search_size=3, min_norm=0, min_expression=0, mask=None):
923 | """
924 | Find local maxima vectors in the norm of the vector field.
925 |
926 | :param search_size: Size of square (or cube in 3D) that is used to search for the local maxima.
927 | This value should be an odd number.
928 | :type search_size: int
929 | :param min_norm: Minimum value of norm at the local maxima.
930 | :type min_norm: float
931 | :param min_expression: Minimum value of gene expression in a unit pixel at the local maxima.
932 | mask: numpy.ndarray, optional
933 | If given, find vectors in the masked region, instead of the whole image.
934 | :type min_expression: float
935 | """
936 |
937 | max_mask = self.dataset.vf_norm == ndimage.maximum_filter(self.dataset.vf_norm, size=search_size)
938 | max_mask &= self.dataset.vf_norm > min_norm
939 | if min_expression > 0:
940 | exp_mask = np.zeros_like(max_mask)
941 | for i in range(len(self.dataset.genes)):
942 | exp_mask |= self.dataset.vf[..., i] > min_expression
943 | max_mask &= exp_mask
944 | if mask is not None:
945 | max_mask &= mask
946 | local_maxs = np.where(max_mask)
947 | self.__m__("Found %d local max vectors."%len(local_maxs[0]))
948 | self.dataset.local_maxs = local_maxs
949 | return
950 |
951 | def expand_localmax(self, r=0.99, min_pixels=7, max_pixels=1000):
952 | """
953 | Merge the vectors nearby the local max vectors.
954 | Only the vectors with the large Pearson correlation values are merged.
955 |
956 | :param r: Minimum Pearson's correlation coefficient to look for the nearby vectors.
957 | :type r: float
958 | :param min_pixels: Minimum number of pixels to merge.
959 | :type min_pixels: float
960 | :param max_pixels: Maximum number of pixels to merge.
961 | :type max_pixels: float
962 | """
963 |
964 | expanded_vecs = []
965 | self.__m__("Expanding local max vectors...")
966 | fill_dx = np.meshgrid(range(3), range(3), range(3))
967 | fill_dx = np.array(list(zip(*[np.ravel(e) - 1 for e in fill_dx])))
968 | mask = np.zeros(self.dataset.vf.shape[:-1]) # TODO: sparse?
969 | nlocalmaxs = len(self.dataset.local_maxs[0])
970 | valid_pos_list = []
971 | for cnt, idx in enumerate(range(nlocalmaxs), start=1):
972 | local_pos = tuple(i[idx] for i in self.dataset.local_maxs)
973 | filled_pos = tuple(zip(*flood_fill(local_pos, self.dataset.vf, r, min_pixels, max_pixels)))
974 | if len(filled_pos) > 0:
975 | mask[filled_pos] = 1
976 | valid_pos_list.append(local_pos)
977 | expanded_vecs.append(np.sum(self.dataset.vf[filled_pos], axis=0))
978 | if cnt % 100 == 0:
979 | self.__m__("Processed %d/%d..."%(cnt, nlocalmaxs))
980 | self.__m__("Processed %d/%d..."%(cnt, nlocalmaxs))
981 | self.dataset.expanded_vectors = np.array(expanded_vecs)
982 | self.dataset.expanded_mask = mask
983 | self.dataset.valid_local_maxs = valid_pos_list
984 | return
985 |
    def normalize_vectors_sctransform(self, use_expanded_vectors=False, normalize_vf=True, vst_kwargs={}):
        """
        Normalize and regularize vectors using SCtransform

        :param use_expanded_vectors: If True, use averaged vectors nearby local maxima
            of the vector field.
        :type use_expanded_vectors: bool
        :param normalize_vf: If True, the vector field is also normalized
            using the same parameters used to normalize the local maxima.
        :type normalize_vf: bool
        :param vst_kwargs: Optional keywords arguments for sctransform's vst function.
        :type vst_kwargs: dict
        """
        if use_expanded_vectors:
            vec = np.array(self.dataset.expanded_vectors, copy=True)
        else:
            vec = np.array(self.dataset.vf[self.dataset.local_maxs], copy=True)
        
        norm_vec, fit_params = run_sctransform(vec, **vst_kwargs)
        self.dataset.normalized_vectors = np.array(norm_vec)
        
        if normalize_vf:
            # Apply the regression model fitted on the local maxima to every
            # pixel of the vector field with a nonzero norm.
            vf_nonzero = self.dataset.vf[self.dataset.vf_norm > 0]
            nvec = vf_nonzero.shape[0]
            fit_params = np.array(fit_params).T
            # Design matrix: intercept column + log10 total expression per pixel.
            regressor_data = np.ones([nvec, 2])
            regressor_data[:, 1] = np.log10(np.sum(vf_nonzero, axis=1))
            
            # NOTE(review): assumes after transpose fit_params[0] is the
            # dispersion (theta) and rows 1: are the regression coefficients —
            # confirm against run_sctransform's return layout.
            mu = np.exp(np.dot(regressor_data, fit_params[1:, :]))
            # Pearson residuals; zero/invalid divisions are silenced here and
            # cleaned up by nan_to_num below.
            with np.errstate(divide='ignore', invalid='ignore'):
                res = (vf_nonzero - mu) / np.sqrt(mu + mu**2 / fit_params[0, :])
            self.dataset.normalized_vf = np.zeros_like(self.dataset.vf)
            self.dataset.normalized_vf[self.dataset.vf_norm > 0] = np.nan_to_num(res)
        return
1020 |
1021 | def normalize_vectors(self, use_expanded_vectors=False, normalize_gene=False, normalize_vector=False, normalize_median=False, size_after_normalization=1e4, log_transform=False, scale=False):
1022 | """
1023 | Normalize and regularize vectors
1024 |
1025 | :param use_expanded_vectors: If True, use averaged vectors nearby local maxima of the vector field.
1026 | :type use_expanded_vectors: bool
1027 | :param normalize_gene: If True, normalize vectors by sum of each gene expression across all vectors.
1028 | :type normalize_gene: bool
1029 | :param normalize_vector: If True, normalize vectors by sum of all gene expression of each vector.
1030 | :type normalize_vector: bool
1031 | :param log_transform: If True, vectors are log transformed.
1032 | :type log_transform: bool
1033 | :param scale: If True, vectors are z-scaled (mean centered and scaled by stdev).
1034 | :type scale: bool
1035 | """
1036 | if use_expanded_vectors:
1037 | vec = np.array(self.dataset.expanded_vectors, copy=True)
1038 | else:
1039 | vec = np.array(self.dataset.vf[self.dataset.local_maxs], copy=True)
1040 | if normalize_gene:
1041 | vec = preprocessing.normalize(vec, norm=norm, axis=0) * size_after_normalization # Normalize per gene
1042 | if normalize_vector:
1043 | vec = preprocessing.normalize(vec, norm="l1", axis=1) * size_after_normalization # Normalize per vector
1044 | if normalize_median:
1045 | def n(v):
1046 | s, m = np.sum(v, axis=1), np.median(v, axis=1)
1047 | s[m > 0] = s[m > 0] / m[m > 0]
1048 | s[m == 0] = 0
1049 | v[s > 0] = v[s > 0] / s[s > 0][:, np.newaxis]
1050 | v[v == 0] = 0
1051 | return v
1052 | vec = n(vec)
1053 | if log_transform:
1054 | vec = np.log2(vec + 1)
1055 | if scale:
1056 | vec = preprocessing.scale(vec)
1057 | self.dataset.normalized_vectors = vec
1058 | return
1059 |
1060 | def __correct_cluster_labels(self, cluster_labels, centroid_correction_threshold):
1061 | new_labels = np.array(cluster_labels, copy=True)
1062 | if centroid_correction_threshold < 1.0:
1063 | for cidx in np.unique(cluster_labels):
1064 | if cidx == -1:
1065 | continue
1066 | prev_midx = -1
1067 | while True:
1068 | vecs = self.dataset.normalized_vectors[new_labels == cidx]
1069 | vindices = np.where(new_labels == cidx)[0]
1070 | midx = vindices[np.argmin(np.sum(cdist(vecs, vecs), axis=0))]
1071 | if midx == prev_midx:
1072 | break
1073 | prev_midx = midx
1074 | m = self.dataset.normalized_vectors[midx]
1075 | for vidx, v in zip(vindices, vecs):
1076 | if corr(v, m) < centroid_correction_threshold:
1077 | new_labels[vidx] = -1
1078 | return new_labels
1079 |
1080 | def __calc_centroid(self, cluster_labels):
1081 | centroids = []
1082 | centroids_stdev = []
1083 | #medoids = []
1084 | for lbl in sorted(list(set(cluster_labels))):
1085 | if lbl == -1:
1086 | continue
1087 | cl_vecs = self.dataset.normalized_vectors[cluster_labels == lbl, :]
1088 | #cl_dists = scipy.spatial.distance.cdist(cl_vecs, cl_vecs, metric)
1089 | #medoid = cl_vecs[np.argmin(np.sum(cl_dists, axis=0))]
1090 | centroid = np.mean(cl_vecs, axis=0)
1091 | centroid_stdev = np.std(cl_vecs, axis=0)
1092 | #medoids.append(medoid)
1093 | centroids.append(centroid)
1094 | centroids_stdev.append(centroid_stdev)
1095 | return centroids, centroids_stdev#, medoids
1096 |
    def cluster_vectors(self, pca_dims=10, min_cluster_size=0, resolution=0.6, prune=1.0/15.0, snn_neighbors=30, max_correlation=1.0,
                        metric="correlation", subclustering=False, dbscan_eps=0.4, centroid_correction_threshold=0.8, random_state=0):
        """
        Cluster the given vectors using the specified clustering method.

        :param pca_dims: Number of principal componants used for clustering.
        :type pca_dims: int
        :param min_cluster_size: Set minimum cluster size.
        :type min_cluster_size: int
        :param resolution: Resolution for Louvain community detection.
        :type resolution: float
        :param prune: Threshold for Jaccard index (weight of SNN network). If it is smaller than prune, it is set to zero.
        :type prune: float
        :param snn_neighbors: Number of neighbors for SNN network.
        :type snn_neighbors: int
        :param max_correlation: Clusters with higher correlation to this value will be merged.
        :type max_correlation: bool
        :param metric: Metric for calculation of distance between vectors in gene expression space.
        :type metric: str
        :param subclustering: If True, each cluster will be clustered once again with DBSCAN algorithm to find more subclusters.
        :type subclustering: bool
        :param dbscan_eps: `eps` parameter of DBSCAN, used only when `subclustering` is True.
        :type dbscan_eps: float
        :param centroid_correction_threshold: Centroid will be recalculated with the vectors
            which have the correlation to the cluster medoid equal or higher than this value.
        :type centroid_correction_threshold: float
        :param random_state: Random seed or scikit-learn's random state object to replicate the same result
        :type random_state: int or random state object
        """

        vecs_normalized = self.dataset.normalized_vectors
        # Clustering operates in PCA space, not on the raw normalized vectors.
        vecs_normalized_dimreduced = PCA(n_components=pca_dims, random_state=random_state).fit_transform(vecs_normalized)

        def cluster_vecs(vecs):
            # Louvain community detection on a shared-nearest-neighbor graph
            # built from a kNN connectivity graph.
            k = min(snn_neighbors, vecs.shape[0])
            knn_graph = kneighbors_graph(vecs, k, mode='connectivity', include_self=True, metric=metric).todense()
            intersections = np.dot(knn_graph, knn_graph.T)
            snn_graph = intersections / (k + (k - intersections)) # borrowed from Seurat
            snn_graph[snn_graph < prune] = 0
            G = nx.from_numpy_matrix(snn_graph)
            partition = community.best_partition(G, resolution=resolution, random_state=random_state)
            lbls = np.array(list(partition.values()))
            return lbls

        def remove_small_clusters(lbls, lbls2=None):
            # Mark clusters smaller than min_cluster_size as noise (-1) and
            # renumber the surviving labels consecutively from 0. If lbls2 is
            # given, apply the same relabeling to it in lockstep.
            small_clusters = []
            cluster_indices = []
            lbls = np.array(lbls)
            for lbl in np.unique(lbls):
                if lbl == -1:
                    continue
                cnt = np.sum(lbls == lbl)
                if cnt < min_cluster_size:
                    small_clusters.append(lbl)
                else:
                    cluster_indices.append(lbl)
            for lbl in small_clusters:
                lbls[lbls == lbl] = -1
            tmp = np.array(lbls, copy=True)
            for i, idx in enumerate(cluster_indices):
                lbls[tmp == idx] = i
            if lbls2 is not None:
                for lbl in small_clusters:
                    lbls2[lbls2 == lbl] = -1
                tmp = np.array(lbls2, copy=True)
                for i, idx in enumerate(cluster_indices):
                    lbls2[tmp == idx] = i
                return lbls, lbls2
            else:
                return lbls

        if subclustering:
            # Two-stage clustering: Louvain super-clusters, each refined with
            # DBSCAN; final labels are numbered globally across super-clusters.
            super_lbls = cluster_vecs(vecs_normalized_dimreduced)
            dbscan = DBSCAN(eps=dbscan_eps, min_samples=min_cluster_size, metric=metric)
            all_lbls = np.zeros_like(super_lbls)
            global_lbl_idx = 0
            for super_lbl in set(list(super_lbls)):
                super_lbl_idx = np.where(super_lbls == super_lbl)[0]
                if super_lbl == -1:
                    all_lbls[super_lbl_idx] = -1
                    continue
                sub_lbls = dbscan.fit(vecs_normalized_dimreduced[super_lbl_idx]).labels_
                for sub_lbl in set(list(sub_lbls)):
                    if sub_lbl == -1:
                        all_lbls[tuple([super_lbl_idx[sub_lbls == sub_lbl]])] = -1
                        continue
                    all_lbls[tuple([super_lbl_idx[sub_lbls == sub_lbl]])] = global_lbl_idx
                    global_lbl_idx += 1
        else:
            all_lbls = cluster_vecs(vecs_normalized_dimreduced)

        # First pass: drop low-correlation members, small clusters, and
        # compute initial centroids.
        new_labels = self.__correct_cluster_labels(all_lbls, centroid_correction_threshold)
        new_labels, all_lbls = remove_small_clusters(new_labels, all_lbls)
        centroids, centroids_stdev = self.__calc_centroid(new_labels)

        # Merge clusters whose centroids correlate above max_correlation
        # (hierarchical clustering of centroids with correlation distance).
        merge_candidates = []
        if max_correlation < 1.0:
            Z = scipy.cluster.hierarchy.linkage(centroids, metric='correlation')
            clbls = scipy.cluster.hierarchy.fcluster(Z, 1 - max_correlation, 'distance')
            for i in set(clbls):
                leaf_indices = np.where(clbls == i)[0]
                if len(leaf_indices) > 1:
                    merge_candidates.append(leaf_indices)
            removed_indices = []
            for cand in merge_candidates:
                for i in cand[1:]:
                    all_lbls[all_lbls == i] = cand[0]
                    removed_indices.append(i)
            for i in sorted(removed_indices, reverse=True):
                all_lbls[all_lbls > i] -= 1

        # Second pass (runs regardless of merging): recompute filtered labels
        # and centroids from the final label assignment.
        new_labels = self.__correct_cluster_labels(all_lbls, centroid_correction_threshold)
        new_labels, all_lbls = remove_small_clusters(new_labels, all_lbls)
        centroids, centroids_stdev = self.__calc_centroid(new_labels)

        self.dataset.cluster_labels = all_lbls
        self.dataset.filtered_cluster_labels = new_labels
        self.dataset.centroids = np.array(centroids)
        self.dataset.centroids_stdev = np.array(centroids_stdev)
        #self.dataset.medoids = np.array(medoids)

        self.__m__("Found %d clusters"%len(centroids))
        return
1218 |
    def rescue_cluster(self, gene_names, expression_thresholds=[]):
        """
        Manually create a new cluster from the local maxima that express the given marker genes.

        The local maxima whose expression of *every* gene in `gene_names` exceeds
        the corresponding threshold are reassigned to a brand-new cluster, and the
        new cluster's centroid and stdev are appended to the existing arrays.

        :param gene_names: Names of the marker genes that define the rescued cluster.
        :type gene_names: list(str)
        :param expression_thresholds: Minimum expression of each marker gene. When
            empty, a per-gene threshold is derived automatically with Otsu's method.
        :type expression_thresholds: list(float)
        """
        assert len(gene_names) > 0
        assert len(expression_thresholds) == 0 or len(gene_names) == len(expression_thresholds)
        
        expression_thresholds = list(expression_thresholds)
        # Gene expression vectors at every local maximum.
        lm_vectors = self.dataset.vf[self.dataset.local_maxs[0], self.dataset.local_maxs[1], self.dataset.local_maxs[2], :]
        lm_mask = np.ones(len(lm_vectors), dtype=bool)
        for i in range(len(gene_names)):
            rg_idx = self.dataset.genes.index(gene_names[i])
            if len(expression_thresholds) == 0:
                # No explicit thresholds: derive one from the gene's density map.
                expression_threshold = filters.threshold_otsu(self.dataset.vf[..., rg_idx])
            else:
                expression_threshold = float(expression_thresholds[i])
            # Keep only the maxima expressing ALL marker genes above threshold.
            lm_mask = np.logical_and(lm_mask, lm_vectors[:, rg_idx] > expression_threshold)
        
        rg_vectors = lm_vectors[lm_mask]
        rg_centroid = np.mean(rg_vectors, axis=0)
        rg_centroid_stdev = np.std(rg_vectors, axis=0)
        
        # The new cluster takes the next unused label index.
        self.dataset.cluster_labels[lm_mask] = len(self.dataset.centroids)
        self.dataset.filtered_cluster_labels[lm_mask] = len(self.dataset.centroids)
        self.dataset.centroids = np.append(self.dataset.centroids, [rg_centroid], axis=0)
        self.dataset.centroids_stdev = np.append(self.dataset.centroids_stdev, [rg_centroid_stdev], axis=0)
1242 |
1243 | def exclude_and_merge_clusters(self, exclude=[], merge=[], centroid_correction_threshold=0.8):
1244 | """
1245 | Exclude bad clusters (including the vectors in the clusters), and merge similar clusters for the downstream analysis.
1246 |
1247 | :param exclude: List of cluster indices to be excluded.
1248 | :type exclude: list(int)
1249 | :param merge: List of list of cluster indices to be merged.
1250 | :type merge: list(list(int))
1251 | :param centroid_correction_threshold: Centroid will be recalculated with the vectors
1252 | which have the correlation to the cluster medoid equal or higher than this value.
1253 | :type centroid_correction_threshold: float
1254 | """
1255 | exclude = list(exclude)
1256 | merge = np.array(merge)
1257 | for centroids in merge:
1258 | centroids = np.unique(centroids)
1259 | for centroid in centroids[1:][::-1]:
1260 | self.dataset.cluster_labels[self.dataset.cluster_labels == centroid] = centroids[0]
1261 | exclude.append(centroid)
1262 | exclude = sorted(exclude)
1263 |
1264 | mask = np.ones(len(self.dataset.centroids), np.bool)
1265 | mask[exclude] = False
1266 |
1267 | #self.dataset.centroids = self.dataset.centroids[mask]
1268 | #self.dataset.centroids_stdev = self.dataset.centroids_stdev[mask]
1269 | #self.dataset.medoids = self.dataset.medoids[mask]
1270 |
1271 | mask = np.ones(len(self.dataset.cluster_labels), np.bool)
1272 | for centroid in exclude:
1273 | # There will be no vectors for already merged centroids - so there is no problem
1274 | mask[np.array(self.dataset.cluster_labels) == centroid] = False
1275 | self.dataset.cluster_labels = self.dataset.cluster_labels[mask]
1276 | self.dataset.local_maxs = tuple([lm[mask] for lm in self.dataset.local_maxs])
1277 |
1278 | for centroid in exclude[::-1]:
1279 | self.dataset.cluster_labels[self.dataset.cluster_labels > centroid] -= 1
1280 | self.dataset.normalized_vectors = self.dataset.normalized_vectors[mask, :]
1281 |
1282 | new_labels = self.__correct_cluster_labels(self.dataset.cluster_labels, centroid_correction_threshold)
1283 | centroids, centroids_stdev = self.__calc_centroid(new_labels)
1284 |
1285 | self.dataset.centroids = centroids
1286 | self.dataset.centroids_stdev = centroids_stdev
1287 | self.dataset.filtered_cluster_labels = new_labels
1288 |
1289 | return
1290 |
1291 | def map_celltypes(self, centroids=None):
1292 | """
1293 | Create correlation maps between the centroids and the vector field.
1294 | Each correlation map corresponds each cell type map.
1295 |
1296 | :param centroids: If given, map celltypes with the given cluster centroids.
1297 | :type centroids: list(np.array(int))
1298 | """
1299 |
1300 | if self.dataset.normalized_vf is None:
1301 | normalized_vf = self.dataset.vf
1302 | else:
1303 | normalized_vf = self.dataset.normalized_vf
1304 |
1305 | if centroids is None:
1306 | centroids = self.dataset.centroids
1307 | else:
1308 | self.dataset.centroids = centroids
1309 |
1310 | max_corr = np.zeros_like(self.dataset.vf_norm) - 1 # range from -1 to +1
1311 | max_corr_idx = np.zeros_like(self.dataset.vf_norm, dtype=int) - 1 # -1 for background
1312 | for cidx, centroid in enumerate(centroids):
1313 | ctmap = calc_ctmap(centroid, normalized_vf, self.ncores)
1314 | ctmap = np.nan_to_num(ctmap)
1315 | mask = max_corr < ctmap
1316 | max_corr[mask] = ctmap[mask]
1317 | max_corr_idx[mask] = cidx
1318 | self.dataset.max_correlations = max_corr
1319 | self.dataset.celltype_maps = max_corr_idx
1320 | return
1321 |
1322 | def filter_celltypemaps(self, min_r=0.6, min_norm=0.1, fill_blobs=True, min_blob_area=0, filter_params={}, output_mask=None):
1323 | """
1324 | Post-filter cell type maps created by `map_celltypes`.
1325 |
1326 | :param min_r: minimum threshold of the correlation.
1327 | :type min_r: float
1328 | :param min_norm: minimum threshold of the vector norm.
1329 | If a string is given instead, then the threshold is automatically determined using
1330 | sklearn's `threshold filter functions `_ (The functions start with `threshold_`).
1331 | :type min_norm: str or float
1332 | :param fill_blobs: If true, then the algorithm automatically fill holes in each blob.
1333 | :type fill_blobs: bool
1334 | :param min_blob_area: The blobs with its area less than this value will be removed.
1335 | :type min_blob_area: int
1336 | :param filter_params: Filter parameters used for the sklearn's threshold filter functions.
1337 | Not used when `min_norm` is float.
1338 | :type filter_params: dict
1339 | :param output_mask: If given, the cell type maps will be filtered using the output mask.
1340 | :type output_mask: np.ndarray(bool)
1341 | """
1342 |
1343 | if isinstance(min_norm, str):
1344 | # filter_params dict will be used for kwd params for filter_* functions.
1345 | # some functions doesn't support param 'offset', therefore temporariliy remove it from here
1346 | filter_offset = filter_params.pop('offset', 0)
1347 |
1348 | filtered_ctmaps = np.zeros_like(self.dataset.celltype_maps) - 1
1349 | mask = np.zeros_like(self.dataset.vf_norm, dtype=bool)
1350 | for cidx in range(len(self.dataset.centroids)):
1351 | ctcorr = self.dataset.get_celltype_correlation(cidx)
1352 | if isinstance(min_norm, str):
1353 | for z in range(self.dataset.shape[2]):
1354 | if min_norm in ["local", "niblack", "sauvola", "localotsu"]:
1355 | im = np.zeros(self.dataset.vf_norm.shape[:-1])
1356 | im[ctcorr[..., z] > min_r] = self.dataset.vf_norm[..., z][ctcorr[..., z] > min_r]
1357 | if min_norm == "localotsu":
1358 | max_norm = np.max(im)
1359 | im /= max_norm
1360 | selem = disk(filter_params['radius'])
1361 | min_norm_cut = filters.rank.otsu(im, selem) * max_norm
1362 | else:
1363 | filter_func = getattr(filters, "threshold_" + min_norm)
1364 | if min_norm in ["local", "niblack", "sauvola"]:
1365 | min_norm_cut = filter_func(im, **filter_params)
1366 | else:
1367 | highr_norm = self.dataset.vf_norm[..., z][ctcorr[..., z] > min_r]
1368 | #sigma = np.std(highr_norm)
1369 | if len(highr_norm) == 0 or np.max(highr_norm) == np.min(highr_norm):
1370 | min_norm_cut = np.max(self.dataset.vf_norm)
1371 | else:
1372 | min_norm_cut = filter_func(highr_norm, **filter_params)
1373 | min_norm_cut += filter_offset # manually apply filter offset
1374 | mask[..., z][np.logical_and(self.dataset.vf_norm[..., z] > min_norm_cut, ctcorr[..., z] > min_r)] = 1
1375 | else:
1376 | mask[np.logical_and(self.dataset.vf_norm > min_norm, ctcorr > min_r)] = 1
1377 |
1378 | if min_blob_area > 0 or fill_blobs:
1379 | blob_labels = measure.label(mask, background=0)
1380 | for bp in measure.regionprops(blob_labels):
1381 | if min_blob_area > 0 and bp.filled_area < min_blob_area:
1382 | for c in bp.coords:
1383 | mask[c[0], c[1], c[2]] = 0 # fill with zeros
1384 | #mask[c[0], c[1]] = 0 # fill with zeros
1385 | continue
1386 | if fill_blobs and bp.area != bp.filled_area:
1387 | minx, miny, minz, maxx, maxy, maxz = bp.bbox
1388 | mask[minx:maxx, miny:maxy, minz:maxz] |= bp.filled_image
1389 | #minr, minc, maxr, maxc = bp.bbox
1390 | #mask[minr:maxr, minc:maxc] |= bp.filled_image
1391 |
1392 | filtered_ctmaps[np.logical_and(mask == 1, np.logical_or(self.dataset.celltype_maps == -1, self.dataset.celltype_maps == cidx))] = cidx
1393 |
1394 | if isinstance(min_norm, str):
1395 | # restore offset param
1396 | filter_params['offset'] = filter_offset
1397 |
1398 | if output_mask is not None:
1399 | filtered_ctmaps[~output_mask.astype(bool)] = -1
1400 | self.dataset.filtered_celltype_maps = filtered_ctmaps
1401 |
    def bin_celltypemaps(self, step=10, radius=100):
        """
        Sweep a sphere window along a lattice on the image, and count the number of cell types in each window.

        Results are stored on the dataset: `celltype_binned_centers` (the cell type at
        each lattice point) and `celltype_binned_counts` (per-window cell type counts).

        :param step: The lattice spacing.
        :type step: int
        :param radius: The radius of the sphere window.
        :type radius: int
        """
        def make_sphere_mask(radius):
            # Boolean 3D mask of shape (2r+1, 2r+1, 2r+1), True inside the sphere.
            dia = radius*2+1
            X, Y, Z = np.ogrid[:dia, :dia, :dia]
            dist_from_center = np.sqrt((X - radius)**2 + (Y - radius)**2 + (Z - radius)**2)
            mask = dist_from_center <= radius
            return mask

        # Build a lattice of window centers with spacing `step`, aligned so the
        # lattice is centered on the image.
        centers = np.array(self.dataset.vf_norm.shape) // 2
        steps = np.array(np.floor(centers / step) * 2 + np.array(self.dataset.vf_norm.shape) % 2, dtype=int)
        starts = centers - step * np.floor(centers / step)
        ends = starts + steps * step
        X, Y, Z = [np.arange(s, e, step, dtype=int) for s, e in zip(starts, ends)]

        # Cell type at each lattice point, and per-window per-cell-type counts.
        ct_centers = np.zeros([len(X), len(Y), len(Z)], dtype=int)
        ct_counts = np.zeros([len(X), len(Y), len(Z), len(self.dataset.centroids)], dtype=int)

        ncelltypes = np.max(self.dataset.filtered_celltype_maps) + 1
        cnt_matrix = np.zeros([ncelltypes, ncelltypes])  # NOTE(review): appears unused in this method — confirm before removing
        sphere_mask = make_sphere_mask(radius)

        for xidx, x in enumerate(X):
            for yidx, y in enumerate(Y):
                for zidx, z in enumerate(Z):
                    # Window bounds [s, e) around the lattice point, plus matching
                    # slices into the sphere mask; both are clipped below whenever
                    # the window extends past an image boundary.
                    mask_slices = [slice(0, radius*2+1), slice(0, radius*2+1), slice(0, radius*2+1)]
                    s = [x - radius, y - radius, z - radius ]
                    e = [x + radius + 1, y + radius + 1, z + radius + 1]

                    for ms_idx, ms in enumerate(s):
                        if ms < 0:
                            # Clip at the lower image boundary and skip the same
                            # amount at the start of the mask.
                            mask_slices[ms_idx] = slice(abs(ms), mask_slices[ms_idx].stop)
                            s[ms_idx] = 0
                    for me_idx, me in enumerate(e):
                        ctmap_size = self.dataset.filtered_celltype_maps.shape[me_idx]
                        #ctmap_size = 50
                        if me > ctmap_size:
                            # Clip at the upper image boundary and shorten the mask
                            # by the same amount at its end.
                            mask_slices[me_idx] = slice(mask_slices[me_idx].start, (radius * 2 + 1) + ctmap_size - me)
                            e[me_idx] = ctmap_size

                    # Cell types inside the (clipped) spherical window, shifted by +1
                    # so background (-1) lands in bin 0 and is dropped by [1:] below.
                    w = self.dataset.filtered_celltype_maps[s[0]:e[0],
                                                           s[1]:e[1],
                                                           s[2]:e[2]][sphere_mask[tuple(mask_slices)]] + 1

                    ct_centers[xidx, yidx, zidx] = self.dataset.filtered_celltype_maps[x, y, z]
                    ct_counts[xidx, yidx, zidx] = np.bincount(np.ravel(w), minlength=len(self.dataset.centroids) + 1)[1:]

        self.dataset.celltype_binned_centers = ct_centers
        self.dataset.celltype_binned_counts = ct_counts
        return
1459 |
1460 | def find_domains(self, centroid_indices=[], n_clusters=10, norm_thres=0, merge_thres=0.6, merge_remote=True):
1461 | """
1462 | Find domains in the image, using the result of `bin_celltypemaps`.
1463 |
1464 | :param centroid_indices: The indices of centroids which will be used for determine tissue domains.
1465 | :type centroid_indices: list(int)
1466 | :param n_clusters: Initial number of clusters (domains) of agglomerative clustering.
1467 | :type n_clusters: int
1468 | :param norm_thres: Threshold for the total number of cell types in each window.
1469 | The window which contains the number of cell-type pixels less than this value will be ignored.
1470 | :type norm_thres: int
1471 | :param merge_thres: Threshold for merging domains. The centroids of the domains
1472 | which have higher correlation to this value will be merged.
1473 | :type merge_thres: float
1474 | :param merge_remote: If true, allow merging clusters that are not adjacent to each other.
1475 | :type merge_remote: bool
1476 | """
1477 | def find_neighbors(m, l):
1478 | neighbors = set()
1479 | for x, y, z in zip(*np.where(m == l)):
1480 | neighbors.add(m[x - 1, y , z ])
1481 | neighbors.add(m[x + 1, y , z ])
1482 | neighbors.add(m[x , y - 1, z ])
1483 | neighbors.add(m[x , y + 1, z ])
1484 | neighbors.add(m[x , y , z - 1])
1485 | neighbors.add(m[x , y , z + 1])
1486 | neighbors.add(m[x - 1, y - 1, z ])
1487 | neighbors.add(m[x + 1, y - 1, z ])
1488 | neighbors.add(m[x - 1, y + 1, z ])
1489 | neighbors.add(m[x + 1, y + 1, z ])
1490 | neighbors.add(m[x - 1, y , z - 1])
1491 | neighbors.add(m[x + 1, y , z - 1])
1492 | neighbors.add(m[x - 1, y , z + 1])
1493 | neighbors.add(m[x + 1, y , z + 1])
1494 | neighbors.add(m[x , y - 1, z - 1])
1495 | neighbors.add(m[x , y + 1, z - 1])
1496 | neighbors.add(m[x , y - 1, z + 1])
1497 | neighbors.add(m[x , y + 1, z + 1])
1498 | neighbors.add(m[x - 1, y - 1, z - 1])
1499 | neighbors.add(m[x + 1, y - 1, z - 1])
1500 | neighbors.add(m[x - 1, y - 1, z + 1])
1501 | neighbors.add(m[x + 1, y - 1, z + 1])
1502 | neighbors.add(m[x - 1, y + 1, z - 1])
1503 | neighbors.add(m[x + 1, y + 1, z - 1])
1504 | neighbors.add(m[x - 1, y + 1, z + 1])
1505 | neighbors.add(m[x + 1, y + 1, z + 1])
1506 | return neighbors
1507 |
1508 | if self.dataset.celltype_binned_counts is None:
1509 | raise AssertionError("Run 'bin_celltypemap()' method first!")
1510 |
1511 | if len(centroid_indices) > 0:
1512 | binned_ctmaps = self.dataset.celltype_binned_counts[..., centroid_indices]
1513 | else:
1514 | binned_ctmaps = self.dataset.celltype_binned_counts
1515 |
1516 | binned_ctmaps_norm = np.sum(binned_ctmaps, axis=3)
1517 |
1518 | ctvf_vecs = binned_ctmaps[binned_ctmaps_norm > norm_thres]
1519 | ctvf_vecs_normalized = preprocessing.normalize(ctvf_vecs, norm='l1', axis=1)
1520 |
1521 | clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward', affinity='euclidean').fit(ctvf_vecs_normalized)
1522 | labels_predicted = clustering.labels_ + 1
1523 |
1524 | layer_map = np.zeros(binned_ctmaps_norm.shape)
1525 | layer_map[binned_ctmaps_norm > norm_thres] = labels_predicted
1526 | layer_map = measure.label(layer_map)
1527 |
1528 | if merge_thres < 1.0:
1529 | while True:
1530 | uniq_labels = np.array(list(set(list(np.ravel(layer_map))) - set([0])))
1531 | if not merge_remote:
1532 | layer_map_padded = np.pad(layer_map, 1, mode='constant', constant_values=0)
1533 | neighbors_dic = {}
1534 | for lbl in uniq_labels:
1535 | neighbors_dic[lbl] = find_neighbors(layer_map_padded, lbl)
1536 | cluster_centroids = []
1537 | for lbl in uniq_labels:
1538 | cluster_centroids.append(np.mean(binned_ctmaps[layer_map == lbl], axis=0))
1539 | max_corr = 0
1540 | #max_corr_indices = (0, 0, )
1541 | for i in range(len(uniq_labels)):
1542 | for j in range(i+1, len(uniq_labels)):
1543 | lbl_i, lbl_j = uniq_labels[i], uniq_labels[j]
1544 | if lbl_i == 0 or lbl_j == 0:
1545 | continue
1546 | corr_ij = corr(cluster_centroids[i], cluster_centroids[j])
1547 | if corr_ij > max_corr and (merge_remote or lbl_j in neighbors_dic[lbl_i]):
1548 | max_corr = corr_ij
1549 | max_corr_indices = (lbl_i, lbl_j, )
1550 | if max_corr > merge_thres:
1551 | layer_map[layer_map == max_corr_indices[1]] = max_corr_indices[0]
1552 | else:
1553 | break
1554 |
1555 | """
1556 | if min_size > 0:
1557 | labeled_layer_map = measure.label(layer_map)
1558 | labeled_layer_map_padded = np.pad(labeled_layer_map, 1, mode='constant', constant_values=0)
1559 | for prop in measure.regionprops(labeled_layer_map):
1560 | if prop.area < min_size:
1561 | find_neighbors(layer_map_padded, )
1562 | """
1563 |
1564 | uniq_labels = sorted(set(list(np.ravel(layer_map))) - set([0]))
1565 | for i, lbl in enumerate(uniq_labels, start=1):
1566 | layer_map[layer_map == lbl] = i
1567 |
1568 | resized_layer_map = zoom(layer_map, np.array(self.dataset.vf_norm.shape)/np.array(layer_map.shape), order=0) - 1
1569 | resized_layer_map2 = np.array(resized_layer_map, copy=True)
1570 | resized_layer_map2[self.dataset.filtered_celltype_maps == -1] = -1
1571 |
1572 | self.dataset.inferred_domains = resized_layer_map
1573 | self.dataset.inferred_domains_cells = resized_layer_map2
1574 |
1575 | def exclude_and_merge_domains(self, exclude=[], merge=[]):
1576 | """
1577 | Manually exclude or merge domains.
1578 |
1579 | :param exclude: Indices of the domains which will be excluded.
1580 | :type exclude: list(int)
1581 | :param merge: List of indices of the domains which will be merged.
1582 | :type merge: list(list(int))
1583 | """
1584 | for i in exclude:
1585 | self.dataset.inferred_domains[self.dataset.inferred_domains == i] = -1
1586 | self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == i] = -1
1587 |
1588 | for i in merge:
1589 | for j in i[1:]:
1590 | self.dataset.inferred_domains[self.dataset.inferred_domains == j] = i[0]
1591 | self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == j] = i[0]
1592 |
1593 | uniq_indices = np.unique(self.dataset.inferred_domains_cells)
1594 | if -1 in uniq_indices:
1595 | uniq_indices = uniq_indices[1:]
1596 |
1597 | for new_idx, i in enumerate(uniq_indices):
1598 | self.dataset.inferred_domains[self.dataset.inferred_domains == i] = new_idx
1599 | self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == i] = new_idx
1600 |
1601 | def calc_cell_type_compositions(self):
1602 | """
1603 | Calculate cell type compositions in each domain.
1604 | """
1605 | cell_type_compositions = []
1606 | for i in range(np.max(self.dataset.inferred_domains) + 1):
1607 | counts = np.bincount(self.dataset.filtered_celltype_maps[self.dataset.inferred_domains == i] + 1, minlength=len(self.dataset.centroids) + 1)
1608 | cell_type_compositions.append(counts[1:])
1609 |
1610 | masked_ctmap = self.dataset.filtered_celltype_maps[self.dataset.filtered_celltype_maps != -1]
1611 | counts_all = np.array(np.bincount(masked_ctmap, minlength=len(self.dataset.centroids)), dtype=float)
1612 | cell_type_compositions.append(counts_all) # Add proportion from the whole tissue
1613 | cell_type_compositions = preprocessing.normalize(cell_type_compositions, axis=1, norm='l1')
1614 | self.dataset.inferred_domains_compositions = cell_type_compositions
1615 |
1616 |
1617 | def calc_spatial_relationship(self):
1618 | """
1619 | Calculate spatial relationship between the domains using the result of `bin_celltypemap`.
1620 | """
1621 | if self.dataset.celltype_binned_counts is None:
1622 | raise AssertionError("Run 'bin_celltypemap()' method first!")
1623 |
1624 | ct_centers = self.dataset.celltype_binned_centers
1625 |
1626 | sparel = np.zeros([len(self.dataset.centroids), len(self.dataset.centroids)])
1627 | for idx in np.unique(ct_centers):
1628 | sparel[idx, :] = np.sum(self.dataset.celltype_binned_counts[ct_centers == idx], axis=0)
1629 |
1630 | self.dataset.spatial_relationships = preprocessing.normalize(sparel, axis=1, norm='l1')
1631 |
--------------------------------------------------------------------------------