├── .github └── workflows │ └── python-publish.yml ├── CHANGELOG ├── LICENSE ├── README.rst ├── c └── utils.cpp ├── doc ├── Makefile ├── conf.py ├── images │ ├── de_novo.png │ ├── de_novo_celltype.png │ ├── diagplot_centroid_2.png │ ├── diagplot_centroid_30.png │ ├── diagplot_centroid_5.png │ ├── diagplot_centroid_8.png │ ├── domain_composition.png │ ├── domain_composition_all.png │ ├── domains.png │ ├── domains_individual.png │ ├── final.png │ ├── guided.png │ ├── kernel_bw.png │ ├── local_max_threshold_gene.png │ ├── local_max_threshold_knn.png │ ├── local_max_threshold_knn2.png │ ├── local_max_threshold_knn3.png │ ├── local_max_threshold_total.png │ ├── mask.png │ ├── maxima.png │ ├── segmented_celltype_map.png │ ├── tsne.png │ ├── tsne_final.png │ ├── tsne_merged.png │ └── tsne_removed.png ├── index.rst ├── ssam.rst ├── userguide.rst └── userguide │ ├── 01-tldr.rst │ ├── 02-installation.rst │ ├── 03-data.rst │ ├── 04-kde.rst │ ├── 05-kernel_shape.rst │ ├── 06-kernel_bandwidth.rst │ ├── 07-input_mask.rst │ ├── 08-guided.rst │ ├── 09-celltype_map_thresh_g.rst │ ├── 10-de_novo.rst │ ├── 11-max_filtering.rst │ ├── 12-clustering.rst │ ├── 13-diagnostic.rst │ ├── 14-cluster_annotation.rst │ ├── 15-celltype_map_thresh_d.rst │ ├── 16-visualisation.rst │ ├── 17-domain.rst │ ├── 18-composition.rst │ ├── 19-experimental.rst │ ├── 20-aaec.rst │ └── 21-segment_celltype_map.rst ├── requirements.txt ├── setup.py └── ssam └── __init__.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | workflow_dispatch: 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine numpy 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 2019-10-13 2 | v 1.0.0b – Initial release 3 | 2019-10-19 4 | v 1.0.1 - Added documentations, corrected the default parameters of the methods 5 | 2021-04-16 6 | v 1.0.2 - Added more documentations (read the docs), minor bug fixes 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 
76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 
134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 
197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. 
This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. 
But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 
375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. 
You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. 
"Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. 
This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 
611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>. -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Notice 2 | ====== 3 | 4 | This repository is no longer maintained. Further development of SSAM will be continued in the https://github.com/pnucolab/ssam repository. 
5 | 6 | SSAM (Spot-based Spatial cell-type Analysis by Multidimensional mRNA density estimation) 7 | ======================================================================================== 8 | 9 | Authors: Jeongbin Park (jeongbin.park@charite.de)\ :sup:`1,2` and Wonyl Choi (wonyl@bu.edu)\ :sup:`3` 10 | 11 | :sup:`1`\ Digital Health Center, Berlin Institute of Health (BIH) and Charité – Universitätsmedizin, Berlin, Germany; :sup:`2`\ Faculty of Biosciences, Heidelberg University, Heidelberg, Germany; :sup:`3`\ Department of Computer Science, Boston University, Boston, the United States of America 12 | 13 | (Not referring to this :laughing:: https://en.wikipedia.org/wiki/Ssam) 14 | 15 | This project was carried out under the supervision of Dr. Naveed Ishaque (naveed.ishaque@charite.de) and Prof. Roland Eils (roland.eils@charite.de), in collaboration with the SpaceTx consortium and the Human Cell Atlas project. 16 | 17 | Please also check our example Jupyter notebooks here: https://github.com/eilslabs/ssam_example 18 | 19 | Prerequisites 20 | ============= 21 | 22 | SSAM has currently only been tested with Python 3 in a Linux environment. In addition to this package, SSAM requires a local R installation with the packages ``feather`` and ``sctransform`` pre-installed. For details, please follow the instructions here: https://ssam.readthedocs.io/en/release/userguide/01-tldr.html#installation 23 | 24 | Install 25 | ======= 26 | 27 | https://ssam.readthedocs.io/en/release/userguide/01-tldr.html#installation 28 | 29 | Documentation 30 | ============= 31 | 32 | https://ssam.readthedocs.io/ 33 | 34 | Citations 35 | ========= 36 | 37 | Jeongbin Park, Wonyl Choi, Sebastian Tiesmeyer, Brian Long, Lars E. Borm, Emma Garren, Thuc Nghi Nguyen, Bosiljka Tasic, Simone Codeluppi, Tobias Graf, Matthias Schlesner, Oliver Stegle, Roland Eils & Naveed Ishaque. "`Cell segmentation-free inference of cell types from in situ transcriptomics data <https://doi.org/10.1038/s41467-021-23807-4>`_." *Nature Communications* **12**, 3545 (2021). 38 | 39 | License 40 | ======= 41 | 42 | Copyright (C) 2018 Jeongbin Park and Wonyl Choi 43 | 44 | This program is free software: you can redistribute it and/or modify 45 | it under the terms of the GNU Affero General Public License as published 46 | by the Free Software Foundation, either version 3 of the License, or 47 | (at your option) any later version. 48 | 49 | This program is distributed in the hope that it will be useful, 50 | but WITHOUT ANY WARRANTY; without even the implied warranty of 51 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 52 | GNU Affero General Public License for more details. 53 | 54 | You should have received a copy of the GNU Affero General Public License 55 | along with this program. If not, see https://www.gnu.org/licenses/. 
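Quick start
===========

A minimal sketch of the de novo workflow described in the user guide linked above. The class and method names are taken from the SSAM documentation as recalled here, and the dummy input data and all parameter values are illustrative assumptions only — consult the linked TL;DR guide for the authoritative steps::

    import numpy as np
    import ssam

    # Hypothetical toy input: a list of gene names and, per gene, an
    # (N x 2) array of mRNA spot coordinates (in micrometers) inside a
    # 100 x 100 um imaged area. Real inputs come from your experiment.
    genes = ["Gad1", "Slc17a7"]
    locations = [np.random.rand(100, 2) * 100 for _ in genes]

    ds = ssam.SSAMDataset(genes, locations, 100, 100)
    analysis = ssam.SSAMAnalysis(ds, ncores=8, save_dir="kde/", verbose=True)

    analysis.run_kde(bandwidth=2.5)                      # gene-expression vector field
    analysis.find_localmax(search_size=3, min_norm=0.2)  # candidate cell-type signatures
    analysis.normalize_vectors_sctransform()             # calls R's sctransform via feather
    analysis.cluster_vectors(pca_dims=22, resolution=0.15)
    analysis.map_celltypes()                             # project centroids back onto the field

The ``normalize_vectors_sctransform`` step is the reason for the R prerequisite noted above.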
56 | -------------------------------------------------------------------------------- /c/utils.cpp: -------------------------------------------------------------------------------- 1 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #if defined(_OPENMP) 8 | #include 9 | #else 10 | typedef int omp_int_t; 11 | inline omp_int_t omp_get_thread_num() { return 0;} 12 | inline omp_int_t omp_get_max_threads() { return 1;} 13 | #endif 14 | 15 | #include 16 | #include "numpy/npy_math.h" 17 | #include "numpy/arrayobject.h" 18 | 19 | #define I2D(X, Y, YL) ((X) * (YL) + (Y)) 20 | #define I3D(X, Y, Z, YL, ZL) (((X) * (YL) * (ZL)) + ((Y) * (ZL)) + (Z)) 21 | 22 | struct pos2d { 23 | long x; 24 | long y; 25 | }; 26 | 27 | struct pos3d { 28 | long x; 29 | long y; 30 | long z; 31 | }; 32 | 33 | static double gauss_kernel(double x, double y, double z) { 34 | return exp(-0.5 * (x*x + y*y + z*z)); // this is not normalized 35 | } 36 | 37 | static void kde(double bandwidth, double *x, double *y, double *z, double* query_x, double* query_y, double *query_z, double *rtn, unsigned int num_points, unsigned int num_querys, double (*kernel)(double, double, double), double maxdist, int ncores) { 38 | unsigned int i, j; 39 | double d; 40 | #pragma omp parallel for num_threads(ncores) private(i, j, d) 41 | for (i=0; i(kwlist), &h, &arg1, &arg2, &arg3, &arg4, &arg5, &arg6, &kernel, &ncores)) return NULL; 111 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL; 112 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 113 | if ((arr3 = (PyArrayObject*)PyArray_FROM_OTF(arg3, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 114 | if ((arr4 = (PyArrayObject*)PyArray_FROM_OTF(arg4, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 115 | if ((arr5 = (PyArrayObject*)PyArray_FROM_OTF(arg5, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 116 | if ((arr6 = (PyArrayObject*)PyArray_FROM_OTF(arg6, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 117 | 118 | if (PyArray_NDIM(arr1) != 1 || PyArray_NDIM(arr2) != 1 || PyArray_NDIM(arr3) != 1 || 119 | PyArray_NDIM(arr4) != 1 || PyArray_NDIM(arr5) != 1 || PyArray_NDIM(arr6) != 1) 120 | { 121 | goto fail; 122 | } 123 | 124 | npts = PyArray_DIMS(arr1)[0]; 125 | nqrys = PyArray_DIMS(arr4)[0]; 126 | nqrys_npy = nqrys; 127 | 128 | oarr = (PyArrayObject*)PyArray_ZEROS(1, &nqrys_npy, NPY_DOUBLE, NPY_CORDER); 129 | 130 | x = (double *)PyArray_DATA(arr1); 131 | y = (double *)PyArray_DATA(arr2); 132 | z = (double *)PyArray_DATA(arr3); 133 | qx = (double *)PyArray_DATA(arr4); 134 | qy = (double *)PyArray_DATA(arr5); 135 | qz = (double *)PyArray_DATA(arr6); 136 | rtn = (double *)PyArray_DATA(oarr); 137 | 138 | maxdist_gauss = sqrt(2) * h * log((double)(1000000 * npts)); 139 | kde(h, x, y, z, qx, qy, qz, rtn, npts, nqrys, gauss_kernel, maxdist_gauss, ncores); 140 | 141 | Py_DECREF(arr1); 142 | Py_DECREF(arr2); 143 | Py_DECREF(arr3); 144 | Py_DECREF(arr4); 145 | Py_DECREF(arr5); 146 | Py_DECREF(arr6); 147 | 148 | return (PyObject *) oarr; 149 | 150 | fail: 151 | Py_XDECREF(arr1); 152 | Py_XDECREF(arr2); 153 | Py_XDECREF(arr3); 154 | Py_XDECREF(arr4); 155 | Py_XDECREF(arr5); 156 | Py_XDECREF(arr6); 157 | return NULL; 158 | } 159 | 160 | static PyObject *flood_fill(PyObject *self, PyObject *args, PyObject *kwargs) { 161 | PyObject *arg1 = NULL; 162 | PyObject *arg2 = NULL; 163 | PyObject* filled_poslist = NULL; 164 | 
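    /* flood_fill(pos, vf, r=0.6, min_pixels=10, max_pixels=2000): region
     * growing over the KDE vector field `vf`. Starting from the seed index
     * `pos`, 4-connected (2D) or 6-connected (3D) neighbors are accepted
     * while the Pearson correlation between the seed's gene-expression
     * vector and the neighbor's vector exceeds `r`. The visited positions
     * are returned as a list of tuples; the list is emptied when the grown
     * region ends up smaller than `min_pixels` or larger than `max_pixels`. */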
PyArrayObject *arr1 = NULL; 165 | PyArrayObject *arr2 = NULL; 166 | long nvec, nd, ngene = 0; 167 | long *pos, x, y, z, cnt; 168 | double r = 0.6, *vf; 169 | npy_intp *dimsp; 170 | int min_pixels = 10, max_pixels=2000; 171 | int i; 172 | bool *mask; 173 | 174 | static const char *kwlist[] = { "pos", "vf", "r", "min_pixels", "max_pixels", NULL }; 175 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|dii", const_cast(kwlist), &arg1, &arg2, &r, &min_pixels, &max_pixels)) return NULL; 176 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_LONG, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL; 177 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 178 | if (PyArray_NDIM(arr1) != 1) goto fail; 179 | nd = PyArray_NDIM(arr2); 180 | dimsp = PyArray_DIMS(arr2); 181 | nvec = 1; 182 | for (i=0; i queue2d; 193 | queue2d.push(pos2d()); 194 | queue2d.back().x = pos[0]; 195 | queue2d.back().y = pos[1]; 196 | while (queue2d.size() > 0) { 197 | x = queue2d.front().x; 198 | y = queue2d.front().y; 199 | PyObject *t = PyTuple_New(2); 200 | PyTuple_SetItem(t, 0, PyLong_FromLong(x)); 201 | PyTuple_SetItem(t, 1, PyLong_FromLong(y)); 202 | cnt += 1; 203 | if (cnt > max_pixels) 204 | break; 205 | PyList_Append(filled_poslist, t); 206 | queue2d.pop(); 207 | if (x < dimsp[0] - 1 && mask[I2D(x + 1, y, dimsp[1])] == false && 208 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene), 209 | vf + (I2D(x + 1, y, dimsp[1]) * ngene), ngene) > r) { 210 | mask[I2D(x + 1, y, dimsp[1])] = true; 211 | queue2d.push(pos2d()); 212 | queue2d.back().x = x + 1; 213 | queue2d.back().y = y; 214 | } 215 | if (x > 1 && mask[I2D(x - 1, y, dimsp[1])] == false && 216 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene), 217 | vf + (I2D(x - 1, y, dimsp[1]) * ngene), ngene) > r) { 218 | mask[I2D(x - 1, y, dimsp[1])] = true; 219 | queue2d.push(pos2d()); 220 | queue2d.back().x = x - 1; 221 | queue2d.back().y = y; 222 | } 223 | if (y < dimsp[1] - 1 && mask[I2D(x, y + 1, dimsp[1])] == false && 224 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene), 225 | vf + (I2D(x, y + 1, dimsp[1]) * ngene), ngene) > r) { 226 | mask[I2D(x, y + 1, dimsp[1])] = true; 227 | queue2d.push(pos2d()); 228 | queue2d.back().x = x; 229 | queue2d.back().y = y + 1; 230 | } 231 | if (y > 1 && mask[I2D(x, y - 1, dimsp[1])] == false && 232 | __corr__(vf + (I2D(pos[0], pos[1], dimsp[1]) * ngene), 233 | vf + (I2D(x, y - 1, dimsp[1]) * ngene), ngene) > r) { 234 | mask[I2D(x, y - 1, dimsp[1])] = true; 235 | queue2d.push(pos2d()); 236 | queue2d.back().x = x; 237 | queue2d.back().y = y - 1; 238 | } 239 | } 240 | } else if (nd == 4) { 241 | // 3D 242 | std::queue queue3d; 243 | queue3d.push(pos3d()); 244 | queue3d.back().x = pos[0]; 245 | queue3d.back().y = pos[1]; 246 | queue3d.back().z = pos[2]; 247 | while (queue3d.size() > 0) { 248 | x = queue3d.front().x; 249 | y = queue3d.front().y; 250 | z = queue3d.front().z; 251 | PyObject *t = PyTuple_New(3); 252 | PyTuple_SetItem(t, 0, PyLong_FromLong(x)); 253 | PyTuple_SetItem(t, 1, PyLong_FromLong(y)); 254 | PyTuple_SetItem(t, 2, PyLong_FromLong(z)); 255 | PyList_Append(filled_poslist, t); 256 | cnt += 1; 257 | if (cnt > max_pixels) 258 | break; 259 | queue3d.pop(); 260 | if (x < dimsp[0] - 1 && mask[I3D(x + 1, y, z, dimsp[1], dimsp[2])] == false && 261 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 262 | vf + I3D(x + 1, y, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 263 | mask[I3D(x + 1, y, z, dimsp[1], dimsp[2])] = true; 264 | 
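    // the accepted voxel was marked visited above; enqueue it so its own neighbors are examined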
queue3d.push(pos3d()); 265 | queue3d.back().x = x + 1; 266 | queue3d.back().y = y; 267 | queue3d.back().z = z; 268 | } 269 | if (x > 1 && mask[I3D(x - 1, y, z, dimsp[1], dimsp[2])] == false && 270 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 271 | vf + I3D(x - 1, y, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 272 | mask[I3D(x - 1, y, z, dimsp[1], dimsp[2])] = true; 273 | queue3d.push(pos3d()); 274 | queue3d.back().x = x - 1; 275 | queue3d.back().y = y; 276 | queue3d.back().z = z; 277 | } 278 | if (y < dimsp[1] - 1 && mask[I3D(x, y + 1, z, dimsp[1], dimsp[2])] == false && 279 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 280 | vf + I3D(x, y + 1, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 281 | mask[I3D(x, y + 1, z, dimsp[1], dimsp[2])] = true; 282 | queue3d.push(pos3d()); 283 | queue3d.back().x = x; 284 | queue3d.back().y = y + 1; 285 | queue3d.back().z = z; 286 | } 287 | if (y > 1 && mask[I3D(x, y - 1, z, dimsp[1], dimsp[2])] == false && 288 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 289 | vf + I3D(x, y - 1, z, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 290 | mask[I3D(x, y - 1, z, dimsp[1], dimsp[2])] = true; 291 | queue3d.push(pos3d()); 292 | queue3d.back().x = x; 293 | queue3d.back().y = y - 1; 294 | queue3d.back().z = z; 295 | } 296 | if (z < dimsp[2] - 1 && mask[I3D(x, y, z + 1, dimsp[1], dimsp[2])] == false && 297 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 298 | vf + I3D(x, y, z + 1, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 299 | mask[I3D(x, y, z, dimsp[1], dimsp[2])] = true; 300 | queue3d.push(pos3d()); 301 | queue3d.back().x = x; 302 | queue3d.back().y = y; 303 | queue3d.back().z = z + 1; 304 | } 305 | if (z > 1 && mask[I3D(x, y, z - 1, dimsp[1], dimsp[2])] == false && 306 | __corr__(vf + I3D(pos[0], pos[1], pos[2], dimsp[1], dimsp[2]) * ngene, 307 | vf + I3D(x, y, z - 1, dimsp[1], dimsp[2]) * ngene, ngene) > r) { 308 | mask[I3D(x, y, z - 1, dimsp[1], dimsp[2])] = true; 309 | queue3d.push(pos3d()); 310 | queue3d.back().x = x; 311 | queue3d.back().y = y; 312 | queue3d.back().z = z - 1; 313 | } 314 | } 315 | } 316 | free((void*)mask); 317 | Py_DECREF(arr1); 318 | Py_DECREF(arr2); 319 | if (cnt > max_pixels || cnt < min_pixels) 320 | PyList_SetSlice(filled_poslist, 0, PyList_Size(filled_poslist), NULL); 321 | return (PyObject *) filled_poslist; 322 | 323 | fail: 324 | Py_XDECREF(arr1); 325 | Py_XDECREF(arr2); 326 | return NULL; 327 | } 328 | 329 | static PyObject *calc_corrmap(PyObject *self, PyObject *args, PyObject *kwargs) { 330 | PyObject *arg1 = NULL; 331 | PyArrayObject *arr1 = NULL; 332 | PyArrayObject *oarr = NULL; 333 | long i, x, y, z, dx, dy, dz; 334 | long nvec, nd, ngene = 0; 335 | double *vecs, *corrmap; 336 | npy_intp *dimsp; 337 | int ncores = omp_get_max_threads(); 338 | int csize = 1; 339 | double *tmpvec; 340 | 341 | static const char *kwlist[] = { "vf", "ncores", "size", NULL }; 342 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|ii", const_cast(kwlist), &arg1, &ncores, &csize)) return NULL; 343 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL; 344 | nd = PyArray_NDIM(arr1); 345 | if (nd != 3 && nd != 4) goto fail; // only 2D or 3D array is expected 346 | dimsp = PyArray_DIMS(arr1); 347 | oarr = (PyArrayObject*)PyArray_ZEROS(nd - 1, dimsp, NPY_DOUBLE, NPY_CORDER); 348 | ngene = dimsp[nd-1]; 349 | corrmap = (double *)PyArray_DATA(oarr); 350 | vecs = (double *)PyArray_DATA(arr1); 351 | 
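    /* calc_corrmap: for each grid position, correlate its gene-expression
     * vector with the vectors of neighboring positions within `size` pixels
     * (default 1) and record the aggregate Pearson correlation in the output
     * map; high values mark locally coherent expression signal. */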
nvec = 1; 352 | for (i=0; i(kwlist), &arg1, &ncores, &csize)) return NULL; 435 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL; 436 | nd = PyArray_NDIM(arr1); 437 | if (nd != 3 && nd != 4) goto fail; // only 2D or 3D array is expected 438 | dimsp = PyArray_DIMS(arr1); 439 | for (i=0; i(kwlist), &arg1, &arg2, &ncores)) return NULL; 513 | if ((arr1 = (PyArrayObject*)PyArray_FROM_OTF(arg1, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) return NULL; 514 | if ((arr2 = (PyArrayObject*)PyArray_FROM_OTF(arg2, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY)) == NULL) goto fail; 515 | if (PyArray_NDIM(arr1) != 1) goto fail; 516 | nd = PyArray_NDIM(arr2); 517 | if((ngene = *PyArray_DIMS(arr1)) != PyArray_DIMS(arr2)[nd-1]) goto fail; 518 | 519 | dimsp = PyArray_DIMS(arr2); 520 | oarr = (PyArrayObject*)PyArray_ZEROS(nd - 1, dimsp, NPY_DOUBLE, NPY_CORDER); 521 | 522 | nvec = 1; 523 | for (i=0; i= 3 588 | static struct PyModuleDef moduledef = { 589 | PyModuleDef_HEAD_INIT, 590 | "analysis_utils", 591 | NULL, 592 | -1, 593 | module_methods 594 | }; 595 | #endif 596 | 597 | PyMODINIT_FUNC 598 | PyInit_utils(void) 599 | { 600 | #if PY_MAJOR_VERSION >= 3 601 | PyObject *module = PyModule_Create(&moduledef); 602 | #else 603 | Py_InitModule("utils", module_methods); 604 | #endif 605 | import_array(); 606 | #if PY_MAJOR_VERSION >= 3 607 | return module; 608 | #endif 609 | } -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SSAM 8 | SOURCEDIR = . 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # SSAM documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Nov 22 11:41:04 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('.')) 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 
28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.mathjax', 39 | 'sphinx.ext.ifconfig', 40 | 'sphinx.ext.viewcode', 41 | 'sphinx.ext.githubpages'] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = 'SSAM' 57 | copyright = '2018, Jeongbin Park' 58 | author = 'Jeongbin Park' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = '1.0.1' 66 | # The full version, including alpha/beta/rc tags. 67 | release = '1.0.1' 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = ['build', 'Thumbs.db', '.DS_Store'] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 85 | todo_include_todos = True 86 | 87 | 88 | # -- Options for HTML output ---------------------------------------------- 89 | 90 | # The theme to use for HTML and HTML Help pages. See the documentation for 91 | # a list of builtin themes. 92 | # 93 | html_theme = 'sphinx_rtd_theme' 94 | html_theme_path = ["_themes", ] 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | #html_static_path = ['static'] 106 | 107 | # Custom sidebar templates, must be a dictionary that maps document names 108 | # to template names. 109 | # 110 | # This is required for the alabaster theme 111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 112 | #html_sidebars = { 113 | # '**': [ 114 | # 'relations.html', # needs 'show_related': True theme option to display 115 | # 'searchbox.html', 116 | # ] 117 | #} 118 | 119 | 120 | # -- Options for HTMLHelp output ------------------------------------------ 121 | 122 | # Output file base name for HTML help builder. 
123 | htmlhelp_basename = 'SSAMdoc' 124 | 125 | 126 | # -- Options for LaTeX output --------------------------------------------- 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 130 | # 131 | # 'papersize': 'letterpaper', 132 | 133 | # The font size ('10pt', '11pt' or '12pt'). 134 | # 135 | # 'pointsize': '10pt', 136 | 137 | # Additional stuff for the LaTeX preamble. 138 | # 139 | # 'preamble': '', 140 | 141 | # Latex figure (float) alignment 142 | # 143 | # 'figure_align': 'htbp', 144 | } 145 | 146 | # Grouping the document tree into LaTeX files. List of tuples 147 | # (source start file, target name, title, 148 | # author, documentclass [howto, manual, or own class]). 149 | latex_documents = [ 150 | (master_doc, 'SSAM.tex', 'SSAM Documentation', 151 | 'Jeongbin Park', 'manual'), 152 | ] 153 | 154 | 155 | # -- Options for manual page output --------------------------------------- 156 | 157 | # One entry per manual page. List of tuples 158 | # (source start file, name, description, authors, manual section). 159 | man_pages = [ 160 | (master_doc, 'ssam', 'SSAM Documentation', 161 | [author], 1) 162 | ] 163 | 164 | 165 | # -- Options for Texinfo output ------------------------------------------- 166 | 167 | # Grouping the document tree into Texinfo files. List of tuples 168 | # (source start file, target name, title, author, 169 | # dir menu entry, description, category) 170 | texinfo_documents = [ 171 | (master_doc, 'SSAM', 'SSAM Documentation', 172 | author, 'SSAM', 'One line description of project.', 173 | 'Miscellaneous'), 174 | ] 175 | 176 | 177 | 178 | 179 | # Example configuration for intersphinx: refer to the Python standard library. 180 | intersphinx_mapping = {'https://docs.python.org/': None} 181 | 182 | autodoc_mock_imports = ["ssam.utils"] 183 | -------------------------------------------------------------------------------- /doc/images/de_novo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/de_novo.png -------------------------------------------------------------------------------- /doc/images/de_novo_celltype.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/de_novo_celltype.png -------------------------------------------------------------------------------- /doc/images/diagplot_centroid_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_2.png -------------------------------------------------------------------------------- /doc/images/diagplot_centroid_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_30.png -------------------------------------------------------------------------------- /doc/images/diagplot_centroid_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_5.png -------------------------------------------------------------------------------- /doc/images/diagplot_centroid_8.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/diagplot_centroid_8.png -------------------------------------------------------------------------------- /doc/images/domain_composition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domain_composition.png -------------------------------------------------------------------------------- /doc/images/domain_composition_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domain_composition_all.png -------------------------------------------------------------------------------- /doc/images/domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domains.png -------------------------------------------------------------------------------- /doc/images/domains_individual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/domains_individual.png -------------------------------------------------------------------------------- /doc/images/final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/final.png -------------------------------------------------------------------------------- /doc/images/guided.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/guided.png -------------------------------------------------------------------------------- /doc/images/kernel_bw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/kernel_bw.png -------------------------------------------------------------------------------- /doc/images/local_max_threshold_gene.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_gene.png -------------------------------------------------------------------------------- /doc/images/local_max_threshold_knn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn.png -------------------------------------------------------------------------------- /doc/images/local_max_threshold_knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn2.png -------------------------------------------------------------------------------- /doc/images/local_max_threshold_knn3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_knn3.png -------------------------------------------------------------------------------- /doc/images/local_max_threshold_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/local_max_threshold_total.png -------------------------------------------------------------------------------- /doc/images/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/mask.png -------------------------------------------------------------------------------- /doc/images/maxima.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/maxima.png -------------------------------------------------------------------------------- /doc/images/segmented_celltype_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/segmented_celltype_map.png -------------------------------------------------------------------------------- /doc/images/tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne.png -------------------------------------------------------------------------------- /doc/images/tsne_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_final.png -------------------------------------------------------------------------------- /doc/images/tsne_merged.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_merged.png -------------------------------------------------------------------------------- /doc/images/tsne_removed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HiDiHlabs/ssam/76615319930635210dbfd9eba1f20c1632c0a673/doc/images/tsne_removed.png -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | :caption: Contents 7 | 8 | userguide 9 | ssam 10 | 11 | 12 | Indices and tables 13 | ================== 14 | 15 | * :ref:`genindex` 16 | -------------------------------------------------------------------------------- /doc/ssam.rst: -------------------------------------------------------------------------------- 1 | Module contents 2 | --------------- 3 | 4 | .. 
automodule:: ssam 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/userguide.rst: -------------------------------------------------------------------------------- 1 | Spatial gene expression analysis with SSAM 2 | ------------------------------------------ 3 | 4 | .. toctree:: 5 | :glob: 6 | 7 | userguide/* 8 | -------------------------------------------------------------------------------- /doc/userguide/01-tldr.rst: -------------------------------------------------------------------------------- 1 | quick start / tldr page 2 | ======================= 3 | 4 | This tl;dr guide is for you if you already know what happens in a SSAM 5 | analysis or if you don’t care. 6 | 7 | For everyone else we recommend using the full 8 | `userguide <../userguide.md>`__. 9 | 10 | Installation 11 | ------------ 12 | 13 | Set up a ``conda`` environment: 14 | 15 | :: 16 | 17 | conda create -n ssam python=3.6 18 | conda activate ssam 19 | conda install gxx_linux-64 numpy pip R=3.6 pyarrow=0.15.1 20 | 21 | Do this in ``R``: 22 | 23 | :: 24 | 25 | install.packages("sctransform") 26 | install.packages("feather") 27 | 28 | Install SSAM via ``pip``: 29 | 30 | :: 31 | 32 | pip install ssam 33 | 34 | Data download 35 | ------------- 36 | 37 | :: 38 | 39 | curl "https://zenodo.org/record/3478502/files/supplemental_data_ssam_2019.zip?download=1" -o zenodo.zip 40 | unzip zenodo.zip 41 | 42 | Data preparation 43 | ---------------- 44 | 45 | All following steps in ``python``: 46 | 47 | :: 48 | 49 | import numpy as np 50 | import pandas as pd 51 | import matplotlib.pyplot as plt 52 | import ssam 53 | 54 | df = pd.read_csv( 55 | "zenodo/multiplexed_smFISH/raw_data/smFISH_MCT_CZI_Panel_0_spot_table.csv", 56 | usecols=['x', 'y', 'z', 'target']) 57 | 58 | um_per_pixel = 0.1 59 | 60 | df.x = (df.x - df.x.min()) * um_per_pixel + 10 61 | df.y = (df.y - df.y.min()) * um_per_pixel + 10 62 | df.z = (df.z - df.z.min()) * um_per_pixel + 10 63 | width = df.x.max() - df.x.min() + 10 64 | height = df.y.max() - df.y.min() + 10 65 | 66 | grouped = df.groupby('target').agg(list) 67 | genes = list(grouped.index) 68 | coord_list = [] 69 | for target, coords in grouped.iterrows(): 70 | coord_list.append(np.array(list(zip(*coords)))) 71 | 72 | Create SSAM dataset and vector field 73 | ------------------------------------ 74 | 75 | :: 76 | 77 | ds = ssam.SSAMDataset(genes, coord_list, width, height) 78 | analysis = ssam.SSAMAnalysis( 79 | ds, 80 | ncores=10, # used for kde step 81 | save_dir="kde/", 82 | verbose=True) 83 | 84 | analysis.run_kde(bandwidth=2.5, use_mmap=False) 85 | 86 | analysis.find_localmax( 87 | search_size=3, 88 | min_norm=0.2, 89 | min_expression=0.027 90 | ) 91 | 92 | analysis.normalize_vectors_sctransform() 93 | 94 | Creating the *de novo* cell map 95 | ------------------------------- 96 | 97 | :: 98 | 99 | analysis.cluster_vectors( 100 | min_cluster_size=0, 101 | pca_dims=22, 102 | resolution=0.15, 103 | metric='correlation') 104 | 105 | # post-filtering parameters for the cell-type map 106 | filter_method = "local" 107 | filter_params = { 108 | "block_size": 151, 109 | "method": "mean", 110 | "mode": "constant", 111 | "offset": 0.2 112 | } 113 | 114 | analysis.map_celltypes() 115 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50) 116 | 117 | .. figure:: ../images/de_novo.png 118 | :alt: Visualisation of cell type map.
119 | 120 | Visualisation of cell type map. 121 | 122 | Creating the tissue domain map 123 | ------------------------------ 124 | 125 | :: 126 | 127 | analysis.bin_celltypemaps(step=10, radius=100) 128 | analysis.find_domains(n_clusters=20, merge_remote=True, merge_thres=0.7, norm_thres=1500) 129 | 130 | plt.figure(figsize=[5, 5]) 131 | ds.plot_domains(rotate=1) 132 | 133 | .. figure:: ../images/domains.png 134 | :alt: Visualisation of final domain map exhibiting clearly separated 135 | domains. 136 | 137 | Visualisation of final domain map exhibiting clearly separated 138 | domains. 139 | -------------------------------------------------------------------------------- /doc/userguide/02-installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | A step-by-step guide 5 | -------------------- 6 | 7 | The easiest way to prepare a python environment for SSAM is using 8 | `conda `__. 9 | Keeping python projects in isolated environments prevents dependency 10 | version conflicts or conflicts with your OS installation of python, which 11 | usually depends on older versions incompatible with current scientific 12 | packages. 13 | 14 | Create your environment: 15 | 16 | :: 17 | 18 | conda create -n ssam python=3.6 19 | 20 | Remember to activate it before use: 21 | 22 | :: 23 | 24 | conda activate ssam 25 | 26 | Now we use conda to install some dependencies into our ssam environment: 27 | 28 | :: 29 | 30 | conda install gxx_linux-64=7.3.0 numpy=1.19.2 pip R=3.6 pyarrow=0.15.1 31 | 32 | Now we can install the R packages ``sctransform`` and ``feather``. Open 33 | R and type: 34 | 35 | :: 36 | 37 | install.packages("sctransform") 38 | install.packages("feather") 39 | 40 | Finally we switch to pip: 41 | 42 | .. raw:: html 43 | 44 | 49 | 50 | :: 51 | 52 | pip install git+https://github.com/HiDiHlabs/ssam.git 53 | 54 | Next we can download and prepare our `data `__. 55 | 56 | SSAM’s source code 57 | ------------------ 58 | 59 | In case you want to work with `SSAM’s source 60 | code `__, it is also hosted on GitHub. 61 | -------------------------------------------------------------------------------- /doc/userguide/03-data.rst: -------------------------------------------------------------------------------- 1 | Data Preparation 2 | ================ 3 | 4 | Download VISp data 5 | ------------------ 6 | 7 | In this tutorial we work with data of the murine primary visual cortex 8 | (VISp) profiled using multiplexed smFISH. Further details are available 9 | in the SSAM publication (Park et al. 2019). 10 | 11 | First, download the data and unpack it: 12 | 13 | :: 14 | 15 | curl "https://zenodo.org/record/3478502/files/supplemental_data_ssam_2019.zip?download=1" -o zenodo.zip 16 | unzip zenodo.zip 17 | 18 | Load data into python 19 | --------------------- 20 | 21 | Let’s start with loading our python packages: 22 | 23 | :: 24 | 25 | import numpy as np 26 | import pandas as pd 27 | import matplotlib.pyplot as plt 28 | import ssam 29 | 30 | Now we can load the mRNA spot table. Each row describes one mRNA spot 31 | and the columns contain its coordinates and target gene. We load the 32 | required columns into a dataframe: 33 | 34 | :: 35 | 36 | df = pd.read_csv( 37 | "zenodo/multiplexed_smFISH/raw_data/smFISH_MCT_CZI_Panel_0_spot_table.csv", 38 | usecols=['x', 'y', 'z', 'target']) 39 | 40 | If your dataset is organized differently, you will have to reshape it 41 | before continuing with the next steps.
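For instance, if your spot table uses different column names, a simple rename (the old names below are hypothetical) brings it into the expected shape:

::

    df = df.rename(columns={'x_um': 'x', 'y_um': 'y', 'z_um': 'z', 'gene': 'target'})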
Transform Data 42 | -------------- 43 | Because SSAM analysis is rooted in the cellular scale, we transform the 44 | coordinates from the laboratory system into micrometers, and tidy them up 45 | a bit: 46 | 47 | :: 48 | 49 | um_per_pixel = 0.1 50 | 51 | df.x = (df.x - df.x.min()) * um_per_pixel + 10 52 | df.y = (df.y - df.y.min()) * um_per_pixel + 10 53 | df.z = (df.z - df.z.min()) * um_per_pixel + 10 54 | 55 | Prepare data for SSAM 56 | --------------------- 57 | 58 | To create a ``SSAMDataset`` object we need to provide four arguments: 59 | a list of gene names profiled in the experiment (``genes``), a list of 60 | lists that contains the coordinates of each gene (``coord_list``), the 61 | ``width`` of the image, and the ``height`` of the image. 62 | 63 | The width and height are straightforward to infer from the dimensions of 64 | the image: 65 | 66 | :: 67 | 68 | width = df.x.max() - df.x.min() + 10 69 | height = df.y.max() - df.y.min() + 10 70 | 71 | We group the dataframe by gene and create the list of gene names: 72 | 73 | :: 74 | 75 | grouped = df.groupby('target').agg(list) 76 | genes = list(grouped.index) 77 | 78 | And finally the coordinate list: 79 | 80 | :: 81 | 82 | coord_list = [] 83 | for target, coords in grouped.iterrows(): 84 | coord_list.append(np.array(list(zip(*coords)))) 85 | 86 | Create the ``SSAMDataset`` object 87 | --------------------------------- 88 | 89 | With everything in place we can now instantiate the ``SSAMDataset`` 90 | object: 91 | 92 | :: 93 | 94 | ds = ssam.SSAMDataset(genes, coord_list, width, height) 95 | 96 | Now we can start the analysis with the `kernel density 97 | estimation `__ step. 98 | -------------------------------------------------------------------------------- /doc/userguide/04-kde.rst: -------------------------------------------------------------------------------- 1 | Creating the vector field 2 | ========================= 3 | 4 | After the data has been loaded, SSAM converts the discrete mRNA 5 | locations into mRNA density (that can be thought of as continuous “gene 6 | expression clouds” over the tissue) through application of `Kernel 7 | Density Estimation `__. 8 | 9 | KDE 10 | --- 11 | 12 | With our ``SSAMDataset`` object ``ds`` we can now initialize a 13 | ``SSAMAnalysis`` object ``analysis``. 14 | 15 | :: 16 | 17 | analysis = ssam.SSAMAnalysis( 18 | ds, 19 | ncores=10, # used for kde step 20 | save_dir="kde/", 21 | verbose=True) 22 | 23 | And calculate an mRNA density estimate with the ``run_kde`` method. 24 | Important considerations here are the `kernel 25 | function `__ and the `kernel 26 | bandwidth `__. By default, we recommend using a 27 | Gaussian kernel with a bandwidth of 2.5: 28 | 29 | :: 30 | 31 | analysis.run_kde(bandwidth=2.5, use_mmap=False) 32 | 33 | Masking 34 | ------- 35 | 36 | If you want to perform the analysis on `only a part of your sample you 37 | can use a mask `__. This can restrict what parts of the image 38 | are used for local maxima sampling (the ``input_mask``), or restrict the 39 | cell-type map generation of SSAM to certain regions (the 40 | ``output_mask``). While this is not required for analysis (in fact the 41 | SSAM paper did not apply masks to the osmFISH or MERFISH datasets), here 42 | we define a simple polygon as both the ``input_mask`` and 43 | ``output_mask`` for the VISp region.
44 | 45 | :: 46 | 47 | from matplotlib.path import Path 48 | # manual area annotation 49 | xy = np.array([[1535, 90], 50 | [ 795, 335], 51 | [ 135, 940], 52 | [ 835, 1995], 53 | [1465, 1695], 54 | [2010, 1215]]) 55 | 56 | # Extract coordinates from SSAMDataset 57 | x, y = np.meshgrid(np.arange(ds.vf.shape[0]), np.arange(ds.vf.shape[1])) 58 | x, y = x.flatten(), y.flatten() 59 | points = np.vstack((x,y)).T 60 | 61 | path = Path(xy) 62 | input_mask = path.contains_points(points) 63 | input_mask = input_mask.reshape((ds.vf.shape[1], ds.vf.shape[0], 1)).swapaxes(0, 1) 64 | output_mask = input_mask 65 | 66 | We recommend a visual inspection of the mask to make sure it aligns 67 | with the data as you expect it to: 68 | 69 | :: 70 | 71 | from matplotlib.patches import Polygon 72 | from matplotlib.collections import PatchCollection 73 | 74 | patch = Polygon(xy, True) 75 | p = PatchCollection([patch], alpha=0.4) 76 | 77 | plt.figure(figsize=[5, 5]) 78 | ds.plot_l1norm(rotate=1, cmap="Greys") 79 | plt.gca().add_collection(p) 80 | plt.axis('off') 81 | plt.savefig('images/mask.png') 82 | 83 | .. figure:: ../images/mask.png 84 | :alt: plot of the mRNA density superimposed with the mask 85 | 86 | plot of the mRNA density superimposed with the mask 87 | 88 | Local maxima search and normalization 89 | ------------------------------------- 90 | 91 | In order to reduce the computational burden, we recommend downsampling 92 | the image. While random sampling can be performed, we strongly encourage 93 | downsampling via local maxima selection, followed by `filtering based on 94 | individual and total gene expression `__. 95 | 96 | The local maxima are used to (i) determine the variance stabilisation 97 | parameters for the image, and (ii) determine 98 | `clusters `__ in `de novo analysis `__. In 99 | this section, we will use the local maxima for variance stabilisation. 100 | 101 | Here we apply the ``find_localmax`` function to find the local maxima of 102 | the mRNA density, using a per gene expression threshold of ``0.027`` and 103 | a total gene expression threshold of ``0.2``: 104 | 105 | :: 106 | 107 | analysis.find_localmax( 108 | search_size=3, 109 | min_norm=0.2, # the total gene expression threshold 110 | min_expression=0.027, # the per gene expression threshold 111 | mask=input_mask 112 | ) 113 | 114 | Visualization 115 | ------------- 116 | 117 | After the local maxima have been identified, they can be visualised. In 118 | cases when many local maxima originate from outside the tissue area a 119 | `k-NN density threshold can be used to filter “stray” local 120 | maxima `__, 121 | however in this example we use an input mask so it is not a problem. 122 | 123 | :: 124 | from matplotlib_scalebar.scalebar import ScaleBar # provides the ScaleBar used below 125 | plt.figure(figsize=[5, 5]) 126 | ds.plot_l1norm(cmap="Greys", rotate=1) 127 | ds.plot_localmax(c="Blue", rotate=1, s=0.1) 128 | 129 | patch = Polygon(xy, facecolor="black", edgecolor="red", linewidth=10, ls="-") 130 | p = PatchCollection([patch], alpha=0.4) 131 | plt.gca().add_collection(p) 132 | 133 | scalebar = ScaleBar(1, 'um') # 1 pixel = 1um 134 | plt.gca().add_artist(scalebar) 135 | plt.tight_layout() 136 | plt.axis('off') 137 | plt.show() 138 | 139 | .. figure:: ../images/maxima.png 140 | :alt: plot of found maxima superimposed with the mask 141 | 142 | plot of found maxima superimposed with the mask 143 | 144 | Normalization 145 | ------------- 146 | 147 | Once the local maxima have been identified, we can use them for 148 | calculating the variance stabilisation parameters using ``sctransform``.
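This normalization presumably relies on the same ``run_sctransform`` routine that we later apply to the scRNA-seq counts in the guided-mode section; conceptually, it boils down to something like the following sketch:

::

    # conceptual sketch only: variance-stabilise a vectors-by-genes matrix
    normalized_vectors = ssam.run_sctransform(vectors)[0]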
149 | If you receive an error here, make sure that you have installed the R 150 | packages in the `installation `__ step. 151 | 152 | This part of the analysis ends with the normalization of the mRNA 153 | density and the local-maximum vectors. 154 | 155 | :: 156 | 157 | analysis.normalize_vectors_sctransform() 158 | 159 | Now we are ready to continue with mapping the cell types in 160 | `guided `__ or `de novo mode `__. 161 | -------------------------------------------------------------------------------- /doc/userguide/05-kernel_shape.rst: -------------------------------------------------------------------------------- 1 | The shape of the kernel 2 | ======================= 3 | 4 | The shape of the kernel is defined by the `kernel 5 | function `__. The 6 | shape of the kernel determines how the mRNA signal is smoothed. 7 | 8 | We adopt the Gaussian kernel due to its popular use in 9 | signal processing, however other kernel functions can be used: we have 10 | had success in using semi-circle kernels when applied to `ISS data of 11 | the human pancreas `__, and 12 | the `Epanechnikov kernel `__ 13 | minimizes AMISE and has therefore been described as optimal. 14 | 15 | The following example shows how you can apply a semicircular kernel 16 | instead of a Gaussian. 17 | 18 | :: 19 | 20 | # code to change the shape of the kernel (@sebastiantiesmeyer) 21 | -------------------------------------------------------------------------------- /doc/userguide/06-kernel_bandwidth.rst: -------------------------------------------------------------------------------- 1 | Kernel bandwidth 2 | ================ 3 | 4 | The bandwidth of the kernel controls the amount of smoothing applied. 5 | With a low bandwidth, the smoothing is spread less. With a high bandwidth, 6 | the smoothing is spread more. 7 | 8 | The bandwidth should be set according to two factors. First, the maximum size 9 | of the bandwidth should not smooth the signals outside of cells: by 10 | default we choose a bandwidth of 2.5 um, as this has a FWTM (full width at 11 | tenth maximum) of ~10um, which is the average size of cells in the mouse SSp. 12 | This worked well for all examples in the SSAM paper. Second, the minimum size 13 | of the bandwidth should at least smooth signal to adjacent mRNA. From 14 | experience, this is not an issue for most ISH based techniques, but 15 | sequencing based techniques such as ISS can produce very sparse data and 16 | may require higher bandwidths to smooth signal sufficiently. 17 | 18 | Here is a close-up of the osmFISH mouse SSp dataset which investigates 19 | the effect of adjusting the kernel bandwidth. You can see that with a 20 | bandwidth of 1um the smoothing is sufficient, and with a bandwidth of 21 | 5um it is a little too much. The bandwidth of 2.5um appears to be a good 22 | balance of smoothing adjacent signal, while not smoothing into the 23 | adjacent area or losing sparse cell types. 24 | 25 | |image0| 26 | 27 | .. |image0| image:: ../images/kernel_bw.png 28 | -------------------------------------------------------------------------------- /doc/userguide/07-input_mask.rst: -------------------------------------------------------------------------------- 1 | Input masks 2 | =========== 3 | 4 | For some tissue images you may want to restrict analysis to certain 5 | parts of the image. For example, the image may have degradation towards 6 | the edges, you may wish to exclude non-tissue areas, or perhaps 7 | restrict SSAM analysis to previously segmented areas.
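Since a mask is ultimately just a boolean array over the vector field, a pre-existing segmentation can be turned into one directly. A minimal sketch, assuming ``seg`` is a hypothetical 2D boolean numpy array of shape ``(ds.vf.shape[0], ds.vf.shape[1])``:

::

    input_mask = seg[..., np.newaxis]  # add the trailing singleton axis SSAM expects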
8 | 9 | SSAM also accepts input masks that are defined as polygons. 10 | 11 | Example for the VISp smFISH dataset: 12 | 13 | :: 14 | 15 | from matplotlib.patches import Polygon 16 | from matplotlib.collections import PatchCollection 17 | 18 | plt.figure(figsize=[5, 5]) 19 | ds.plot_l1norm(cmap="Greys", rotate=1) 20 | ds.plot_localmax(c="Blue", rotate=1, s=0.1) 21 | 22 | patch = Polygon(xy, facecolor="black", edgecolor="red", linewidth=10, ls="-") 23 | p = PatchCollection([patch], alpha=0.4) 24 | plt.gca().add_collection(p) 25 | plt.show() 26 | 27 | |image0| 28 | 29 | After the desired region is selected, a ``mask`` can be created. In this 30 | case we define an ``input_mask`` and ``output_mask`` which restrict all 31 | data processing and reported output to the selected region. 32 | 33 | :: 34 | 35 | from matplotlib.path import Path 36 | 37 | x, y = np.meshgrid(np.arange(ds.vf.shape[0]), np.arange(ds.vf.shape[1])) 38 | x, y = x.flatten(), y.flatten() 39 | points = np.vstack((x,y)).T 40 | 41 | path = Path(xy) 42 | input_mask = path.contains_points(points) 43 | output_mask = input_mask = input_mask.reshape((ds.vf.shape[1], ds.vf.shape[0], 1)).swapaxes(0, 1) 44 | 45 | .. |image0| image:: ../images/mask.png 46 | -------------------------------------------------------------------------------- /doc/userguide/08-guided.rst: -------------------------------------------------------------------------------- 1 | SSAM *guided* analysis 2 | ====================== 3 | 4 | The main visual output of SSAM is the cell-type map, which is created 5 | by classifying pixels in the tissue image based on 6 | either predefined or calculated gene expression signatures. When the 7 | gene expression signatures are already known, one can use SSAM in 8 | *guided* mode. Whenever such previously known cell-type signatures are 9 | available, we highly recommend running *guided* mode analysis as a quality check. 10 | 11 | Single cell RNA sequencing data 12 | ------------------------------- 13 | 14 | We will use scRNA-seq data from `Tasic et al. 15 | 2018 `__ for the guided 16 | analysis. In the paper they identified “shared and distinct 17 | transcriptomic cell types across neocortical areas” in the mouse brain, 18 | also including the mouse VISp (which is our example).
19 | 20 | First we need to load the data: 21 | 22 | :: 23 | 24 | scrna_cl = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/cl.feather") 25 | scrna_cl_df = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/cl_df.feather") 26 | scrna_genes = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/genes.feather") 27 | scrna_counts = pd.read_feather("zenodo/multiplexed_smFISH/raw_data/scrna_data_tasic_2018/counts.feather") 28 | 29 | scrna_clusters = scrna_cl['cluster_id'] 30 | 31 | scrna_cl_dic = dict(zip(scrna_cl['cell_id'], scrna_cl['cluster_id'])) 32 | scrna_cl_metadata_dic = dict(zip( 33 | scrna_cl_df['cluster_id'], 34 | zip(scrna_cl_df['cluster_label'], 35 | scrna_cl_df['cluster_color'], ) 36 | )) 37 | 38 | qc_gene_indices = np.sum(scrna_counts > 0, axis=1) > 5 39 | scrna_genes_qc = np.array(scrna_genes)[qc_gene_indices] 40 | 41 | scrna_counts_qc = np.array(scrna_counts).T[:, qc_gene_indices] 42 | 43 | Normalisation 44 | ------------- 45 | 46 | Once the data is loaded, we will normalise it using ``run_sctransform``: 47 | 48 | :: 49 | 50 | scrna_data_normalized = np.array(ssam.run_sctransform(scrna_counts_qc)[0]) 51 | 52 | Cell-type gene expression signatures 53 | ------------------------------------ 54 | 55 | Once the data is normalised, we can calculate the average gene 56 | expression per cell type (the ``centroids``), which can then be used for 57 | classifying pixels in the image: 58 | 59 | :: 60 | 61 | selected_genes_idx = [list(scrna_genes_qc).index(g) for g in ds.genes] 62 | scrna_uniq_clusters = np.unique(scrna_clusters) 63 | scrna_centroids = [] 64 | for cl in scrna_uniq_clusters: 65 | scrna_centroids.append(np.mean(scrna_data_normalized[:, selected_genes_idx][scrna_clusters == cl], axis=0)) 66 | 67 | Generate a *guided* cell-type map 68 | --------------------------------- 69 | 70 | We can now continue to classify pixels in the tissue image using the 71 | cell-type gene expression signatures from the scRNA-seq data. 72 | 73 | We map the local maxima vectors to the most similar clusters in the 74 | scRNA-seq data, using a `correlation threshold for classifying 75 | pixels `__ of ``0.3``: 76 | 77 | :: 78 | 79 | analysis.map_celltypes(scrna_centroids) # map the scRNAseq cell type signatures to the tissue image 80 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.3, output_mask=output_mask) # post-filter cell-type map to remove spurious pixels 81 | 82 | plt.figure(figsize=[5, 5]) # initiate the plotting area 83 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False) # SSAM plotting function 84 | 85 | |image0| 86 | 87 | Despite the guided mode producing passable results, we highly recommend 88 | using the `de novo mode for more accurate analysis `__. 89 | 90 | .. |image0| image:: ../images/guided.png 91 | 92 | -------------------------------------------------------------------------------- /doc/userguide/09-celltype_map_thresh_g.rst: -------------------------------------------------------------------------------- 1 | Thresholding the guided cell-type map 2 | ===================================== 3 | 4 | After cell-type signatures are provided, the tissue image can be 5 | classified. The classification of each pixel is based on the Pearson 6 | correlation metric (although an `experimental adversarial autoencoder 7 | based classification method `__ can be applied).
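As a rough illustration of the idea behind this correlation-based classification (a sketch only, not SSAM's internal implementation), assume ``vf`` is a pixels-by-genes matrix and ``centroids`` a celltypes-by-genes signature matrix:

::

    import numpy as np

    def classify_pixels(vf, centroids, min_r=0.3):
        # z-score each row so that a scaled dot product equals the Pearson correlation
        vz = (vf - vf.mean(1, keepdims=True)) / (vf.std(1, keepdims=True) + 1e-12)
        cz = (centroids - centroids.mean(1, keepdims=True)) / (centroids.std(1, keepdims=True) + 1e-12)
        r = vz @ cz.T / vf.shape[1]                 # (n_pixels, n_celltypes) correlations
        best, best_r = r.argmax(1), r.max(1)
        return np.where(best_r >= min_r, best, -1)  # -1 marks unclassified pixels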
8 | 9 | We found that a minimum correlation threshold (``min_r``) of 0.3 worked 10 | well for guided mode based on single cell RNAseq cell-type signatures, 11 | and 0.6 worked well for *de novo* mode. 12 | 13 | Below we show how the cell-type map changes using correlation thresholds 14 | of ``0.15, 0.3, 0.45`` using the scRNAseq signatures: 15 | 16 | :: 17 | 18 | scrna_uniq_labels = [scrna_cl_metadata_dic[i][0] for i in scrna_uniq_clusters] 19 | scrna_colors = [scrna_cl_metadata_dic[i][1] for i in scrna_uniq_clusters] 20 | 21 | analysis.map_celltypes(scrna_centroids) 22 | 23 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.15, output_mask=output_mask) # post-filter cell-type map 24 | plt.figure(figsize=[5, 5]) 25 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False) 26 | 27 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.3, output_mask=output_mask) # post-filter cell-type map 28 | plt.figure(figsize=[5, 5]) 29 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False) 30 | 31 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.45, output_mask=output_mask) # post-filter cell-type map 32 | plt.figure(figsize=[5, 5]) 33 | ds.plot_celltypes_map(rotate=1, colors=scrna_colors, set_alpha=False) 34 | -------------------------------------------------------------------------------- /doc/userguide/10-de_novo.rst: -------------------------------------------------------------------------------- 1 | SSAM *de novo* analysis 2 | ======================= 3 | 4 | While we believe the `guided mode of SSAM `__ is able 5 | to generate good cell-type maps rapidly, the *de novo* mode provides much 6 | more accurate results. 7 | 8 | The steps of the *de novo* analysis are briefly discussed below, with 9 | links to more detailed discussion: 10 | 11 | - `setting cell-type map correlation 12 | threshold `__ 13 | - `visualisation of cell-type signatures: heatmap, tSNE, 14 | UMAP `__ 15 | 16 | Clustering of expression vectors 17 | -------------------------------- 18 | 19 | Once the local maxima have been selected and 20 | `filtered `__, we can perform `clustering 21 | analysis `__. SSAM supports `a number of clustering 22 | methods `__. Here we use the Louvain algorithm with 22 23 | principal components and a resolution of 0.15. 24 | 25 | :: 26 | 27 | analysis.cluster_vectors( 28 | min_cluster_size=0, 29 | pca_dims=22, 30 | resolution=0.15, 31 | metric='correlation') 32 | 33 | Cluster annotation and diagnostics 34 | ---------------------------------- 35 | 36 | SSAM provides `diagnostic plots `__ which can be used to 37 | evaluate the quality of clusters, and which `facilitate the annotation of 38 | clusters `__. 39 | 40 | Visualising the clusters 41 | ------------------------ 42 | 43 | SSAM supports `cluster visualisation via heatmaps, and 2D embedding 44 | (t-SNE and UMAP) `__. Here we give an example of the 45 | t-SNE plot: 46 | 47 | :: 48 | 49 | plt.figure(figsize=[5, 5]) 50 | ds.plot_tsne(pca_dims=22, metric="correlation", s=5, run_tsne=True) 51 | plt.savefig('images/tsne.png') 52 | 53 | .. figure:: ../images/tsne.png 54 | :alt: plot of t-SNE embedding of cell types 55 | 56 | plot of t-SNE embedding of cell types 57 | 58 | Cell type map 59 | ------------- 60 | 61 | Once the clusters have been evaluated for quality, we can generate the 62 | *de novo* cell-type map. This involves `classifying all the pixels in 63 | the tissue image based on a correlation 64 | threshold `__.
For the *de novo* application 65 | ``0.6`` was found to perform well: 66 | 67 | :: 68 | 69 | analysis.map_celltypes() 70 | 71 | filter_params = { 72 | "block_size": 151, 73 | "method": "mean", 74 | "mode": "constant", 75 | "offset": 0.2 76 | } 77 | 78 | analysis.filter_celltypemaps(min_norm="local", filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50, output_mask=output_mask) 79 | 80 | :: 81 | 82 | plt.figure(figsize=[5, 5]) 83 | ds.plot_celltypes_map(rotate=1, set_alpha=False) 84 | plt.axis('off') 85 | plt.savefig('images/de_novo.png') 86 | 87 | .. figure:: ../images/de_novo.png 88 | :alt: plot of the de novo generated celltype map 89 | 90 | plot of the de novo generated celltype map 91 | 92 | We can now use our celltype map to infer a map of `tissue 93 | domains `__. 94 | -------------------------------------------------------------------------------- /doc/userguide/11-max_filtering.rst: -------------------------------------------------------------------------------- 1 | Filtering local maxima 2 | ====================== 3 | 4 | As demonstrated in the `SSAM 5 | paper `__, local L1 6 | maxima selection is an effective way of downsampling the entire vector 7 | field for faster computation, and the selected maxima better represent known gene 8 | expression profiles compared to random downsampling. 9 | 10 | However, local maxima in the vector field can arise from undesirable 11 | locations, e.g. singleton mRNAs. In order to filter out less informative 12 | local maxima, we recommend applying thresholds for individual genes and 13 | for the total gene expression. 14 | 15 | 16 | 17 | Per gene expression threshold 18 | ----------------------------- 19 | 20 | The per gene threshold should be at least the height of a single 21 | Gaussian curve over an mRNA. This can easily be empirically determined 22 | by visual analysis. In this multiplexed smFISH example, the per gene 23 | expression threshold, ``exp_thres``, is set to 0.027: 24 | 25 | :: 26 | 27 | exp_thres = 0.027 28 | viewport = 0.1 29 | gindices = np.arange(len(ds.genes)) 30 | np.random.shuffle(gindices) 31 | plt.figure(figsize=[5, 7]) 32 | for i, gidx in enumerate(gindices[:6], start=1): 33 | ax = plt.subplot(5, 2, i) 34 | n, bins, patches = ax.hist(ds.vf[..., gidx][np.logical_and(ds.vf[..., gidx] > 0, ds.vf[..., gidx] < viewport)], bins=100, log=True, histtype=u'step') 35 | ax.set_xlim([0, viewport]) 36 | ax.set_ylim([n[0], n[-1]]) 37 | ax.axvline(exp_thres, c='red', ls='--') 38 | ax.set_title(ds.genes[gidx]) 39 | ax.set_xlabel("Expression") 40 | ax.set_ylabel("Count") 41 | plt.tight_layout() 42 | pass 43 | 44 | |image0| 45 | 46 | Total gene expression threshold 47 | ------------------------------- 48 | 49 | The total gene threshold should be empirically determined by examining the 50 | curve of total gene expression of local maxima. This isn’t always easy, 51 | and we highly encourage investigating this thoroughly.
52 | 53 | :: 54 | 55 | norm_thres = 0.2 56 | gidx = 0 57 | plt.figure(figsize=[5, 2]) 58 | #plt.hist(ds.vf[..., gidx][ds.vf[..., gidx] > 0], bins=100, log=True) 59 | n, _, _ = plt.hist(ds.vf_norm[np.logical_and(ds.vf_norm > 0, ds.vf_norm < 0.3)], bins=100, log=True, histtype='step') 60 | ax = plt.gca() 61 | ax.axvline(norm_thres, c='red', ls='--') 62 | ax.set_xlabel("L1-norm") 63 | ax.set_ylabel("Count") 64 | 65 | plt.xlim([0, 0.3]) 66 | plt.ylim([np.min(n), np.max(n) + 100000]) 67 | pass 68 | 69 | |image1| 70 | 71 | Filtering “stray” local maxima using k-nearest neighbour density 72 | ================================================================ 73 | 74 | If there is mRNA signal originating from outside the tissue area (due to 75 | background noise), it would improve downstream analysis to remove such 76 | vectors. We observed this in the osmFISH data. These “stray” local 77 | maxima tend to be less dense than local maxima from the tissue area: 78 | 79 | |image2| 80 | 81 | Because of this, they can be effectively filtered using their k-nearest 82 | neighbor density, in this example setting the ``threshold`` to 0.002. 83 | 84 | :: 85 | 86 | from sklearn.neighbors import KDTree 87 | X = np.array([ds.local_maxs[0], ds.local_maxs[1]]).T 88 | kdt = KDTree(X, leaf_size=30, metric='euclidean') 89 | rho = 100 / (np.pi * kdt.query(X, k=100)[0][:, 99] ** 2) 90 | 91 | threshold = 0.002 92 | 93 | plt.figure(figsize=[5, 2.5]) 94 | plt.hist(rho, bins=100, histtype='step') 95 | plt.axvline(x=threshold, color='r', linestyle='--') 96 | 97 | ax = plt.gca() 98 | ax.set_xlabel("Local KNN density") 99 | ax.set_ylabel("Count") 100 | pass 101 | 102 | |image3| 103 | 104 | … and a quick look at the before and after in the osmFISH dataset 105 | 106 | |image4| 107 | 108 | .. |image0| image:: ../images/local_max_threshold_gene.png 109 | .. |image1| image:: ../images/local_max_threshold_total.png 110 | .. |image2| image:: ../images/local_max_threshold_knn.png 111 | .. |image3| image:: ../images/local_max_threshold_knn2.png 112 | .. |image4| image:: ../images/local_max_threshold_knn3.png 113 | 114 | -------------------------------------------------------------------------------- /doc/userguide/12-clustering.rst: -------------------------------------------------------------------------------- 1 | Clustering Local L1 Maxima 2 | ========================== 3 | 4 | In the *de novo* mode analysis, after the local maxima have been 5 | identified from the tissue image, they are clustered. 6 | 7 | The default clustering algorithm is based on `Louvain community 8 | detection `__. 9 | SSAM also supports clustering using ``hdbscan`` and ``optics``. 10 | 11 | It can be initiated by: 12 | 13 | :: 14 | 15 | analysis.cluster_vectors(method="louvain", 16 | pca_dims=-1, 17 | min_cluster_size=2, 18 | max_correlation=1.0, 19 | metric="correlation", 20 | outlier_detection_method='medoid-correlation', 21 | outlier_detection_kwargs={}, 22 | random_state=0, 23 | **kwargs) 24 | 25 | … where: 26 | - ``method`` can be ``louvain``, ``hdbscan`` or ``optics``. 27 | - ``pca_dims`` is the number of principal components used for clustering. 28 | - ``min_cluster_size`` is the minimum cluster size. 29 | - ``resolution`` is the resolution for Louvain community detection. 30 | - ``prune`` is the threshold for the Jaccard index (the weight of the SNN network); if the index is smaller than ``prune``, it is set to zero. 31 | - ``snn_neighbors`` is the number of neighbors for the SNN network.
- ``max_correlation`` is the threshold above 32 | which clusters are merged: clusters whose mutual correlation is higher than this value will be combined. 33 | - ``metric`` is the metric for calculation of distance between vectors in 34 | gene expression space. 35 | - ``subclustering``: if set to True, each cluster will be clustered once again with the DBSCAN algorithm to find more 36 | subclusters. 37 | - ``dbscan_eps`` is the ``eps`` value for DBSCAN subclustering; not used when ``subclustering`` is set to False. 38 | - ``centroid_correction_threshold`` is the threshold for recalculating a centroid: the centroid 39 | is recomputed from the vectors whose correlation to the 40 | cluster medoid is equal to or higher than this value. 41 | - ``random_state`` is the random seed or scikit-learn’s random state object to replicate the 42 | same result. 43 | 44 | Removing outliers 45 | ----------------- 46 | 47 | The cell type signature is determined as the centroid of the cluster. 48 | This can be affected by outliers, so SSAM supports a number of outlier 49 | removal methods: 50 | 51 | :: 52 | 53 | analysis.remove_outliers(outlier_detection_method='medoid-correlation', outlier_detection_kwargs={}, normalize=True) 54 | 55 | … where: 56 | - ``outlier_detection_method`` can be ``medoid-correlation``, ``robust-covariance``, ``one-class-svm``, ``isolation-forest`` or 57 | ``local-outlier-factor``. 58 | - ``outlier_detection_kwargs`` are arguments passed to the outlier detection method. 59 | -------------------------------------------------------------------------------- /doc/userguide/13-diagnostic.rst: -------------------------------------------------------------------------------- 1 | Diagnostic plots 2 | ================ 3 | 4 | After unsupervised clustering of gene expression vectors, some clusters 5 | may need to be manually merged or discarded. SSAM supports merging of 6 | clusters based on correlation of gene expression profiles, however in 7 | many cases manual inspection is needed to rule out any non-trivial 8 | issues. 9 | 10 | To guide this process, SSAM generates a cluster-wise ‘diagnostic plot’, 11 | which consists of four panels: 1) location of the clustered vectors on 12 | the tissue image, 2) the pixels classified to belong to the cluster 13 | signature (the cluster centroid), 3) the mean expression profile of the 14 | clustered vectors, and 4) the t-SNE or UMAP embedding. 15 | 16 | In the three datasets analyzed, the clusters to be merged or removed 17 | often showed a discordance between the location of sampled vectors used 18 | to determine the cluster (panel 1) and the pixels classified to belong 19 | to that cluster (panel 2). In case of overclustering, i.e. when a 20 | cell-type signature is split over two clusters, the map typically does not 21 | classify the full shape of the cells but instead only fragments (panel 22 | 2), while having almost the same marker gene expression as another cluster 23 | (panel 3). Such clusters can be merged. 24 | 25 | For dubious clusters that should be removed, we observed that vectors 26 | usually originate from outside the tissue region or from image artifacts 27 | (panel 1), or that the gene expression does not show any clear 28 | expression of marker genes or similarity to expected gene expression 29 | profiles (panel 3). 30 | 31 | The remaining clusters are then annotated by comparing cluster marker 32 | genes to known cell-type markers, for example as sketched below.
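As a quick, hypothetical illustration of such a marker-based comparison (assuming the normalized centroids rank marker genes highly):

::

    import numpy as np

    # print the five highest-expressed genes of each cluster centroid
    for idx, centroid in enumerate(ds.centroids):
        top = np.argsort(centroid)[::-1][:5]
        print(idx, [ds.genes[g] for g in top])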
Note that in many cases, the identity 33 | of clusters can be easily assigned by comparing the centroids of the 34 | clusters to the known cell-type signatures, e.g., from single cell RNA 35 | sequencing. 36 | 37 | To support rapid annotation of cell types to clusters, SSAM additionally 38 | shows the highest correlating known cell-type signature in panel 3, should 39 | this data be available. 40 | 41 | Example 1: a large cluster that can be easily annotated 42 | ------------------------------------------------------- 43 | 44 | Local maxima (panel 1) correspond to the same area (panel 2), and 45 | match known gene expression patterns of *Vip Arhgap36 Hmcn1* cell 46 | types from scRNAseq experiments with high correlation (panel 3). 47 | 48 | |image0| 49 | 50 | Example 2: a large cluster that cannot be easily annotated 51 | ---------------------------------------------------------- 52 | 53 | Local maxima (panel 1) correspond to the same area (panel 2). The gene 54 | expression profile has a good correlation to *L2/3 IT VISp Adamts2* cell 55 | types, but lacks the very high expression of *Pde1a*. In this 56 | particular case, one would need to check other clusters matching this 57 | cell type and perhaps merge them, or perhaps this indicates low 58 | efficiency of the *Pde1a* probe in the experiment. 59 | 60 | |image1| 61 | 62 | Example 3: a small cluster that is good 63 | --------------------------------------- 64 | 65 | Despite only 2 local maxima (panel 1), the classified pixels correspond 66 | to the same area (panel 2), and match known gene expression patterns 67 | (panel 3). This represents the very rare Sst Chodl cell type. 68 | 69 | |image2| 70 | 71 | Example 4: a small cluster that is questionable 72 | ----------------------------------------------- 73 | 74 | Sampled local maxima (panel 1) do not correspond to the classified pixels 75 | (panel 2), and do not clearly match known gene expression patterns 76 | (panel 3). 77 | 78 | |image3| 79 | 80 | .. |image0| image:: ../images/diagplot_centroid_2.png 81 | .. |image1| image:: ../images/diagplot_centroid_5.png 82 | .. |image2| image:: ../images/diagplot_centroid_30.png 83 | .. |image3| image:: ../images/diagplot_centroid_8.png 84 | 85 | -------------------------------------------------------------------------------- /doc/userguide/14-cluster_annotation.rst: -------------------------------------------------------------------------------- 1 | Cluster annotation 2 | ================== 3 | 4 | In a typical single cell RNAseq experiment, the process of annotating 5 | cell types manually can be laborious and as such, `a number of automated 6 | methods have emerged `__. 7 | 8 | In a typical *in situ* transcriptomics experiment, the annotation of 9 | cell types is usually much easier as these assays usually profile 10 | established cell-type markers. Clusters can be annotated easily based on 11 | marker gene expression.
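One quick way to obtain candidate labels is to correlate every *de novo* centroid against a reference. A sketch, assuming the ``scrna_centroids`` and ``scrna_uniq_labels`` objects from the guided-mode section are available:

::

    import numpy as np

    denovo = np.asarray(ds.centroids)   # (n_clusters, n_genes)
    ref = np.asarray(scrna_centroids)   # (n_ref, n_genes)
    dz = (denovo - denovo.mean(1, keepdims=True)) / denovo.std(1, keepdims=True)
    rz = (ref - ref.mean(1, keepdims=True)) / ref.std(1, keepdims=True)
    corr = dz @ rz.T / denovo.shape[1]  # pairwise Pearson correlations
    for i, j in enumerate(corr.argmax(1)):
        print(i, scrna_uniq_labels[j], round(corr[i, j], 2))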
12 | 13 | The `diagnostic plots `__ can be used to compare existing 14 | signatures against those identified *de novo*: 15 | 16 | :: 17 | 18 | from scipy.stats import pearsonr, spearmanr 19 | 20 | for idx in range(len(ds.centroids)): 21 | plt.figure(figsize=[50, 15]) 22 | ds.plot_diagnostic_plot(idx, known_signatures=[ 23 | ("scRNA-seq", scrna_uniq_labels, scrna_centroids, scrna_colors), 24 | ], correlation_methods=[ 25 | ("r", pearsonr), 26 | ("rho", spearmanr) 27 | ]) 28 | plt.tight_layout() 29 | plt.savefig('diagplots_multiplexed_smFISH/diagplot_centroid_%d.png'%idx) 30 | plt.close() 31 | 32 | This will generate a diagnostic plot for each cluster, which can be used 33 | to assign cluster labels. E.g. the following cluster matches known gene 34 | expression patterns of Vip Arhgap36 Hmcn1 cell types from scRNAseq 35 | experiments with high correlation (panel 3): 36 | 37 | |image0| 38 | 39 | While this is a good example of a cluster that can be easily annotated, 40 | some clusters may represent noise and would need to be removed, and 41 | when overclustering occurs, clusters may have to be merged. The 42 | `diagnostic plots documentation `__ assists the decision 43 | making process. 44 | 45 | Once each cluster is reviewed, a cell type can be assigned, or the cluster 46 | removed or merged. In the following code snippet, we show an elegant way to 47 | annotate, remove, and merge clusters. 48 | 49 | 1) Determine that (i) clusters with a name will be annotated, (ii) 50 | clusters with a “N/A” will be removed, (iii) clusters with the same 51 | name will be merged: 52 | 53 | :: 54 | 55 | denovo_labels = [ 56 | "N/A", 57 | "VLMC", 58 | "Vip Arhgap36 Hmcn1 / Vip Igfbp4 Map21l1", 59 | "L2/3 IT Rrad", 60 | "N/A", 61 | "L2/3 IT Adamts2", 62 | "Sst Nts / Sst Rxfp1 Eya1", 63 | "Lamp5 Lsp1", 64 | "N/A", 65 | "Sst Crhr2 Efemp1 / Sst Esm1", 66 | 67 | "Pvalb Calb1 Sst / Pvalb Reln Tac1", 68 | "Astro Aqp4", 69 | "L6 IT Penk Fst", 70 | "L4 IT Superficial", 71 | "L5 IT Col27a1", 72 | "L2/3 IT Adamts2", 73 | "OPC", 74 | "Oligo", 75 | "L4 IT Rspo1", 76 | "L5 NP Trhr Met", 77 | 78 | "L5 IT Hsd11b1 Endou", 79 | "Pvalb Th Sst / Pvalb Reln Tac1", 80 | "L6 CT Ctxn3 Brinp3 / L6 CT Gpr139", 81 | "L5 PT Chrna6", 82 | "L5 IT Batf3", 83 | "L5 PT C1ql2 Cdh13", 84 | "L5 PT Krt80", 85 | "L6 IT Penk Col27a1", 86 | "L6 IT Penk Col27a1", 87 | "L6b Crh", 88 | 89 | "Sst Chodl", 90 | ] 91 | 92 | 2) Make objects for storing the indices of clusters to be annotated, 93 | removed and merged: 94 | 95 | :: 96 | 97 | denovo_labels_final = [] 98 | exclude_indices = [] 99 | merge_indices = [] 100 | 101 | 3) Iterate over the ``denovo_labels`` object and populate the 102 | ``denovo_labels_final``, ``exclude_indices``, ``merge_indices`` 103 | objects: 104 | 105 | :: 106 | 107 | for idx, cl in enumerate(denovo_labels): 108 | if cl == 'N/A': 109 | exclude_indices.append(idx) 110 | continue 111 | if cl in denovo_labels_final: 112 | continue 113 | denovo_labels_final.append(cl) 114 | 115 | for cl in np.unique(denovo_labels): 116 | if cl == 'N/A': 117 | continue 118 | mask = [cl == e for e in denovo_labels] 119 | if np.sum(mask) > 1: 120 | merge_indices.append(np.where(mask)[0]) 121 | 122 | 4) Plot the removed clusters in the t-SNE embedding: 123 | 124 | :: 125 | 126 | cmap = plt.get_cmap('jet') 127 | jet_colors = cmap(np.array(list(range(len(ds.centroids)))) / (len(ds.centroids) - 1)) 128 | tsne_colors = np.zeros_like(jet_colors) 129 | tsne_colors[..., :] = [0.8, 0.8, 0.8, 1] 130 | tsne_colors[exclude_indices] = [0, 0, 0, 1] #jet_colors[exclude_indices] 131 | import matplotlib.patheffects as PathEffects
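    # tsne_colors holds one RGBA row per cluster: every row defaults to light
    # grey, and only the clusters flagged for removal are painted black so
    # that they stand out in the embedding plotted below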
132 | plt.figure(figsize=[5, 5]) 133 | ds.plot_tsne(pca_dims=33, metric="correlation", s=5, run_tsne=False, colors=tsne_colors) 134 | plt.axis('off') 135 | 136 | |image1| 137 | 138 | 5) Plot the merged clusters in the t-SNE embedding: 139 | 140 | :: 141 | 142 | cmap = plt.get_cmap('rainbow') 143 | jet_colors = cmap(np.array(list(range(len(merge_indices)))) / (len(merge_indices) - 1)) 144 | plt.figure(figsize=[5, 5]) 145 | tsne_colors = np.zeros([len(ds.centroids), 4]) 146 | tsne_colors[..., :] = [0.8, 0.8, 0.8, 1] 147 | for idx, mi in enumerate(merge_indices): 148 | tsne_colors[mi] = jet_colors[idx] 149 | ds.plot_tsne(pca_dims=33, metric="correlation", s=5, run_tsne=False, colors=tsne_colors) 150 | plt.axis('off') 151 | 152 | |image2| 153 | 154 | 6) Update the ``analysis`` object with the clusters to remove and merge: 155 | 156 | :: 157 | 158 | analysis.exclude_and_merge_clusters(exclude_indices, merge_indices, centroid_correction_threshold=0.6) 159 | 160 | .. |image0| image:: ../images/diagplot_centroid_2.png 161 | .. |image1| image:: ../images/tsne_removed.png 162 | .. |image2| image:: ../images/tsne_merged.png 163 | 164 | -------------------------------------------------------------------------------- /doc/userguide/15-celltype_map_thresh_d.rst: -------------------------------------------------------------------------------- 1 | Thresholding the de-novo cell-type map 2 | ====================================== 3 | 4 | After cell-type signatures are calculated, the tissue image can be 5 | classified. The classification of each pixel is based on the Pearson 6 | correlation metric (although an `experimental adversarial autoencoder 7 | based classification method `__ can be applied). 8 | 9 | We found that a minimum correlation threshold (``min_r``) of 0.3 worked 10 | well for guided mode based on single cell RNAseq cell-type signatures, 11 | and 0.6 worked well for *de novo* mode. 12 | 13 | Below we show how the cell-type map changes using correlation thresholds 14 | of ``0.4, 0.6, 0.8`` for the *de novo* cell-type map. 15 | 16 | :: 17 | 18 | analysis.map_celltypes() 19 | 20 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.4, fill_blobs=True, min_blob_area=50, output_mask=output_mask) 21 | plt.figure(figsize=[5, 5]) 22 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False) 23 | 24 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.6, fill_blobs=True, min_blob_area=50, output_mask=output_mask) 25 | plt.figure(figsize=[5, 5]) 26 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False) 27 | 28 | analysis.filter_celltypemaps(min_norm=filter_method, filter_params=filter_params, min_r=0.8, fill_blobs=True, min_blob_area=50, output_mask=output_mask) 29 | plt.figure(figsize=[5, 5]) 30 | ds.plot_celltypes_map(colors=denovo_celltype_colors, rotate=1, set_alpha=False) 31 | -------------------------------------------------------------------------------- /doc/userguide/16-visualisation.rst: -------------------------------------------------------------------------------- 1 | Visualisation of 2D gene expression embeddings (t-SNE and UMAP) 2 | =============================================================== 3 | 4 | An important part of presenting the summary of the clustering analysis 5 | is 2D visualisation via embedding.
-------------------------------------------------------------------------------- /doc/userguide/16-visualisation.rst: --------------------------------------------------------------------------------
1 | Visualisation of 2D gene expression embeddings (t-SNE and UMAP)
2 | ===============================================================
3 | 
4 | An important part of presenting the summary of the clustering analysis
5 | is 2D visualisation via embedding.
6 | 
7 | `UMAP `__ and
8 | `t-SNE `__
9 | are two common dimensionality reduction methods that can be useful for
10 | displaying clustering results.
11 | 
12 | Running t-SNE
13 | -------------
14 | 
15 | To run t-SNE on the ``ds`` object:
16 | ``ds.run_tsne(pca_dims=-1, n_iter=5000, perplexity=70, early_exaggeration=10, metric="correlation", exclude_bad_clusters=True, random_state=0, tsne_kwargs={})``
17 | 
18 | where:
19 | 
20 | - ``pca_dims``: Number of PCA dimensions used for the tSNE embedding.
21 | - ``n_iter``: Maximum number of iterations for the tSNE.
22 | - ``perplexity``: The perplexity value of the tSNE (please refer to the
23 |   section `How should I set the perplexity in
24 |   t-SNE? `__ ).
25 | - ``early_exaggeration``: Early exaggeration parameter for tSNE.
26 |   Controls the tightness of the resulting tSNE plot.
27 | - ``metric``: Metric for calculation of distance between vectors in
28 |   gene expression space.
29 | - ``exclude_bad_clusters``: If true, the vectors that are excluded by
30 |   the clustering algorithm will not be considered for tSNE computation.
31 | - ``random_state``: Random seed or scikit-learn’s random state object
32 |   to replicate the same result.
33 | - ``tsne_kwargs``: Other keyword parameters for tSNE.
34 | 
35 | Running UMAP
36 | ------------
37 | 
38 | To run UMAP on the ``ds`` object:
39 | ``ds.run_umap(pca_dims=-1, metric="correlation", min_dist=0.8, exclude_bad_clusters=True, random_state=0, umap_kwargs={})``
40 | 
41 | where:
42 | 
43 | - ``pca_dims``: Number of PCA dimensions used for the UMAP embedding.
44 | - ``metric``: Metric for calculation of distance between vectors in
45 |   gene expression space.
46 | - ``min_dist``: ‘min_dist’ parameter for UMAP.
47 | - ``exclude_bad_clusters``: If true, the vectors that are excluded by
48 |   the clustering algorithm will not be considered for UMAP computation.
49 | - ``random_state``: Random seed or scikit-learn’s random state object
50 |   to replicate the same result.
51 | - ``umap_kwargs``: Other keyword parameters for UMAP.
52 | 
53 | Plotting embeddings
54 | -------------------
55 | 
56 | Plotting of the t-SNE and UMAP embeddings can be performed by:
57 | 
58 | ::
59 | 
60 |     ds.plot_umap()
61 |     ds.plot_tsne()
62 | 
63 | |image0|
64 | 
65 | .. |image0| image:: ../images/tsne_final.png
66 | 
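The plotting functions can also compute the embedding on the fly via their ``run_tsne`` / ``run_umap`` flags, so a compact end-to-end sketch for this page looks like the following (parameters as documented above; adjust them to your data):

::

    plt.figure(figsize=[5, 5])
    ds.plot_tsne(run_tsne=True, pca_dims=10, metric="correlation", s=5)

    plt.figure(figsize=[5, 5])
    ds.plot_umap(run_umap=True, pca_dims=10, metric="correlation", s=5)

Subsequent calls with ``run_tsne=False`` / ``run_umap=False`` reuse the stored embedding instead of recomputing it.
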
The following 21 | performs this sampling using a circular window of 100um, with 10um 22 | steps: 23 | 24 | :: 25 | 26 | analysis.bin_celltypemaps(step=10, radius=100) 27 | 28 | Clustering domain signatures 29 | ---------------------------- 30 | 31 | After performing the sampling, we continue with identifying domain 32 | signatures through clustering. This is based on agglomerative clustering 33 | to identify the initial clusters (``n_clusters``) of windows which 34 | include a minimum number of classified pixels (``norm_thres``), followed 35 | cluster merging when the correlation between clusters exceeds a 36 | threshold (``merge_thres``). The merging of clusters can be restricted 37 | to adjacent clusters (``merge_remote=FALSE``), or not restricted to 38 | spatial proximity (``merge_remote=True``) 39 | 40 | :: 41 | 42 | analysis.find_domains(n_clusters=20, merge_remote=True, merge_thres=0.7, norm_thres=1500) 43 | 44 | Visualizing identified domains 45 | ------------------------------ 46 | 47 | Once the domains have been indentified, they have to be visualised for 48 | evaluation. 49 | 50 | :: 51 | 52 | from matplotlib.colors import ListedColormap 53 | cmap_jet = plt.get_cmap('jet') 54 | num_domains = np.max(ds.inferred_domains_cells) + 1 55 | 56 | fig, axs = plt.subplots(1, num_domains, figsize=(4*num_domains, 4)) 57 | for domain_idx in range(num_domains): 58 | ax = axs[domain_idx] 59 | plt.sca(ax) 60 | plt.axis('off') 61 | cmap = ListedColormap([cmap_jet(lbl_idx / num_domains) if domain_idx == lbl_idx else "#cccccc" for lbl_idx in range(num_domains)]) 62 | ds.plot_domains(rotate=1, cmap=cmap) 63 | plt.tight_layout() 64 | plt.savefig(f'plots/domains_individual') 65 | 66 | .. figure:: ../images/domains_individual.png 67 | :alt: side by side plot of all tissue domains 68 | 69 | side by side plot of all tissue domains 70 | 71 | Post-processing the identified domains 72 | -------------------------------------- 73 | 74 | In certain cases, one may wish to **exclude certain domains** 75 | (``excluded_domain_indices``) as they may originate from tissue 76 | artifacts or contain no information. In our case the third domain (0 77 | based index 2) seems to be an artifact and the fourth one contains no 78 | useful information. The First two domains are obviously part of the same 79 | layer and can therefore be merged. 80 | 81 | Due to possible imaging artifacts such as tiling, some domains might be 82 | split. While it is still possible to tune the ``merge_thres`` in the 83 | clustering step, one can simply perform this as manual post processing. 84 | In the case above, there do not appear to be any domains that require 85 | merging. 86 | 87 | Once the domains to be excluded or merged have been determined, they can 88 | be excluded and removed(!): 89 | 90 | :: 91 | 92 | excluded_domain_indices = [2,3,7,10] 93 | merged_domain_indices = [[0,1],[9,11]] 94 | analysis.exclude_and_merge_domains(excluded_domain_indices, merged_domain_indices) 95 | 96 | The final plot 97 | -------------- 98 | 99 | The individual domains represent the established neocortex layering 100 | patterns found in the mouse brain. We can continue with assigning domain 101 | colours, names, and plotting all of the domains together. 102 | 103 | :: 104 | 105 | plt.figure(figsize=[5, 5]) 106 | ds.plot_domains(rotate=1) 107 | 108 | |image0| 109 | 110 | .. 
-------------------------------------------------------------------------------- /doc/userguide/18-composition.rst: --------------------------------------------------------------------------------
1 | Cell-type composition analysis in tissue domains
2 | ================================================
3 | 
4 | After identifying `tissue domains `__ that exhibit specific
5 | cell-type composition properties, it may be desirable to report the
6 | cell-type composition of the identified domains.
7 | 
8 | In the `SSAM
9 | manuscript `__ we used
10 | this functionality to show that the representation of astrocytes in
11 | the neocortex layers was previously under-reported, and to identify the
12 | cell-type composition of novel layering patterns in the primary visual
13 | cortex (VISp).
14 | 
15 | Performing the cell-type composition analysis
16 | ---------------------------------------------
17 | 
18 | The analysis is initiated on the ``analysis`` object:
19 | 
20 | ::
21 | 
22 |     analysis.calc_cell_type_compositions()
23 | 
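The plotting calls below assume two variables from the earlier *de novo* analysis: ``denovo_celltype_colors`` (one colour per cell type) and ``heatmap_clusters_index`` (the display order of the cell types). If they are no longer in scope, a rough sketch of stand-ins follows; these are our own helper variables, not SSAM API (``plot_celltype_composition`` falls back to a jet colormap anyway when no colours are given, so this just makes that default explicit):

::

    n_celltypes = len(ds.centroids)
    cmap = plt.get_cmap('jet')
    denovo_celltype_colors = cmap(np.arange(n_celltypes) / (n_celltypes - 1))
    heatmap_clusters_index = np.arange(n_celltypes)  # i.e. no reordering
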
24 | Plotting the composition of each domain
25 | ---------------------------------------
26 | 
27 | Once this has completed, you can plot the cell-type composition of the
28 | different layers using the plot function. In the following example, we
29 | plot the 7 identified layers (``domain_index = 0-6``) in the order that
30 | they would appear in the neocortex:
31 | 
32 | ::
33 | 
34 |     # note - this could be wrapped up into a function
35 |     for domain_idx in [1, 0, 2, 3, 4, 5, 6]:
36 |         plt.figure(figsize=[5, 5])
37 |         ds.plot_celltype_composition(domain_idx,
38 |             cell_type_colors=denovo_celltype_colors,
39 |             cell_type_orders=heatmap_clusters_index[::-1],
40 |             label_cutoff=0.03)
41 |         plt.title(domain_labels[domain_idx])
42 | 
43 | |image0|
44 | 
45 | Plotting the composition of the entire tissue
46 | ---------------------------------------------
47 | 
48 | It is worthwhile to compare the cell-type composition within each
49 | domain to that observed over the entire tissue. The
50 | cell-type composition over the entire tissue is stored as the last
51 | domain, in this case the 8th element (``domain_index = 7``):
52 | 
53 | ::
54 | 
55 |     # note - this can be wrapped up into a function
56 |     plt.figure(figsize=[5, 5])
57 |     ds.plot_celltype_composition(domain_index=7,
58 |         cell_type_colors=denovo_celltype_colors,
59 |         cell_type_orders=heatmap_clusters_index[::-1],
60 |         label_cutoff=0.03)
61 |     plt.title('All')
62 | 
63 | |image1|
64 | 
65 | .. |image0| image:: ../images/domain_composition.png
66 | .. |image1| image:: ../images/domain_composition_all.png
67 | 
-------------------------------------------------------------------------------- /doc/userguide/19-experimental.rst: --------------------------------------------------------------------------------
1 | Experimental features
2 | =====================
3 | 
4 | We will endeavour to improve the functionality of SSAM by implementing
5 | novel features. So far, these experimental features only work with the ``develop`` branch of SSAM.
6 | 
7 | The current novel features supported by SSAM include:
8 | 
9 | - `Adversarial Auto Encoder based classification `__
10 | 
11 | - `Segmenting the cell-type map `__
-------------------------------------------------------------------------------- /doc/userguide/20-aaec.rst: --------------------------------------------------------------------------------
1 | Cell-type classification using Adversarial Autoencoders
2 | =======================================================
3 | 
4 | The default classification algorithm is based on Pearson correlation, as
5 | this has been `shown to be effective for automatic classification of
6 | cell types `__ in single-cell
7 | RNAseq experiments. This proved to be both highly performant and
8 | accurate for spatial gene expression data as well. However, it may be
9 | desirable to explore other classification methods.
10 | 
11 | One recent and exciting deep learning framework that achieves competitive
12 | results in generative modeling and semi-supervised classification tasks
13 | is the `adversarial autoencoder `__.
14 | 
15 | SSAM implements a modified version of an adversarial autoencoder classifier
16 | based on the `original
17 | implementation `__
18 | by `Shahar Azulay `__.
19 | 
20 | Mapping cell types using an adversarial autoencoder
21 | ---------------------------------------------------
22 | 
23 | In order to use the AAEC classification of pixels instead of the Pearson
24 | correlation based method, simply replace ``analysis.map_celltypes()``
25 | with:
26 | 
27 | ::
28 | 
29 |     analysis.map_celltypes_aaec(epochs=1000, seed=0, batch_size=1000, chunk_size=100000, z_dim=10, noise=0)
30 | 
-------------------------------------------------------------------------------- /doc/userguide/21-segment_celltype_map.rst: --------------------------------------------------------------------------------
1 | Segmenting the SSAM cell type map
2 | =================================
3 | 
4 | While we demonstrate the accuracy of SSAM in reconstructing cell-type
5 | maps, we understand that many applications in biology require cell
6 | segmentation. As such, the development branch of SSAM supports
7 | segmentation of the cell-type map using the ``watershed`` algorithm.
8 | 
9 | **This is an experimental feature!**
10 | 
11 | The segmentation of the cell type map can be performed by:
12 | 
13 | .. code-block:: python
14 | 
15 |     import pickle
16 | 
17 |     import numpy as np
18 |     from skimage import filters
19 | 
20 |     # Load DAPI image
21 |     with open('zenodo/osmFISH/raw_data/im_nuc_small.pickle', 'rb') as f:
22 |         dapi = pickle.load(f)
23 |     dapi_small = np.hstack([dapi.T[:1640], np.zeros([1640, 12])]).reshape(ds.vf_norm.shape)
24 | 
25 |     # Threshold DAPI image to create markers
26 |     dapi_threshold = filters.threshold_local(dapi_small[..., 0], 35, offset=-0.0002)
27 |     dapi_thresh_im = dapi_small[..., 0] > dapi_threshold
28 |     dapi_thresh_im = dapi_thresh_im.reshape(ds.vf_norm.shape).astype(np.uint8) * 255
29 | 
30 |     # Run watershed segmentation of cell-type maps with DAPI as markers
31 |     # After running below, the segmentation data will be available as:
32 |     # - Segmentations: ds.watershed_segmentations
33 |     # - Cell-type map: ds.watershed_celltype_map
34 |     analysis.run_watershed(dapi_thresh_im)
35 | 
36 | Below we demonstrate the application of the segmentation on the *de
37 | novo* cell-type map generated for the mouse SSp osmFISH data.
38 | 
39 | |image0|
40 | 
41 | .. |image0| image:: ../images/segmented_celltype_map.png
42 | 
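After ``analysis.run_watershed`` finishes, the outputs noted in the comments above (``ds.watershed_segmentations`` and ``ds.watershed_celltype_map``) are plain label arrays, so a quick look is possible with standard matplotlib calls. A sketch, assuming the arrays share the vector field's shape so that ``[..., 0]`` picks the first z-slice:

.. code-block:: python

    plt.figure(figsize=[5, 5])
    # label image of the watershed-segmented cell-type map
    # (assumption: same spatial shape as ds.vf_norm)
    plt.imshow(ds.watershed_celltype_map[..., 0], cmap='jet')
    plt.axis('off')
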
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | pandas
4 | matplotlib
5 | seaborn
6 | scikit-learn
7 | umap-learn
8 | python-louvain
9 | sparse
10 | scikit-image
11 | pyarrow
12 | packaging
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | import io
2 | import setuptools
3 | try:
4 |     import numpy as np
5 | except ImportError:
6 |     print("Please install Numpy first. e.g. pip install numpy")
7 |     exit(1)
8 | from glob import glob
9 | 
10 | module_utils = setuptools.extension.Extension('ssam.utils', sources=["c/utils.cpp"], extra_compile_args=["-fopenmp"], extra_link_args=["-fopenmp"], include_dirs=[np.get_include()])
11 | 
12 | with io.open("README.rst", "r", encoding="utf-8") as fh:
13 |     long_description = fh.read()
14 | 
15 | setuptools.setup(
16 |     name="ssam",
17 |     version="1.0.2",
18 |     author="Jeongbin Park",
19 |     author_email="j.park@dkfz-heidelberg.de",
20 |     description="SSAM",
21 |     long_description=long_description,
22 |     long_description_content_type="text/x-rst",  # README.rst is reStructuredText
23 |     url="https://github.com/HiDiHlabs/ssam",
24 |     packages=setuptools.find_packages(),
25 |     classifiers=[
26 |         "Programming Language :: Python :: 3",
27 |         "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
28 |         "Operating System :: POSIX",
29 |     ],
30 |     ext_modules = [module_utils],
31 |     install_requires=[
32 |         "numpy",
33 |         "scipy",
34 |         "pandas",
35 |         "matplotlib",
36 |         "seaborn",
37 |         "scikit-learn",
38 |         "umap-learn",
39 |         "python-louvain",
40 |         "sparse",
41 |         "scikit-image",
42 |         "pyarrow",
43 |         "packaging",
44 |     ]
45 | )
46 | 
-------------------------------------------------------------------------------- /ssam/__init__.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from matplotlib.colors import to_rgba
5 | import seaborn as sns
6 | import multiprocessing
7 | import os, sys  # sys is used by run_sctransform below
8 | sns.set()
9 | sns.set_style("whitegrid", {'axes.grid' : False})
10 | from functools import reduce
11 | from sklearn.neighbors import KernelDensity
12 | from sklearn import preprocessing
13 | import scipy
14 | from scipy import ndimage
15 | from sklearn.decomposition import PCA
16 | from sklearn.manifold import TSNE
17 | from umap import UMAP
18 | from multiprocessing import Pool
19 | from contextlib import closing
20 | from tempfile import mkdtemp, TemporaryDirectory
21 | from sklearn.neighbors import kneighbors_graph
22 | from sklearn.cluster import KMeans
23 | import community
24 | import networkx as nx
25 | from sklearn.cluster import DBSCAN
26 | import sparse
27 | from skimage import filters
28 | from skimage.morphology import disk
29 | from skimage import measure
30 | from matplotlib.colors import ListedColormap
31 | import pickle
32 | import subprocess
33 | from scipy.spatial.distance import cdist
34 | from sklearn.cluster import AgglomerativeClustering
35 | from PIL import Image
36 | from scipy.ndimage import zoom
37 | import pyarrow
38 | import time
39 | from packaging import version
40 | 
41 | from .utils import corr, calc_ctmap, calc_corrmap, flood_fill, calc_kde
42 | 
43 | def _fast_gaussian_kde(args):
44 |     # TODO: 1) support sampling distance
45 |     #       2) support other kernels
46 |     (bandwidth, save_dir, gene_name, shape, locations, sampling_distance) = args
47 | 
48 |     print('Processing gene %s...'%gene_name)
49 | 
50 |     maxdist = int(bandwidth * 4)
51 |     span = np.linspace(-maxdist,maxdist,maxdist*2+1)
52 |     X, Y, Z = np.meshgrid(span,span,span)
53 | 
54 |     def create_kernel(x, y, z):
55 |         X_=(-x+X)/bandwidth
56 |         Y_=(-y+Y)/bandwidth
57 |         Z_=(-z+Z)/bandwidth
58 |         return np.exp(-0.5*(X_**2+Y_**2+Z_**2))
59 | 
60 |     pd = np.zeros(shape)
61 |     for loc in locations:
62 |         int_loc = [int(i) for i in loc]
63 |         rem_loc = [i%1 for i in loc]
64 | 
65 |         kernel = create_kernel(*rem_loc)
66 | 
67 |         pos_start = [i - maxdist for i in int_loc]
68 |         pos_end = [i + maxdist + 1 for i in int_loc]
69 | 
70 |         kernel_pos_start = [abs(i) if i < 0 else 0 for i in pos_start]
71 |         kernel_pos_end = [maxdist*2+1 - (i-j) if i > j else maxdist*2+1 for i, j in zip(pos_end, shape)]
72 | 
73 |         pos_start = [0 if i < 0 else i for i in pos_start]
74 |         pos_end = [j if i >= j else i for i, j in zip(pos_end, shape)]
75 | 
76 |         slices = tuple([slice(i, j) for i, j in zip(pos_start, pos_end)])
77 |         kernel_slices = tuple([slice(i, j) for i, j in zip(kernel_pos_start, kernel_pos_end)])
78 |         pd[slices] += kernel.swapaxes(0, 1)[kernel_slices]
79 | 
80 |     pd /= pd.sum()
81 |     pd *= len(locations)
82 | 
83 |     return pd
84 | 
85 | def run_sctransform(data, clip_range=None, verbose=True, debug_path=None, plot_model_pars=False, **kwargs):
86 |     """
87 |     Run the 'sctransform' R package and return the normalized matrix and the model parameters.
88 |     Package 'feather' is used for the data exchange between R and Python.
89 |     :param data: N x D ndarray to normalize (N is number of samples, D is number of dimensions).
90 |     :type data: numpy.ndarray
91 |     :param kwargs: Any keyword arguments passed to R function `vst`.
92 |     :returns: A 2-tuple, which contains two pandas.dataframe:
93 |         (1) normalized N x D matrix.
94 |         (2) determined model parameters.
95 |     """
96 |     def _log(m):
97 |         if verbose:
98 |             print(m)
99 | 
100 |     vst_options = ['%s = "%s"'%(k, v) if type(v) is str else '%s = %s'%(k, v) for k, v in kwargs.items()]
101 |     if len(vst_options) == 0:
102 |         vst_opt_str = ''
103 |     else:
104 |         vst_opt_str = ', ' + ', '.join(vst_options)
105 |     with TemporaryDirectory() as tmpdirname:
106 |         if debug_path:
107 |             tmpdirname = debug_path
108 |         ifn, ofn, pfn, rfn = [os.path.join(tmpdirname, e) for e in ["in.feather", "out.feather", "fit_params.feather", "script.R"]]
109 |         _log("Writing temporary files...")
110 |         if isinstance(data, pd.DataFrame):
111 |             df = data
112 |         else:
113 |             df = pd.DataFrame(data, columns=[str(e) for e in range(data.shape[1])])
114 |         if version.parse(pyarrow.__version__) >= version.parse("1.0.0"):
115 |             df.to_feather(ifn, version=1)
116 |         else:
117 |             df.to_feather(ifn)
118 |         rcmd = 'library(feather); library(sctransform); mat <- t(as.matrix(read_feather("{0}"))); colnames(mat) <- 1:ncol(mat); res <- vst(mat{1}, return_gene_attr=TRUE, return_cell_attr=TRUE); write_feather(as.data.frame(t(res$y)), "{2}"); write_feather(as.data.frame(res$model_pars_fit), "{3}");'.format(ifn, vst_opt_str, ofn, pfn)
119 |         if plot_model_pars:
120 |             plot_path = os.path.join(tmpdirname, 'model_pars.png')
121 |             rcmd += 'png(file="%s", width=3600, height=1200, res=300); plot_model_pars(res, show_var=TRUE); dev.off();'%plot_path
122 |         rcmd = rcmd.replace('\\', '\\\\')
123 |         with open(rfn, "w") as f:
124 |             f.write(rcmd)
125 |         _log("Running scTransform via Rscript...")
126 |         proc = subprocess.Popen(["Rscript", rfn], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
127 |         while not proc.poll():
128 |             c = proc.stdout.read(1)
129 |             if not c:
130 |                 break
131 |             if verbose:
132 |                 try:
133 |                     sys.stdout.write(c.decode("utf-8"))
134 |                 except:
135 |                     pass
136 |             time.sleep(0.0001)
137 |         _log("Reading output files...")
138 |         o, p = pd.read_feather(ofn), pd.read_feather(pfn)
139 |         if plot_model_pars:
140 |             try:
141 |                 from matplotlib.image import imread
142 |                 import matplotlib.pyplot as plt
143 |                 img = imread(plot_path)
144 |                 dpi = 80
145 |                 fig = plt.figure(figsize=(img.shape[1]/dpi, img.shape[0]/dpi), dpi=dpi)
146 |                 plt.imshow(img, interpolation='nearest')
147 |                 plt.gca().set_axis_off()
148 |                 plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
149 |                 plt.margins(0, 0)
150 |                 plt.gca().xaxis.set_major_locator(plt.NullLocator())
151 |                 plt.gca().yaxis.set_major_locator(plt.NullLocator())
152 |                 plt.show()
153 |             except:
154 |                 print("Warning: plotting failed, perhaps matplotlib is not available?")
155 |         _log("Clipping residuals...")
156 |         if clip_range is None:
157 |             r = np.sqrt(data.shape[0]/30.0)
158 |             clip_range = (-r, r)
159 |         o = o.clip(*clip_range)  # DataFrame.clip is not in-place; keep the clipped result
160 |         return o, p
161 | 
162 | 
163 | class SSAMDataset(object):
164 |     """
165 |     A class to store initial values and results of SSAM analysis.
166 | 
167 |     :param genes: The genes that will be used for the analysis.
168 |     :type genes: list(str)
169 |     :param locations: Location of the mRNAs in um, given as a list of
170 |         N x D ndarrays (N is number of mRNAs, D is number of dimensions).
171 |     :type locations: list(numpy.ndarray)
172 |     :param width: Width of the image in um.
173 |     :type width: float
174 |     :param height: Height of the image in um.
175 |     :type height: float
176 |     :param depth: Depth of the image in um. Depth == 1 means 2D image.
177 | :type depth: float 178 | """ 179 | 180 | def __init__(self, genes, locations, width, height, depth=1): 181 | if depth < 1 or width < 1 or height < 1: 182 | raise ValueError("Invalid image dimension") 183 | self.shape = (width, height, depth) 184 | self.ndim = 2 if depth == 1 else 3 185 | self.genes = list(genes) 186 | self.locations = [] 187 | for l in list(locations): 188 | if l.shape[-1] == 3: 189 | self.locations.append(l) 190 | elif l.shape[-1] == 2: 191 | self.locations.append(np.concatenate((l, np.zeros([l.shape[0], 1])), axis=1)) 192 | else: 193 | raise ValueError("Invalid mRNA locations") 194 | self.__vf = None 195 | self.__vf_norm = None 196 | self.normalized_vectors = None 197 | self.expanded_vectors = None 198 | self.cluster_labels = None 199 | #self.corr_map = None 200 | self.tsne = None 201 | self.umap = None 202 | self.normalized_vf = None 203 | self.excluded_clusters = None 204 | self.celltype_binned_counts = None 205 | 206 | @property 207 | def vf(self): 208 | """ 209 | Vector field as a numpy.ndarray. 210 | """ 211 | return self.__vf 212 | 213 | @vf.setter 214 | def vf(self, vf): 215 | self.__vf = vf 216 | self.__vf_norm = None 217 | 218 | @property 219 | def vf_norm(self): 220 | """ 221 | `L1-norm `_ of the vector field as a numpy.ndarray. 222 | """ 223 | 224 | if self.vf is None: 225 | return None 226 | if self.__vf_norm is None: 227 | self.__vf_norm = np.sum(self.vf, axis=len(self.vf.shape) - 1) 228 | return self.__vf_norm 229 | 230 | def plot_l1norm(self, cmap="viridis", rotate=0, z=None): 231 | """ 232 | Plot the `L1-norm `_ of the vector field. 233 | 234 | :param cmap: Colormap used for the plot. 235 | :type cmap: str or matplotlib.colors.Colormap 236 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3. 237 | :type rotate: int 238 | :param z: Z index to slice 3D vector field. 239 | If not given, the slice at the middle will be plotted. 240 | :type z: int 241 | """ 242 | if z is None: 243 | z = int(self.vf_norm.shape[2] / 2) 244 | if rotate < 0 or rotate > 3: 245 | raise ValueError("rotate can only be 0, 1, 2, 3") 246 | im = np.array(self.vf_norm, copy=True) 247 | if rotate == 1 or rotate == 3: 248 | im = im.swapaxes(0, 1) 249 | plt.imshow(im[..., z], cmap=cmap) 250 | if rotate == 1: 251 | plt.gca().invert_xaxis() 252 | elif rotate == 2: 253 | plt.gca().invert_xaxis() 254 | plt.gca().invert_yaxis() 255 | elif rotate == 3: 256 | plt.gca().invert_yaxis() 257 | 258 | def plot_localmax(self, c=None, cmap=None, s=1, rotate=0): 259 | """ 260 | Scatter plot the local maxima. 261 | 262 | :param c: Color of the scatter dots. Overrides `cmap` parameter. 263 | :type c: str or list(str), or list(float) or list(list(float)) 264 | :param cmap: Colormap of the scatter dots. 265 | :type cmap: str or matplotlib.colors.Colormap 266 | :param s: Size of the scatter dots. 267 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3. 
268 | :type rotate: int 269 | """ 270 | if rotate < 0 or rotate > 3: 271 | raise ValueError("rotate can only be 0, 1, 2, 3") 272 | if rotate == 0 or rotate == 2: 273 | dim0, dim1 = 1, 0 274 | elif rotate == 1 or rotate == 3: 275 | dim0, dim1 = 0, 1 276 | plt.scatter(self.local_maxs[dim0], self.local_maxs[dim1], s=s, c=c, cmap=cmap) 277 | plt.xlim([0, self.vf_norm.shape[dim0]]) 278 | plt.ylim([self.vf_norm.shape[dim1], 0]) 279 | if rotate == 1: 280 | plt.gca().invert_xaxis() 281 | elif rotate == 2: 282 | plt.gca().invert_xaxis() 283 | plt.gca().invert_yaxis() 284 | elif rotate == 3: 285 | plt.gca().invert_yaxis() 286 | 287 | def __run_pca(self, exclude_bad_clusters, pca_dims, random_state): 288 | if exclude_bad_clusters: 289 | good_vecs = self.normalized_vectors[self.filtered_cluster_labels != -1, :] 290 | else: 291 | good_vecs = self.normalized_vectors 292 | return PCA(n_components=pca_dims, random_state=random_state).fit_transform(good_vecs) 293 | 294 | def plot_tsne(self, run_tsne=False, pca_dims=10, n_iter=5000, perplexity=70, early_exaggeration=10, 295 | metric="correlation", exclude_bad_clusters=True, s=None, random_state=0, colors=[], excluded_color="#00000033", cmap="jet", tsne_kwargs={}): 296 | """ 297 | Scatter plot the tSNE embedding. 298 | 299 | :param run_tsne: If false, this method tries to load precomputed tSNE result before running tSNE. 300 | :type run_tsne: bool 301 | :param pca_dims: Number of PCA dimensions used for the tSNE embedding. 302 | :type pca_dims: int 303 | :param n_iter: Maximum number of iterations for the tSNE. 304 | :type n_iter: int 305 | :param perplexity: The perplexity value of the tSNE (please refer to the section `How should I set the perplexity in t-SNE?` in this `link `_). 306 | :type perplexity: float 307 | :param early_exaggeration: Early exaggeration parameter for tSNE. Controls the tightness of the resulting tSNE plot. 308 | :type early_exaggeration: float 309 | :param metric: Metric for calculation of distance between vectors in gene expression space. 310 | :type metric: str 311 | :param exclude_bad_clusters: If true, the vectors that are excluded by the clustering algorithm will not be considered for tSNE computation. 312 | :type exclude_bad_clusters: bool 313 | :param s: Size of the scatter dots. 314 | :type s: float 315 | :param random_state: Random seed or scikit-learn's random state object to replicate the same result 316 | :type random_state: int or random state object 317 | :param colors: Color of each clusters. 318 | :type colors: list(str), list(list(float)) 319 | :param excluded_color: Color of the vectors excluded by the clustering algorithm. 320 | :type excluded_color: str of list(float) 321 | :param cmap: Colormap for the clusters. 322 | :type cmap: str or matplotlib.colors.Colormap 323 | :param tsne_kwargs: Other keyward parameters for tSNE. 
324 | :type tsne_kwargs: dict 325 | """ 326 | if self.filtered_cluster_labels is None: 327 | exclude_bad_clusters = False 328 | if run_tsne or self.tsne is None: 329 | pcs = self.__run_pca(exclude_bad_clusters, pca_dims, random_state) 330 | self.tsne = TSNE(n_iter=n_iter, perplexity=perplexity, early_exaggeration=early_exaggeration, metric=metric, random_state=random_state, **tsne_kwargs).fit_transform(pcs[:, :pca_dims]) 331 | if self.filtered_cluster_labels is not None: 332 | cols = self.filtered_cluster_labels[self.filtered_cluster_labels != -1] 333 | else: 334 | cols = None 335 | if len(colors) > 0: 336 | cmap = ListedColormap(colors) 337 | if not exclude_bad_clusters and self.filtered_cluster_labels is not None: 338 | plt.scatter(self.tsne[:, 0][self.filtered_cluster_labels == -1], self.tsne[:, 1][self.filtered_cluster_labels == -1], s=s, c=excluded_color) 339 | plt.scatter(self.tsne[:, 0][self.filtered_cluster_labels != -1], self.tsne[:, 1][self.filtered_cluster_labels != -1], s=s, c=cols, cmap=cmap) 340 | else: 341 | plt.scatter(self.tsne[:, 0], self.tsne[:, 1], s=s, c=cols, cmap=cmap) 342 | return 343 | 344 | def plot_umap(self, run_umap=False, pca_dims=10, metric="correlation", exclude_bad_clusters=True, s=None, random_state=0, colors=[], excluded_color="#00000033", cmap="jet", umap_kwargs={}): 345 | """ 346 | Scatter plot the UMAP embedding. 347 | 348 | :param run_umap: If false, this method tries to load precomputed UMAP result before running UMAP. 349 | :type run_tsne: bool 350 | :param pca_dims: Number of PCA dimensions used for the UMAP embedding. 351 | :type pca_dims: int 352 | :param metric: Metric for calculation of distance between vectors in gene expression space. 353 | :type metric: str 354 | :param exclude_bad_clusters: If true, the vectors that are excluded by the clustering algorithm will not be considered for tSNE computation. 355 | :type exclude_bad_clusters: bool 356 | :param s: Size of the scatter dots. 357 | :type s: float 358 | :param random_state: Random seed or scikit-learn's random state object to replicate the same result 359 | :type random_state: int or random state object 360 | :param colors: Color of each clusters. 361 | :type colors: list(str), list(list(float)) 362 | :param excluded_color: Color of the vectors excluded by the clustering algorithm. 363 | :type excluded_color: str of list(float) 364 | :param cmap: Colormap for the clusters. 365 | :type cmap: str or matplotlib.colors.Colormap 366 | :param umap_kwargs: Other keyward parameters for UMAP. 
367 | :type umap_kwargs: dict 368 | """ 369 | if self.filtered_cluster_labels is None: 370 | exclude_bad_clusters = False 371 | if run_umap or self.umap is None: 372 | pcs = self.__run_pca(exclude_bad_clusters, pca_dims, random_state) 373 | self.umap = UMAP(metric=metric, random_state=random_state, **umap_kwargs).fit_transform(pcs[:, :pca_dims]) 374 | if self.filtered_cluster_labels is not None: 375 | cols = self.filtered_cluster_labels[self.filtered_cluster_labels != -1] 376 | else: 377 | cols = None 378 | if len(colors) > 0: 379 | cmap = ListedColormap(colors) 380 | if not exclude_bad_clusters and self.filtered_cluster_labels is not None: 381 | plt.scatter(self.umap[:, 0][self.filtered_cluster_labels == -1], self.umap[:, 1][self.filtered_cluster_labels == -1], s=s, c=excluded_color) 382 | plt.scatter(self.umap[:, 0][self.filtered_cluster_labels != -1], self.umap[:, 1][self.filtered_cluster_labels != -1], s=s, c=cols, cmap=cmap) 383 | else: 384 | plt.scatter(self.umap[:, 0], self.umap[:, 1], s=s, c=cols, cmap=cmap) 385 | return 386 | 387 | def plot_expanded_mask(self, cmap='Greys'): # TODO 388 | """ 389 | Plot the expanded area of the vectors (Not fully implemented yet). 390 | 391 | :param cmap: Colormap for the mask. 392 | """ 393 | plt.imshow(self.expanded_mask, vmin=0, vmax=1, cmap=cmap) 394 | return 395 | 396 | def plot_correlation_map(self, cmap='hot'): # TODO 397 | """ 398 | Plot the correlations near the vectors in the vector field (Not fully implemented yet). 399 | 400 | :param cmap: Colormap for the image. 401 | """ 402 | plt.imshow(self.corr_map, vmin=0.995, vmax=1.0, cmap=cmap) 403 | plt.colorbar() 404 | return 405 | 406 | def plot_celltypes_map(self, background="black", centroid_indices=[], colors=None, cmap='jet', rotate=0, min_r=0.6, set_alpha=False, z=None): 407 | """ 408 | Plot the merged cell-type map. 409 | 410 | :param background: Set background color of the cell-type map. 411 | :type background: str or list(float) 412 | :param centroid_indices: The centroids which will be in the cell type map. If not given, the cell-type map is drawn with all centroids. 413 | :type centroid_indices: list(int) 414 | :param colors: Color of the clusters. Overrides `cmap` parameter. 415 | :type colors: list(str), list(list(float)) 416 | :param cmap: Colormap for the clusters. 417 | :type cmap: str or matplotlib.colors.Colormap 418 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3. 419 | :type rotate: int 420 | :param min_r: Minimum correlation threshold for the cell-type map. 421 | This value is only for the plotting, does not affect to the cell-type maps generated by `filter_celltypemaps`. 422 | :type min_r: float 423 | :param set_alpha: Set alpha of each pixel based on the correlation. 424 | Not properly implemented yet, doesn't work properly with the background other than black. 425 | :type set_alpha: bool 426 | :param z: Z index to slice 3D cell-type map. 427 | If not given, the slice at the middle will be used. 
428 | :type z: int 429 | """ 430 | if z is None: 431 | z = int(self.shape[2] / 2) 432 | num_ctmaps = np.max(self.filtered_celltype_maps) + 1 433 | 434 | if len(centroid_indices) == 0: 435 | centroid_indices = list(range(num_ctmaps)) 436 | 437 | if colors is None: 438 | cmap_internal = plt.get_cmap(cmap) 439 | colors = cmap_internal([float(i) / (num_ctmaps - 1) for i in range(num_ctmaps)]) 440 | 441 | all_colors = [background if not j in centroid_indices else colors[i] for i, j in enumerate(range(num_ctmaps))] 442 | cmap_internal = ListedColormap(all_colors) 443 | 444 | celltype_maps_internal = np.array(self.filtered_celltype_maps[..., z], copy=True) 445 | empty_mask = celltype_maps_internal == -1 446 | celltype_maps_internal[empty_mask] = 0 447 | sctmap = cmap_internal(celltype_maps_internal) 448 | sctmap[empty_mask] = (0, 0, 0, 0) 449 | 450 | if set_alpha: 451 | alpha = np.array(self.max_correlations[..., z], copy=True) 452 | alpha[alpha < 0] = 0 # drop negative correlations 453 | alpha = min_r + alpha / (np.max(alpha) / (1.0 - min_r)) 454 | sctmap[..., 3] = alpha 455 | 456 | if rotate == 1 or rotate == 3: 457 | sctmap = sctmap.swapaxes(0, 1) 458 | 459 | plt.gca().set_facecolor(background) 460 | plt.imshow(sctmap) 461 | 462 | if rotate == 1: 463 | plt.gca().invert_xaxis() 464 | elif rotate == 2: 465 | plt.gca().invert_xaxis() 466 | plt.gca().invert_yaxis() 467 | elif rotate == 3: 468 | plt.gca().invert_yaxis() 469 | 470 | return 471 | 472 | def plot_domains(self, background='white', colors=None, cmap='jet', rotate=0, domain_background=False, background_alpha=0.3, z=None): 473 | """ 474 | Plot tissue domains. 475 | 476 | :param background: Background color of the plot. 477 | :type background: str or list(float) 478 | :param colors: Color of the domains. Overrides `cmap` parameter. 479 | :type colors: list(str), list(list(float)) 480 | :param cmap: Colormap for the domains. 481 | :type cmap: str or matplotlib.colors.Colormap 482 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3. 483 | :type rotate: int 484 | :param domain_background: Show the area of the inferred domains behind the domain map. 485 | :type domain_background: bool 486 | :param background_alpha: The alpha value of the area of the inferred domains. 487 | :type background_alpha: float 488 | :param z: Z index to slice 3D domain map. 489 | If not given, the slice at the middle will be used. 
490 | :type z: int 491 | """ 492 | if z is None: 493 | z = int(self.shape[2] / 2) 494 | 495 | inferred_domains = self.inferred_domains[..., z] 496 | inferred_domains_cells = self.inferred_domains_cells[..., z] 497 | 498 | if rotate == 1 or rotate == 3: 499 | inferred_domains = inferred_domains.swapaxes(0, 1) 500 | inferred_domains_cells = inferred_domains_cells.swapaxes(0, 1) 501 | 502 | if colors is None: 503 | cmap_internal = plt.get_cmap(cmap) 504 | colors_domains = cmap_internal(np.linspace(0, 1, np.max(inferred_domains) + 1)) 505 | colors_cells = cmap_internal(np.linspace(0, 1, np.max(inferred_domains_cells) + 1)) 506 | 507 | colors_domains[:, 3] = background_alpha 508 | if -1 in inferred_domains: 509 | colors_domains = [[0, 0, 0, 0]] + list(colors_domains) 510 | if -1 in inferred_domains_cells: 511 | colors_cells = [[0, 0, 0, 0]] + list(colors_cells) 512 | 513 | plt.gca().set_facecolor(background) 514 | if domain_background: 515 | plt.imshow(inferred_domains, cmap=ListedColormap(colors_domains)) 516 | plt.imshow(inferred_domains_cells, cmap=ListedColormap(colors_cells)) 517 | 518 | if rotate == 1: 519 | plt.gca().invert_xaxis() 520 | elif rotate == 2: 521 | plt.gca().invert_xaxis() 522 | plt.gca().invert_yaxis() 523 | elif rotate == 3: 524 | plt.gca().invert_yaxis() 525 | 526 | return 527 | 528 | def plot_diagnostic_plot(self, centroid_index, cluster_name=None, cluster_color=None, cmap=None, rotate=0, z=None, use_embedding="tsne", known_signatures=[], correlation_methods=[]): 529 | """ 530 | Plot the diagnostic plot. This method requires `plot_tsne` or `plot_umap` was run at least once before. 531 | 532 | :param centroid_index: Index of the centroid for the diagnostic plot. 533 | :type centroid_index: int 534 | :param cluster_name: The name of the cluster. 535 | :type cluster_name: str 536 | :param cluster_color: The color of the cluster. Overrides `cmap` parameter. 537 | :type cluster_color: str or list(float) 538 | :param cmap: The colormap for the clusters. The cluster color is determined using the `centroid_index` th color of the given colormap. 539 | :type cmap: str or matplotlib.colors.Colormap 540 | :param rotate: Rotate the plot. Possible values are 0, 1, 2, and 3. 541 | :type rotate: int 542 | :param z: Z index to slice 3D vector norm and cell-type map plots. 543 | If not given, the slice at the middle will be used. 544 | :type z: int 545 | :param use_embedding: The type of the embedding for the last panel. Possible values are "tsne" or "umap". 546 | :type use_embedding: str 547 | :param known_signatures: The list of known signatures, which will be displayed in the 3rd panel. Each signature can be 3-tuple or 4-tuple, 548 | containing 1) the name of signature, 2) gene labels of the signature, 3) gene expression values of the signature, 4) optionally the color of the signature. 549 | :type known_signatures: list(tuple) 550 | :param correlation_methods: The correlation method used to determine max correlation of the centroid to the `known_signatures`. 
Each method should be 2-tuple, 551 | containing 1) the name of the correaltion, 2) the correaltion function (compatiable with the correlation methods available in `scipy.stats `_) 552 | :type correlation_methods: list(tuple) 553 | """ 554 | if z is None: 555 | z = int(self.vf_norm.shape[2] / 2) 556 | p, e = self.centroids[centroid_index], self.centroids_stdev[centroid_index] 557 | if cluster_name is None: 558 | cluster_name = "Cluster #%d"%centroid_index 559 | 560 | if cluster_color is None: 561 | if cmap is None: 562 | cmap = plt.get_cmap("jet") 563 | cluster_color = cmap(centroid_index / (len(self.centroids) - 1)) 564 | 565 | if len(correlation_methods) == 0: 566 | correlation_methods = [("r", corr), ] 567 | total_signatures = len(correlation_methods) * len(known_signatures) + 1 568 | 569 | ax = plt.subplot(1, 4, 1) 570 | mask = self.filtered_cluster_labels == centroid_index 571 | plt.scatter(self.local_maxs[0][mask], self.local_maxs[1][mask], c=[cluster_color]) 572 | self.plot_l1norm(rotate=rotate, cmap="Greys", z=z) 573 | 574 | ax = plt.subplot(1, 4, 2) 575 | ctmap = np.zeros([self.filtered_celltype_maps.shape[0], self.filtered_celltype_maps.shape[1], 4]) 576 | ctmap[self.filtered_celltype_maps[..., z] == centroid_index] = to_rgba(cluster_color) 577 | ctmap[np.logical_and(self.filtered_celltype_maps[..., z] != centroid_index, self.filtered_celltype_maps[..., 0] > -1)] = [0.9, 0.9, 0.9, 1] 578 | if rotate == 1 or rotate == 3: 579 | ctmap = ctmap.swapaxes(0, 1) 580 | ax.imshow(ctmap) 581 | if rotate == 1: 582 | ax.invert_xaxis() 583 | elif rotate == 2: 584 | ax.invert_xaxis() 585 | ax.invert_yaxis() 586 | elif rotate == 3: 587 | ax.invert_yaxis() 588 | 589 | ax = plt.subplot(total_signatures, 4, 3) 590 | ax.bar(self.genes, p, yerr=e) 591 | ax.set_title(cluster_name) 592 | plt.xlim([-1, len(self.genes)]) 593 | plt.xticks(rotation=90) 594 | 595 | subplot_idx = 0 596 | for signature in known_signatures: 597 | sig_title, sig_labels, sig_values = signature[:3] 598 | sig_colors_defined = False 599 | if len(signature) == 4: 600 | sig_colors = signature[3] 601 | sig_colors_defined = True 602 | for corr_label, corr_func in correlation_methods: 603 | corr_results = [corr_func(p, sig_value) for sig_value in sig_values] 604 | corr_results = [e[0] if hasattr(e, "__getitem__") else e for e in corr_results] 605 | max_corr_idx = np.argmax(corr_results) 606 | ax = plt.subplot(total_signatures, 4, 7+subplot_idx*4) 607 | lbl = sig_labels[max_corr_idx] 608 | if sig_colors_defined: 609 | col = sig_colors[max_corr_idx] 610 | else: 611 | col = cluster_color 612 | ax.bar(self.genes, sig_values[max_corr_idx], color=col) 613 | ax.set_title("%s in %s (max %s, %.3f)"%(lbl, sig_title, corr_label, corr_results[max_corr_idx])) 614 | plt.xlim([-1, len(self.genes)]) 615 | plt.xticks(rotation=90) 616 | subplot_idx += 1 617 | 618 | if use_embedding == 'tsne': 619 | embedding = self.tsne 620 | fig_title = "t-SNE, %d vectors"%sum(self.filtered_cluster_labels == centroid_index) 621 | elif use_embedding == 'umap': 622 | embedding = self.umap 623 | fig_title = "UMAP, %d vectors"%sum(self.filtered_cluster_labels == centroid_index) 624 | good_vectors = self.filtered_cluster_labels[self.filtered_cluster_labels != -1] 625 | ax = plt.subplot(1, 4, 4) 626 | ax.scatter(embedding[:, 0][good_vectors != centroid_index], embedding[:, 1][good_vectors != centroid_index], c=[[0.8, 0.8, 0.8, 1],], s=80) 627 | ax.scatter(embedding[:, 0][good_vectors == centroid_index], embedding[:, 1][good_vectors == centroid_index], c=[cluster_color], s=80) 
628 | ax.get_xaxis().set_visible(False) 629 | ax.get_yaxis().set_visible(False) 630 | ax.set_title(fig_title) 631 | 632 | def plot_celltype_composition(self, domain_index, cell_type_colors=None, cell_type_cmap='jet', cell_type_orders=None, label_cutoff=0.03, pctdistance=1.15, **kwargs): 633 | """ 634 | Plot composition of cell types in each domain. 635 | 636 | :param domain_index: Index of the domain. 637 | :type domain_index: int 638 | :param cell_type_colors: The colors of the cell types. Overrides `cell_type_cmap` parameter. 639 | :type cell_type_colors: str or list(float) 640 | :param cell_type_cmap: The colormap for the cell types. 641 | :type cell_type_cmap: str or matplotlib.colors.Colormap 642 | :param label_cutoff: The minimum cutoff of the labeling of the percentage. From 0 to 1. 643 | :type label_cutoff: float 644 | :param pctdistance: The distance from center of the pie to the labels. 645 | :type pctdistance: float 646 | :param kwargs: More kewward arguments for the matplotlib.pyplot.pie. 647 | """ 648 | if cell_type_colors is None: 649 | cmap = plt.get_cmap(cell_type_cmap) 650 | cell_type_colors = cmap(np.arange(0, len(self.centroids)) / (len(self.centroids) - 1)) 651 | 652 | if cell_type_orders is not None: 653 | ctcs = np.array(cell_type_colors)[cell_type_orders] 654 | p = self.inferred_domains_compositions[domain_index][cell_type_orders] 655 | else: 656 | ctcs = cell_type_colors 657 | p = self.inferred_domains_compositions[domain_index] 658 | plt.pie(p, 659 | colors=ctcs, 660 | autopct=lambda e: '%.1f %%'%e if e > 3 else '', 661 | pctdistance=pctdistance, **kwargs) 662 | 663 | def plot_spatial_relationships(self, cluster_labels, *args, **kwargs): 664 | """ 665 | Plot spatial relationship between cell types, presented as a heatmap. 666 | 667 | :param cluster_labels: x- and y-axis label of the heatmap. 668 | :type cluster_labels: list(str) 669 | :param args: More arguments for the seaborn.heatmap. 670 | :param kwargs: More keyword arguments for the seaborn.heatmap. 671 | """ 672 | sns.heatmap(self.spatial_relationships, *args, xticklabels=cluster_labels, yticklabels=cluster_labels, **kwargs) 673 | 674 | def get_celltype_correlation(self, idx): 675 | """ 676 | Get correlation values of a cell type map between the given cluster's centroid to the vector field. 677 | 678 | :param idx: Index of a cluster 679 | :type idx: int 680 | :return: Correlation values of a cell type map of the specified cluster's centroid 681 | :rtype: numpy.ndarray 682 | """ 683 | rtn = np.zeros_like(self.max_correlations) - 1 684 | rtn[self.celltype_maps == idx] = self.max_correlations[self.celltype_maps == idx] 685 | return rtn 686 | 687 | 688 | class SSAMAnalysis(object): 689 | """ 690 | A class to run SSAM analysis. 691 | 692 | :param dataset: A SSAMDataset object. 693 | :type dataset: SSAMDataset 694 | :param ncores: Number of cores for parallel computation. If a negative value is given, 695 | ((# of all available cores on system) - abs(ncores)) cores will be used. 696 | :type ncores: int 697 | :param save_dir: Directory to store intermediate data (e.g. density / vector field). 698 | Any data which already exists will be loaded and reused. 699 | :type save_dir: str 700 | :param verbose: If True, then it prints out messages during the analysis. 
701 | :type verbose: bool 702 | """ 703 | def __init__(self, dataset, ncores=-1, save_dir="", verbose=False): 704 | 705 | self.dataset = dataset 706 | if not ncores > 0: 707 | ncores += multiprocessing.cpu_count() 708 | if ncores > multiprocessing.cpu_count(): 709 | ncores = multiprocessing.cpu_count() 710 | if not ncores > 0: 711 | raise ValueError("Invalid number of cores.") 712 | self.ncores = ncores 713 | self.use_savedir = True 714 | if len(save_dir) == 0: 715 | save_dir = mkdtemp() 716 | self.use_savedir = False 717 | if not os.path.exists(save_dir): 718 | os.makedirs(save_dir) 719 | self.save_dir = save_dir 720 | self.verbose = verbose 721 | 722 | def __m__(self, message): 723 | if self.verbose: 724 | print(message) 725 | 726 | def run_kde(self, kernel="gaussian", bandwidth=2.5, sampling_distance=1.0, use_mmap=False): 727 | """ 728 | Run KDE to estimate density of mRNA. 729 | 730 | :param kernel: Kernel for density estimation. 731 | :type kernel: str 732 | :param bandwidth: Parameter to adjust width of kernel. 733 | Set it 2.5 to make FWTM of Gaussian kernel to be ~10um (assume that avg. cell diameter is ~10um). 734 | :type bandwidth: float 735 | :param sampling_distance: Grid spacing in um. 736 | :type sampling_distance: float 737 | :param use_mmap: Use MMAP to reduce memory usage during analysis. 738 | Turning on this option can reduce the amount of memory used by SSAM analysis, but also lower the analysis speed. 739 | :type use_mmap: bool 740 | """ 741 | def save_pickle(fn, o): 742 | with open(fn, "wb") as f: 743 | return pickle.dump(o, f, protocol=4) 744 | def load_pickle(fn): 745 | with open(fn, "rb") as f: 746 | return pickle.load(f) 747 | 748 | steps = [int(np.ceil(e / sampling_distance)) for e in self.dataset.shape] 749 | total_steps = np.prod(steps) 750 | vf_shape = tuple(steps + [len(self.dataset.genes), ]) 751 | vf_filename = os.path.join(self.save_dir, 'vf_sd%s_bw%s'%( 752 | ('%f' % sampling_distance).rstrip('0').rstrip('.'), 753 | ('%f' % bandwidth).rstrip('0').rstrip('.') 754 | )) 755 | if (use_mmap and not os.path.exists(vf_filename + '.dat')) or \ 756 | (not use_mmap and not os.path.exists(vf_filename + '.pkl') and not os.path.exists(vf_filename + '.dat')): 757 | # If VF file doesn't exist, then run KDE 758 | if use_mmap: 759 | vf = np.memmap(vf_filename + '.dat.tmp', dtype='double', mode='w+', shape=vf_shape) 760 | else: 761 | vf = np.zeros(vf_shape) 762 | chunksize = min(int(np.ceil(total_steps / self.ncores)), 100000) 763 | def yield_chunk(): 764 | chunk = np.zeros(shape=[chunksize, len(steps)], dtype=int) 765 | cnt = 0 766 | remaining_cnt = total_steps 767 | for x in range(steps[0]): 768 | for y in range(steps[1]): 769 | for z in range(steps[2]): 770 | chunk[cnt, :] = [x, y, z] 771 | cnt += 1 772 | if cnt == chunksize: 773 | yield chunk 774 | remaining_cnt -= cnt 775 | cnt = 0 776 | chunk = np.zeros(shape=[min(chunksize, remaining_cnt), len(steps)], dtype=int) 777 | if cnt > 0: 778 | yield chunk 779 | 780 | def yield_chunks(): 781 | chunks = [] 782 | for chunk in yield_chunk(): 783 | chunks.append(chunk) 784 | if len(chunks) == self.ncores: 785 | yield chunks 786 | chunks = [] 787 | if len(chunks) > 0: 788 | yield chunks 789 | 790 | pool = None 791 | for gidx, gene_name in enumerate(self.dataset.genes): 792 | pdf_filename = os.path.join(self.save_dir, 'pdf_sd%s_bw%s_%s.npy'%( 793 | ('%f' % sampling_distance).rstrip('0').rstrip('.'), 794 | ('%f' % bandwidth).rstrip('0').rstrip('.'), 795 | gene_name) 796 | ) 797 | if os.path.exists(pdf_filename): 798 | 
self.__m__("Loading %s..."%gene_name) 799 | pdf = np.load(pdf_filename) 800 | else: 801 | self.__m__("Running KDE for %s..."%gene_name) 802 | pdf = np.zeros(shape=vf_shape[:-1]) 803 | if kernel != "gaussian": 804 | kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(self.dataset.locations[gidx]) 805 | if pool is None: 806 | pool = multiprocessing.Pool(self.ncores) 807 | else: 808 | X, Y, Z = [self.dataset.locations[gidx][:, i] for i in range(3)] 809 | for chunks in yield_chunks(): 810 | if kernel == "gaussian": 811 | pdf_chunks = [calc_kde(bandwidth, X, Y, Z, chunk[:, 0], chunk[:, 1], chunk[:, 2], 0, self.ncores) for chunk in chunks] 812 | else: 813 | pdf_chunks = pool.map(kde.score_samples, [chunk * sampling_distance for chunk in chunks]) 814 | for pdf_chunk, pos_chunk in zip(pdf_chunks, chunks): 815 | if kernel == "gaussian": 816 | pdf[pos_chunk[:, 0], pos_chunk[:, 1], pos_chunk[:, 2]] = pdf_chunk 817 | else: 818 | pdf[pos_chunk[:, 0], pos_chunk[:, 1], pos_chunk[:, 2]] = np.exp(pdf_chunk) 819 | pdf /= np.sum(pdf) 820 | np.save(pdf_filename, pdf) 821 | vf[..., gidx] = pdf * len(self.dataset.locations[gidx]) 822 | if use_mmap: 823 | vf.flush() 824 | os.rename(vf_filename + '.dat.tmp', vf_filename + '.dat') 825 | vf = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape) 826 | elif self.use_savedir: 827 | save_pickle(vf_filename + '.pkl', vf) 828 | elif not use_mmap: 829 | if os.path.exists(vf_filename + '.pkl'): 830 | vf = load_pickle(vf_filename + '.pkl') 831 | else: # == os.path.exists(vf_filename + '.dat'): 832 | vf_tmp = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape) 833 | vf = np.array(vf_tmp, copy=True) 834 | if self.use_savedir: 835 | save_pickle(vf_filename + '.pkl', vf) 836 | elif use_mmap: 837 | vf = np.memmap(vf_filename + '.dat', dtype='double', mode='r', shape=vf_shape) 838 | self.dataset.vf = vf 839 | return 840 | 841 | def run_fast_kde(self, kernel='gaussian', bandwidth=2.5, sampling_distance=1.0, re_run=False, use_mmap=False): 842 | """ 843 | Run KDE faster than `run_kde` method. This method uses precomputed kernels to estimate density of mRNA. 844 | 845 | :param kernel: Kernel for density estimation. Currently only Gaussian kernel is supported. 846 | :type kernel: str 847 | :param bandwidth: Parameter to adjust width of kernel. 848 | Set it 2.5 to make FWTM of Gaussian kernel to be ~10um (assume that avg. cell diameter is ~10um). 849 | :type bandwidth: float 850 | :param sampling_distance: Grid spacing in um. Currently only 1 um is supported. 851 | :type sampling_distance: float 852 | :param re_run: Recomputes KDE, ignoring all existing precomputed densities in the data directory. 853 | :type re_run: bool 854 | :param use_mmap: Use MMAP to reduce memory usage during analysis. Currently not implemented, this option should be always disabled. 
855 | :type use_mmap: bool 856 | """ 857 | if kernel != 'gaussian': 858 | raise NotImplementedError('Only Gaussian kernel is supported.') 859 | if sampling_distance != 1.0: 860 | raise NotImplementedError('Sampling distance should be 1.') 861 | if use_mmap: 862 | raise NotImplementedError('MMAP is not supported yet.') 863 | 864 | def save_pickle(fn, o): 865 | with open(fn, "wb") as f: 866 | return pickle.dump(o, f, protocol=4) 867 | def load_pickle(fn): 868 | with open(fn, "rb") as f: 869 | return pickle.load(f) 870 | 871 | vf_filename = os.path.join(self.save_dir, 'vf_sd%s_bw%s.pkl'%( 872 | ('%f' % sampling_distance).rstrip('0').rstrip('.'), 873 | ('%f' % bandwidth).rstrip('0').rstrip('.') 874 | )) 875 | 876 | if os.path.exists(vf_filename) and not re_run: 877 | self.dataset.vf = load_pickle(vf_filename) 878 | return 879 | 880 | self.dataset.vf = np.zeros(self.dataset.shape+(len(self.dataset.genes),)) 881 | idcs = np.argsort([len(i) for i in self.dataset.locations])[::-1] 882 | pdf_filenames = [os.path.join(self.save_dir, 'pdf_sd%s_bw%s_%s.npy'%( 883 | ('%f' % sampling_distance).rstrip('0').rstrip('.'), 884 | ('%f' % bandwidth).rstrip('0').rstrip('.'), 885 | self.dataset.genes[gidx]) 886 | ) for gidx in idcs] 887 | 888 | if not re_run: 889 | idcs = np.where([not os.path.exists(fn) for fn in pdf_filenames])[0] 890 | for gidx in np.where([os.path.exists(fn) for fn in pdf_filenames])[0]: 891 | print("Loading gene %s..."%self.dataset.genes[gidx]) 892 | self.dataset.vf[..., gidx] = np.load(pdf_filenames[gidx]) 893 | 894 | if len(idcs) > 0: 895 | with closing(Pool(self.ncores, maxtasksperchild=1)) as p: 896 | res = p.imap(_fast_gaussian_kde,[(bandwidth, 897 | self.save_dir, 898 | self.dataset.genes[gidx], 899 | self.dataset.shape, 900 | self.dataset.locations[gidx], 901 | sampling_distance) for gidx in idcs]) 902 | for gidx, pd in zip(idcs, res): # imap returns result in the same order as the input array 903 | self.dataset.vf[..., gidx] = pd 904 | np.save(pdf_filenames[gidx], pd) 905 | p.close() 906 | p.join() 907 | save_pickle(vf_filename, self.dataset.vf) 908 | 909 | def calc_correlation_map(self, corr_size=3): 910 | """ 911 | Calculate local correlation map of the vector field. 912 | 913 | :param corr_size: Size of square (or cube) that is used to compute the local correlation values. 914 | This value should be an odd number. 915 | :type corr_size: int 916 | """ 917 | 918 | corr_map = calc_corrmap(self.dataset.vf, ncores=self.ncores, size=int(corr_size/2)) 919 | self.dataset.corr_map = np.array(corr_map, copy=True) 920 | return 921 | 922 | def find_localmax(self, search_size=3, min_norm=0, min_expression=0, mask=None): 923 | """ 924 | Find local maxima vectors in the norm of the vector field. 925 | 926 | :param search_size: Size of square (or cube in 3D) that is used to search for the local maxima. 927 | This value should be an odd number. 928 | :type search_size: int 929 | :param min_norm: Minimum value of norm at the local maxima. 930 | :type min_norm: float 931 | :param min_expression: Minimum value of gene expression in a unit pixel at the local maxima. 932 | mask: numpy.ndarray, optional 933 | If given, find vectors in the masked region, instead of the whole image. 
934 | :type min_expression: float 935 | """ 936 | 937 | max_mask = self.dataset.vf_norm == ndimage.maximum_filter(self.dataset.vf_norm, size=search_size) 938 | max_mask &= self.dataset.vf_norm > min_norm 939 | if min_expression > 0: 940 | exp_mask = np.zeros_like(max_mask) 941 | for i in range(len(self.dataset.genes)): 942 | exp_mask |= self.dataset.vf[..., i] > min_expression 943 | max_mask &= exp_mask 944 | if mask is not None: 945 | max_mask &= mask 946 | local_maxs = np.where(max_mask) 947 | self.__m__("Found %d local max vectors."%len(local_maxs[0])) 948 | self.dataset.local_maxs = local_maxs 949 | return 950 | 951 | def expand_localmax(self, r=0.99, min_pixels=7, max_pixels=1000): 952 | """ 953 | Merge the vectors nearby the local max vectors. 954 | Only the vectors with the large Pearson correlation values are merged. 955 | 956 | :param r: Minimum Pearson's correlation coefficient to look for the nearby vectors. 957 | :type r: float 958 | :param min_pixels: Minimum number of pixels to merge. 959 | :type min_pixels: float 960 | :param max_pixels: Maximum number of pixels to merge. 961 | :type max_pixels: float 962 | """ 963 | 964 | expanded_vecs = [] 965 | self.__m__("Expanding local max vectors...") 966 | fill_dx = np.meshgrid(range(3), range(3), range(3)) 967 | fill_dx = np.array(list(zip(*[np.ravel(e) - 1 for e in fill_dx]))) 968 | mask = np.zeros(self.dataset.vf.shape[:-1]) # TODO: sparse? 969 | nlocalmaxs = len(self.dataset.local_maxs[0]) 970 | valid_pos_list = [] 971 | for cnt, idx in enumerate(range(nlocalmaxs), start=1): 972 | local_pos = tuple(i[idx] for i in self.dataset.local_maxs) 973 | filled_pos = tuple(zip(*flood_fill(local_pos, self.dataset.vf, r, min_pixels, max_pixels))) 974 | if len(filled_pos) > 0: 975 | mask[filled_pos] = 1 976 | valid_pos_list.append(local_pos) 977 | expanded_vecs.append(np.sum(self.dataset.vf[filled_pos], axis=0)) 978 | if cnt % 100 == 0: 979 | self.__m__("Processed %d/%d..."%(cnt, nlocalmaxs)) 980 | self.__m__("Processed %d/%d..."%(cnt, nlocalmaxs)) 981 | self.dataset.expanded_vectors = np.array(expanded_vecs) 982 | self.dataset.expanded_mask = mask 983 | self.dataset.valid_local_maxs = valid_pos_list 984 | return 985 | 986 | def normalize_vectors_sctransform(self, use_expanded_vectors=False, normalize_vf=True, vst_kwargs={}): 987 | """ 988 | Normalize and regularize vectors using SCtransform 989 | 990 | :param use_expanded_vectors: If True, use averaged vectors nearby local maxima 991 | of the vector field. 992 | :type use_expanded_vectors: bool 993 | :param normalize_vf: If True, the vector field is also normalized 994 | using the same parameters used to normalize the local maxima. 995 | :type normalize_vf: bool 996 | :param vst_kwargs: Optional keywords arguments for sctransform's vst function. 
    def normalize_vectors(self, use_expanded_vectors=False, normalize_gene=False, normalize_vector=False,
                          normalize_median=False, size_after_normalization=1e4, log_transform=False, scale=False):
        """
        Normalize and regularize vectors.

        :param use_expanded_vectors: If True, use averaged vectors near local maxima of the vector field.
        :type use_expanded_vectors: bool
        :param normalize_gene: If True, normalize vectors by the sum of each gene's expression across all vectors.
        :type normalize_gene: bool
        :param normalize_vector: If True, normalize vectors by the sum of all gene expression of each vector.
        :type normalize_vector: bool
        :param normalize_median: If True, normalize each vector by its total count scaled by its median.
        :type normalize_median: bool
        :param size_after_normalization: Target total count that the vectors are scaled to.
        :type size_after_normalization: float
        :param log_transform: If True, vectors are log-transformed.
        :type log_transform: bool
        :param scale: If True, vectors are z-scaled (mean centered and scaled by stdev).
        :type scale: bool
        """
        if use_expanded_vectors:
            vec = np.array(self.dataset.expanded_vectors, copy=True)
        else:
            vec = np.array(self.dataset.vf[self.dataset.local_maxs], copy=True)
        if normalize_gene:
            vec = preprocessing.normalize(vec, norm="l1", axis=0) * size_after_normalization  # Normalize per gene
        if normalize_vector:
            vec = preprocessing.normalize(vec, norm="l1", axis=1) * size_after_normalization  # Normalize per vector
        if normalize_median:
            def n(v):
                s, m = np.sum(v, axis=1), np.median(v, axis=1)
                s[m > 0] = s[m > 0] / m[m > 0]
                s[m == 0] = 0
                v[s > 0] = v[s > 0] / s[s > 0][:, np.newaxis]
                v[v == 0] = 0
                return v
            vec = n(vec)
        if log_transform:
            vec = np.log2(vec + 1)
        if scale:
            vec = preprocessing.scale(vec)
        self.dataset.normalized_vectors = vec
        return

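    # Worked note on `normalize_median` (illustrative): each vector v is divided by
    # s = sum(v) / median(v), so a vector with counts [2, 4, 6] (sum 12, median 4)
    # is divided by 3, giving [2/3, 4/3, 2]. Vectors with median 0 are left unchanged.
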
    def __correct_cluster_labels(self, cluster_labels, centroid_correction_threshold):
        new_labels = np.array(cluster_labels, copy=True)
        if centroid_correction_threshold < 1.0:
            for cidx in np.unique(cluster_labels):
                if cidx == -1:
                    continue
                prev_midx = -1
                while True:
                    vecs = self.dataset.normalized_vectors[new_labels == cidx]
                    vindices = np.where(new_labels == cidx)[0]
                    # The medoid is the vector with the smallest total distance to all others in the cluster.
                    midx = vindices[np.argmin(np.sum(cdist(vecs, vecs), axis=0))]
                    if midx == prev_midx:
                        break
                    prev_midx = midx
                    m = self.dataset.normalized_vectors[midx]
                    # Unassign vectors that correlate poorly with the medoid, then iterate until stable.
                    for vidx, v in zip(vindices, vecs):
                        if corr(v, m) < centroid_correction_threshold:
                            new_labels[vidx] = -1
        return new_labels

    def __calc_centroid(self, cluster_labels):
        centroids = []
        centroids_stdev = []
        #medoids = []
        for lbl in sorted(list(set(cluster_labels))):
            if lbl == -1:
                continue
            cl_vecs = self.dataset.normalized_vectors[cluster_labels == lbl, :]
            #cl_dists = scipy.spatial.distance.cdist(cl_vecs, cl_vecs, metric)
            #medoid = cl_vecs[np.argmin(np.sum(cl_dists, axis=0))]
            centroid = np.mean(cl_vecs, axis=0)
            centroid_stdev = np.std(cl_vecs, axis=0)
            #medoids.append(medoid)
            centroids.append(centroid)
            centroids_stdev.append(centroid_stdev)
        return centroids, centroids_stdev  #, medoids

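    # Worked note on centroid correction (illustrative): with
    # centroid_correction_threshold=0.8, the medoid of each cluster is found, every
    # member vector with a Pearson correlation to the medoid below 0.8 is unassigned
    # (-1), and the medoid search repeats on the remaining members until it stops moving.
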
    def cluster_vectors(self, pca_dims=10, min_cluster_size=0, resolution=0.6, prune=1.0/15.0, snn_neighbors=30,
                        max_correlation=1.0, metric="correlation", subclustering=False, dbscan_eps=0.4,
                        centroid_correction_threshold=0.8, random_state=0):
        """
        Cluster the given vectors using the specified clustering method.

        :param pca_dims: Number of principal components used for clustering.
        :type pca_dims: int
        :param min_cluster_size: Set the minimum cluster size.
        :type min_cluster_size: int
        :param resolution: Resolution for Louvain community detection.
        :type resolution: float
        :param prune: Threshold for the Jaccard index (weight of the SNN network).
            Edges with a weight smaller than this value are set to zero.
        :type prune: float
        :param snn_neighbors: Number of neighbors for the SNN network.
        :type snn_neighbors: int
        :param max_correlation: Clusters with a correlation higher than this value will be merged.
        :type max_correlation: float
        :param metric: Metric for the distance between vectors in gene expression space.
        :type metric: str
        :param subclustering: If True, each cluster is clustered once again with the DBSCAN algorithm to find subclusters.
        :type subclustering: bool
        :param dbscan_eps: The eps parameter of DBSCAN, used when `subclustering` is True.
        :type dbscan_eps: float
        :param centroid_correction_threshold: The centroid will be recalculated using only the vectors
            whose correlation to the cluster medoid is equal to or higher than this value.
        :type centroid_correction_threshold: float
        :param random_state: Random seed or scikit-learn's random state object to replicate the same result.
        :type random_state: int or random state object
        """

        vecs_normalized = self.dataset.normalized_vectors
        vecs_normalized_dimreduced = PCA(n_components=pca_dims, random_state=random_state).fit_transform(vecs_normalized)

        def cluster_vecs(vecs):
            k = min(snn_neighbors, vecs.shape[0])
            knn_graph = kneighbors_graph(vecs, k, mode='connectivity', include_self=True, metric=metric).todense()
            intersections = np.dot(knn_graph, knn_graph.T)
            # Jaccard index of the two kNN sets: |A n B| / (2k - |A n B|), borrowed from Seurat
            snn_graph = intersections / (k + (k - intersections))
            snn_graph[snn_graph < prune] = 0
            G = nx.from_numpy_matrix(snn_graph)
            partition = community.best_partition(G, resolution=resolution, random_state=random_state)
            lbls = np.array(list(partition.values()))
            return lbls

        def remove_small_clusters(lbls, lbls2=None):
            small_clusters = []
            cluster_indices = []
            lbls = np.array(lbls)
            for lbl in np.unique(lbls):
                if lbl == -1:
                    continue
                cnt = np.sum(lbls == lbl)
                if cnt < min_cluster_size:
                    small_clusters.append(lbl)
                else:
                    cluster_indices.append(lbl)
            for lbl in small_clusters:
                lbls[lbls == lbl] = -1
            tmp = np.array(lbls, copy=True)
            for i, idx in enumerate(cluster_indices):
                lbls[tmp == idx] = i
            if lbls2 is not None:
                for lbl in small_clusters:
                    lbls2[lbls2 == lbl] = -1
                tmp = np.array(lbls2, copy=True)
                for i, idx in enumerate(cluster_indices):
                    lbls2[tmp == idx] = i
                return lbls, lbls2
            else:
                return lbls

        if subclustering:
            super_lbls = cluster_vecs(vecs_normalized_dimreduced)
            dbscan = DBSCAN(eps=dbscan_eps, min_samples=min_cluster_size, metric=metric)
            all_lbls = np.zeros_like(super_lbls)
            global_lbl_idx = 0
            for super_lbl in set(list(super_lbls)):
                super_lbl_idx = np.where(super_lbls == super_lbl)[0]
                if super_lbl == -1:
                    all_lbls[super_lbl_idx] = -1
                    continue
                sub_lbls = dbscan.fit(vecs_normalized_dimreduced[super_lbl_idx]).labels_
                for sub_lbl in set(list(sub_lbls)):
                    if sub_lbl == -1:
                        all_lbls[super_lbl_idx[sub_lbls == sub_lbl]] = -1
                        continue
                    all_lbls[super_lbl_idx[sub_lbls == sub_lbl]] = global_lbl_idx
                    global_lbl_idx += 1
        else:
            all_lbls = cluster_vecs(vecs_normalized_dimreduced)

        new_labels = self.__correct_cluster_labels(all_lbls, centroid_correction_threshold)
        new_labels, all_lbls = remove_small_clusters(new_labels, all_lbls)
        centroids, centroids_stdev = self.__calc_centroid(new_labels)

        # Merge clusters whose centroids correlate above max_correlation, using hierarchical clustering.
        merge_candidates = []
        if max_correlation < 1.0:
            Z = scipy.cluster.hierarchy.linkage(centroids, metric='correlation')
            clbls = scipy.cluster.hierarchy.fcluster(Z, 1 - max_correlation, 'distance')
            for i in set(clbls):
                leaf_indices = np.where(clbls == i)[0]
                if len(leaf_indices) > 1:
                    merge_candidates.append(leaf_indices)
            removed_indices = []
            for cand in merge_candidates:
                for i in cand[1:]:
                    all_lbls[all_lbls == i] = cand[0]
                    removed_indices.append(i)
            for i in sorted(removed_indices, reverse=True):
                all_lbls[all_lbls > i] -= 1

            new_labels = self.__correct_cluster_labels(all_lbls, centroid_correction_threshold)
            new_labels, all_lbls = remove_small_clusters(new_labels, all_lbls)
            centroids, centroids_stdev = self.__calc_centroid(new_labels)

        self.dataset.cluster_labels = all_lbls
        self.dataset.filtered_cluster_labels = new_labels
        self.dataset.centroids = np.array(centroids)
        self.dataset.centroids_stdev = np.array(centroids_stdev)
        #self.dataset.medoids = np.array(medoids)

        self.__m__("Found %d clusters" % len(centroids))
        return

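    # Illustrative sketch (parameter values are examples): de novo cell-type
    # clustering on the normalized local-max vectors, assuming `analysis` is an
    # instance of this class.
    #
    #   analysis.cluster_vectors(pca_dims=11, resolution=0.6, min_cluster_size=0,
    #                            max_correlation=0.9, metric="correlation")
    #   # cluster labels:    analysis.dataset.cluster_labels
    #   # cluster centroids: analysis.dataset.centroids
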
    def rescue_cluster(self, gene_names, expression_thresholds=[]):
        """
        Manually create a new cluster from all local-max vectors in which the given
        marker genes are expressed above the given thresholds.

        :param gene_names: Names of the marker genes.
        :type gene_names: list(str)
        :param expression_thresholds: Expression thresholds for the marker genes.
            If empty, the thresholds are determined automatically with Otsu's method.
        :type expression_thresholds: list(float)
        """
        assert len(gene_names) > 0
        assert len(expression_thresholds) == 0 or len(gene_names) == len(expression_thresholds)

        expression_thresholds = list(expression_thresholds)
        lm_vectors = self.dataset.vf[self.dataset.local_maxs[0], self.dataset.local_maxs[1], self.dataset.local_maxs[2], :]
        lm_mask = np.ones(len(lm_vectors), dtype=bool)
        for i in range(len(gene_names)):
            rg_idx = self.dataset.genes.index(gene_names[i])
            if len(expression_thresholds) == 0:
                expression_threshold = filters.threshold_otsu(self.dataset.vf[..., rg_idx])
            else:
                expression_threshold = float(expression_thresholds[i])
            lm_mask = np.logical_and(lm_mask, lm_vectors[:, rg_idx] > expression_threshold)

        rg_vectors = lm_vectors[lm_mask]
        rg_centroid = np.mean(rg_vectors, axis=0)
        rg_centroid_stdev = np.std(rg_vectors, axis=0)

        self.dataset.cluster_labels[lm_mask] = len(self.dataset.centroids)
        self.dataset.filtered_cluster_labels[lm_mask] = len(self.dataset.centroids)
        self.dataset.centroids = np.append(self.dataset.centroids, [rg_centroid], axis=0)
        self.dataset.centroids_stdev = np.append(self.dataset.centroids_stdev, [rg_centroid_stdev], axis=0)

    def exclude_and_merge_clusters(self, exclude=[], merge=[], centroid_correction_threshold=0.8):
        """
        Exclude bad clusters (including the vectors in the clusters), and merge similar clusters for the downstream analysis.

        :param exclude: List of cluster indices to be excluded.
        :type exclude: list(int)
        :param merge: List of lists of cluster indices to be merged.
        :type merge: list(list(int))
        :param centroid_correction_threshold: The centroid will be recalculated using only the vectors
            whose correlation to the cluster medoid is equal to or higher than this value.
        :type centroid_correction_threshold: float
        """
        exclude = list(exclude)
        merge = np.array(merge)
        for centroids in merge:
            centroids = np.unique(centroids)
            for centroid in centroids[1:][::-1]:
                self.dataset.cluster_labels[self.dataset.cluster_labels == centroid] = centroids[0]
                exclude.append(centroid)
        exclude = sorted(exclude)

        mask = np.ones(len(self.dataset.centroids), dtype=bool)
        mask[exclude] = False

        #self.dataset.centroids = self.dataset.centroids[mask]
        #self.dataset.centroids_stdev = self.dataset.centroids_stdev[mask]
        #self.dataset.medoids = self.dataset.medoids[mask]

        mask = np.ones(len(self.dataset.cluster_labels), dtype=bool)
        for centroid in exclude:
            # There will be no vectors for already merged centroids - so there is no problem
            mask[np.array(self.dataset.cluster_labels) == centroid] = False
        self.dataset.cluster_labels = self.dataset.cluster_labels[mask]
        self.dataset.local_maxs = tuple([lm[mask] for lm in self.dataset.local_maxs])

        for centroid in exclude[::-1]:
            self.dataset.cluster_labels[self.dataset.cluster_labels > centroid] -= 1
        self.dataset.normalized_vectors = self.dataset.normalized_vectors[mask, :]

        new_labels = self.__correct_cluster_labels(self.dataset.cluster_labels, centroid_correction_threshold)
        centroids, centroids_stdev = self.__calc_centroid(new_labels)

        self.dataset.centroids = centroids
        self.dataset.centroids_stdev = centroids_stdev
        self.dataset.filtered_cluster_labels = new_labels

        return

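    # Illustrative sketch: after inspecting the diagnostic plots, drop cluster 3
    # and merge clusters 1 and 5 (the indices are examples; `analysis` is assumed
    # to be an instance of this class):
    #
    #   analysis.exclude_and_merge_clusters(exclude=[3], merge=[[1, 5]])
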
    def map_celltypes(self, centroids=None):
        """
        Create correlation maps between the centroids and the vector field.
        Each correlation map corresponds to a cell type map.

        :param centroids: If given, map cell types with the given cluster centroids.
        :type centroids: list(np.array(float))
        """

        if self.dataset.normalized_vf is None:
            normalized_vf = self.dataset.vf
        else:
            normalized_vf = self.dataset.normalized_vf

        if centroids is None:
            centroids = self.dataset.centroids
        else:
            self.dataset.centroids = centroids

        max_corr = np.zeros_like(self.dataset.vf_norm) - 1  # correlation ranges from -1 to +1
        max_corr_idx = np.zeros_like(self.dataset.vf_norm, dtype=int) - 1  # -1 for background
        for cidx, centroid in enumerate(centroids):
            ctmap = calc_ctmap(centroid, normalized_vf, self.ncores)
            ctmap = np.nan_to_num(ctmap)
            # Assign each pixel to the cell type with the highest correlation so far.
            mask = max_corr < ctmap
            max_corr[mask] = ctmap[mask]
            max_corr_idx[mask] = cidx
        self.dataset.max_correlations = max_corr
        self.dataset.celltype_maps = max_corr_idx
        return

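    # Illustrative sketch: generate and post-filter the cell-type maps
    # (`analysis` and the threshold values are assumptions for illustration):
    #
    #   analysis.map_celltypes()
    #   analysis.filter_celltypemaps(min_r=0.6, min_norm="otsu",
    #                                fill_blobs=True, min_blob_area=50)
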
    def filter_celltypemaps(self, min_r=0.6, min_norm=0.1, fill_blobs=True, min_blob_area=0, filter_params={}, output_mask=None):
        """
        Post-filter cell type maps created by `map_celltypes`.

        :param min_r: Minimum threshold of the correlation.
        :type min_r: float
        :param min_norm: Minimum threshold of the vector norm.
            If a string is given instead, the threshold is automatically determined using
            scikit-image's threshold filter functions (the functions starting with `threshold_`).
        :type min_norm: str or float
        :param fill_blobs: If True, the algorithm automatically fills holes in each blob.
        :type fill_blobs: bool
        :param min_blob_area: Blobs with an area less than this value will be removed.
        :type min_blob_area: int
        :param filter_params: Filter parameters passed to the scikit-image threshold filter functions.
            Not used when `min_norm` is a float.
        :type filter_params: dict
        :param output_mask: If given, the cell type maps will be filtered using the output mask.
        :type output_mask: np.ndarray(bool)
        """

        if isinstance(min_norm, str):
            # The filter_params dict is passed as keyword arguments to the threshold_* functions.
            # Some functions don't support the 'offset' parameter, so temporarily remove it here.
            filter_offset = filter_params.pop('offset', 0)

        filtered_ctmaps = np.zeros_like(self.dataset.celltype_maps) - 1
        mask = np.zeros_like(self.dataset.vf_norm, dtype=bool)
        for cidx in range(len(self.dataset.centroids)):
            ctcorr = self.dataset.get_celltype_correlation(cidx)
            if isinstance(min_norm, str):
                for z in range(self.dataset.shape[2]):
                    if min_norm in ["local", "niblack", "sauvola", "localotsu"]:
                        im = np.zeros(self.dataset.vf_norm.shape[:-1])
                        im[ctcorr[..., z] > min_r] = self.dataset.vf_norm[..., z][ctcorr[..., z] > min_r]
                    if min_norm == "localotsu":
                        max_norm = np.max(im)
                        im /= max_norm
                        selem = disk(filter_params['radius'])
                        min_norm_cut = filters.rank.otsu(im, selem) * max_norm
                    else:
                        filter_func = getattr(filters, "threshold_" + min_norm)
                        if min_norm in ["local", "niblack", "sauvola"]:
                            min_norm_cut = filter_func(im, **filter_params)
                        else:
                            highr_norm = self.dataset.vf_norm[..., z][ctcorr[..., z] > min_r]
                            #sigma = np.std(highr_norm)
                            if len(highr_norm) == 0 or np.max(highr_norm) == np.min(highr_norm):
                                min_norm_cut = np.max(self.dataset.vf_norm)
                            else:
                                min_norm_cut = filter_func(highr_norm, **filter_params)
                    min_norm_cut += filter_offset  # manually apply the filter offset
                    mask[..., z][np.logical_and(self.dataset.vf_norm[..., z] > min_norm_cut, ctcorr[..., z] > min_r)] = 1
            else:
                mask[np.logical_and(self.dataset.vf_norm > min_norm, ctcorr > min_r)] = 1

            if min_blob_area > 0 or fill_blobs:
                blob_labels = measure.label(mask, background=0)
                for bp in measure.regionprops(blob_labels):
                    if min_blob_area > 0 and bp.filled_area < min_blob_area:
                        for c in bp.coords:
                            mask[c[0], c[1], c[2]] = 0  # fill with zeros
                            #mask[c[0], c[1]] = 0  # fill with zeros (2D variant)
                        continue
                    if fill_blobs and bp.area != bp.filled_area:
                        minx, miny, minz, maxx, maxy, maxz = bp.bbox
                        mask[minx:maxx, miny:maxy, minz:maxz] |= bp.filled_image
                        #minr, minc, maxr, maxc = bp.bbox
                        #mask[minr:maxr, minc:maxc] |= bp.filled_image

            filtered_ctmaps[np.logical_and(mask == 1, np.logical_or(self.dataset.celltype_maps == -1, self.dataset.celltype_maps == cidx))] = cidx

        if isinstance(min_norm, str):
            # restore the offset param
            filter_params['offset'] = filter_offset

        if output_mask is not None:
            filtered_ctmaps[~output_mask.astype(bool)] = -1
        self.dataset.filtered_celltype_maps = filtered_ctmaps

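    # Worked note (illustrative): with min_norm="otsu", the norm values of the
    # pixels whose correlation exceeds min_r are collected per z-plane and
    # skimage.filters.threshold_otsu picks the cutoff; with a float such as
    # min_norm=0.1, that fixed cutoff is used everywhere instead.
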
    def bin_celltypemaps(self, step=10, radius=100):
        """
        Sweep a sphere window along a lattice on the image, and count the number
        of pixels of each cell type in each window.

        :param step: The lattice spacing.
        :type step: int
        :param radius: The radius of the sphere window.
        :type radius: int
        """
        def make_sphere_mask(radius):
            dia = radius * 2 + 1
            X, Y, Z = np.ogrid[:dia, :dia, :dia]
            dist_from_center = np.sqrt((X - radius)**2 + (Y - radius)**2 + (Z - radius)**2)
            mask = dist_from_center <= radius
            return mask

        centers = np.array(self.dataset.vf_norm.shape) // 2
        steps = np.array(np.floor(centers / step) * 2 + np.array(self.dataset.vf_norm.shape) % 2, dtype=int)
        starts = centers - step * np.floor(centers / step)
        ends = starts + steps * step
        X, Y, Z = [np.arange(s, e, step, dtype=int) for s, e in zip(starts, ends)]

        ct_centers = np.zeros([len(X), len(Y), len(Z)], dtype=int)
        ct_counts = np.zeros([len(X), len(Y), len(Z), len(self.dataset.centroids)], dtype=int)

        ncelltypes = np.max(self.dataset.filtered_celltype_maps) + 1
        cnt_matrix = np.zeros([ncelltypes, ncelltypes])
        sphere_mask = make_sphere_mask(radius)

        for xidx, x in enumerate(X):
            for yidx, y in enumerate(Y):
                for zidx, z in enumerate(Z):
                    mask_slices = [slice(0, radius*2+1), slice(0, radius*2+1), slice(0, radius*2+1)]
                    s = [x - radius, y - radius, z - radius]
                    e = [x + radius + 1, y + radius + 1, z + radius + 1]

                    # Clip the window (and the sphere mask) at the image boundaries.
                    for ms_idx, ms in enumerate(s):
                        if ms < 0:
                            mask_slices[ms_idx] = slice(abs(ms), mask_slices[ms_idx].stop)
                            s[ms_idx] = 0
                    for me_idx, me in enumerate(e):
                        ctmap_size = self.dataset.filtered_celltype_maps.shape[me_idx]
                        if me > ctmap_size:
                            mask_slices[me_idx] = slice(mask_slices[me_idx].start, (radius * 2 + 1) + ctmap_size - me)
                            e[me_idx] = ctmap_size

                    # Shift labels by +1 so that the background (-1) lands in bincount bin 0, which is dropped.
                    w = self.dataset.filtered_celltype_maps[s[0]:e[0],
                                                            s[1]:e[1],
                                                            s[2]:e[2]][sphere_mask[tuple(mask_slices)]] + 1

                    ct_centers[xidx, yidx, zidx] = self.dataset.filtered_celltype_maps[x, y, z]
                    ct_counts[xidx, yidx, zidx] = np.bincount(np.ravel(w), minlength=len(self.dataset.centroids) + 1)[1:]

        self.dataset.celltype_binned_centers = ct_centers
        self.dataset.celltype_binned_counts = ct_counts
        return

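    # Illustrative sketch: bin the filtered cell-type map with a sphere window,
    # then cluster the per-window composition vectors into tissue domains
    # (the values are examples; `analysis` is assumed to be an instance of this class):
    #
    #   analysis.bin_celltypemaps(step=10, radius=100)
    #   analysis.find_domains(n_clusters=20, merge_thres=0.7, norm_thres=1500)
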
    def find_domains(self, centroid_indices=[], n_clusters=10, norm_thres=0, merge_thres=0.6, merge_remote=True):
        """
        Find domains in the image, using the result of `bin_celltypemaps`.

        :param centroid_indices: The indices of the centroids used to determine tissue domains.
        :type centroid_indices: list(int)
        :param n_clusters: Initial number of clusters (domains) for agglomerative clustering.
        :type n_clusters: int
        :param norm_thres: Threshold for the total number of cell-type pixels in each window.
            Windows that contain fewer cell-type pixels than this value will be ignored.
        :type norm_thres: int
        :param merge_thres: Threshold for merging domains. Domains whose centroids have a
            correlation higher than this value will be merged.
        :type merge_thres: float
        :param merge_remote: If True, allow merging domains that are not adjacent to each other.
        :type merge_remote: bool
        """
        def find_neighbors(m, l):
            # Collect the labels of all pixels in the 26-neighborhood of domain l.
            neighbors = set()
            for x, y, z in zip(*np.where(m == l)):
                for dx in (-1, 0, 1):
                    for dy in (-1, 0, 1):
                        for dz in (-1, 0, 1):
                            if dx == dy == dz == 0:
                                continue
                            neighbors.add(m[x + dx, y + dy, z + dz])
            return neighbors

        if self.dataset.celltype_binned_counts is None:
            raise AssertionError("Run 'bin_celltypemaps()' method first!")

        if len(centroid_indices) > 0:
            binned_ctmaps = self.dataset.celltype_binned_counts[..., centroid_indices]
        else:
            binned_ctmaps = self.dataset.celltype_binned_counts

        binned_ctmaps_norm = np.sum(binned_ctmaps, axis=3)

        ctvf_vecs = binned_ctmaps[binned_ctmaps_norm > norm_thres]
        ctvf_vecs_normalized = preprocessing.normalize(ctvf_vecs, norm='l1', axis=1)

        clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward', affinity='euclidean').fit(ctvf_vecs_normalized)
        labels_predicted = clustering.labels_ + 1

        layer_map = np.zeros(binned_ctmaps_norm.shape)
        layer_map[binned_ctmaps_norm > norm_thres] = labels_predicted
        layer_map = measure.label(layer_map)

        if merge_thres < 1.0:
            while True:
                uniq_labels = np.array(list(set(list(np.ravel(layer_map))) - set([0])))
                if not merge_remote:
                    layer_map_padded = np.pad(layer_map, 1, mode='constant', constant_values=0)
                    neighbors_dic = {}
                    for lbl in uniq_labels:
                        neighbors_dic[lbl] = find_neighbors(layer_map_padded, lbl)
                cluster_centroids = []
                for lbl in uniq_labels:
                    cluster_centroids.append(np.mean(binned_ctmaps[layer_map == lbl], axis=0))
                # Find the most correlated pair of domains, then merge it if above the threshold.
                max_corr = 0
                max_corr_indices = None
                for i in range(len(uniq_labels)):
                    for j in range(i + 1, len(uniq_labels)):
                        lbl_i, lbl_j = uniq_labels[i], uniq_labels[j]
                        if lbl_i == 0 or lbl_j == 0:
                            continue
                        corr_ij = corr(cluster_centroids[i], cluster_centroids[j])
                        if corr_ij > max_corr and (merge_remote or lbl_j in neighbors_dic[lbl_i]):
                            max_corr = corr_ij
                            max_corr_indices = (lbl_i, lbl_j, )
                if max_corr > merge_thres:
                    layer_map[layer_map == max_corr_indices[1]] = max_corr_indices[0]
                else:
                    break

        """
        if min_size > 0:
            labeled_layer_map = measure.label(layer_map)
            labeled_layer_map_padded = np.pad(labeled_layer_map, 1, mode='constant', constant_values=0)
            for prop in measure.regionprops(labeled_layer_map):
                if prop.area < min_size:
                    find_neighbors(layer_map_padded, )
        """

        # Re-index the remaining domain labels so they are contiguous, starting from 1.
        uniq_labels = sorted(set(list(np.ravel(layer_map))) - set([0]))
        for i, lbl in enumerate(uniq_labels, start=1):
            layer_map[layer_map == lbl] = i

        # Upsample the lattice-resolution domain map back to the full image resolution.
        resized_layer_map = zoom(layer_map, np.array(self.dataset.vf_norm.shape) / np.array(layer_map.shape), order=0) - 1
        resized_layer_map2 = np.array(resized_layer_map, copy=True)
        resized_layer_map2[self.dataset.filtered_celltype_maps == -1] = -1

        self.dataset.inferred_domains = resized_layer_map
        self.dataset.inferred_domains_cells = resized_layer_map2

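    # Illustrative sketch: clean up the inferred domains, then summarize them
    # (the domain indices are examples; `analysis` is assumed to be an instance
    # of this class):
    #
    #   analysis.exclude_and_merge_domains(exclude=[0], merge=[[2, 4]])
    #   analysis.calc_cell_type_compositions()
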
    def exclude_and_merge_domains(self, exclude=[], merge=[]):
        """
        Manually exclude or merge domains.

        :param exclude: Indices of the domains to be excluded.
        :type exclude: list(int)
        :param merge: List of lists of indices of the domains to be merged.
        :type merge: list(list(int))
        """
        for i in exclude:
            self.dataset.inferred_domains[self.dataset.inferred_domains == i] = -1
            self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == i] = -1

        for i in merge:
            for j in i[1:]:
                self.dataset.inferred_domains[self.dataset.inferred_domains == j] = i[0]
                self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == j] = i[0]

        # Re-index the remaining domains so that the labels are contiguous, starting from 0.
        uniq_indices = np.unique(self.dataset.inferred_domains_cells)
        if -1 in uniq_indices:
            uniq_indices = uniq_indices[1:]

        for new_idx, i in enumerate(uniq_indices):
            self.dataset.inferred_domains[self.dataset.inferred_domains == i] = new_idx
            self.dataset.inferred_domains_cells[self.dataset.inferred_domains_cells == i] = new_idx

    def calc_cell_type_compositions(self):
        """
        Calculate the cell type compositions in each domain.
        """
        cell_type_compositions = []
        for i in range(np.max(self.dataset.inferred_domains) + 1):
            counts = np.bincount(self.dataset.filtered_celltype_maps[self.dataset.inferred_domains == i] + 1, minlength=len(self.dataset.centroids) + 1)
            cell_type_compositions.append(counts[1:])

        masked_ctmap = self.dataset.filtered_celltype_maps[self.dataset.filtered_celltype_maps != -1]
        counts_all = np.array(np.bincount(masked_ctmap, minlength=len(self.dataset.centroids)), dtype=float)
        cell_type_compositions.append(counts_all)  # Add the composition of the whole tissue as the last entry
        cell_type_compositions = preprocessing.normalize(cell_type_compositions, axis=1, norm='l1')
        self.dataset.inferred_domains_compositions = cell_type_compositions

    def calc_spatial_relationship(self):
        """
        Calculate the spatial relationship between the cell types using the result of `bin_celltypemaps`.
        """
        if self.dataset.celltype_binned_counts is None:
            raise AssertionError("Run 'bin_celltypemaps()' method first!")

        ct_centers = self.dataset.celltype_binned_centers

        sparel = np.zeros([len(self.dataset.centroids), len(self.dataset.centroids)])
        for idx in np.unique(ct_centers):
            sparel[idx, :] = np.sum(self.dataset.celltype_binned_counts[ct_centers == idx], axis=0)

        self.dataset.spatial_relationships = preprocessing.normalize(sparel, axis=1, norm='l1')
--------------------------------------------------------------------------------