├── .gitattributes
├── .gitignore
├── LICENSE.txt
├── README.md
├── data
│   ├── biochemists-nb-coef.tsv
│   ├── biochemists-nb-predictions.tsv
│   ├── biochemists-zinb-coef.tsv
│   ├── biochemists-zinb-predictions.tsv
│   ├── biochemists.R
│   ├── biochemists.tsv
│   ├── test-biochemists-nb.py
│   ├── test-biochemists-zinb-ae.py
│   └── test-biochemists-zinb.py
├── dca
│   ├── __init__.py
│   ├── __main__.py
│   ├── api.py
│   ├── hyper.py
│   ├── io.py
│   ├── layers.py
│   ├── loss.py
│   ├── network.py
│   ├── test.py
│   ├── train.py
│   └── utils.py
├── docs
│   ├── Makefile
│   └── source
│       ├── conf.py
│       └── index.rst
├── pytest.ini
├── reproducibility
│   ├── code
│   │   ├── Figure2.ipynb
│   │   ├── Figure4.R
│   │   ├── Figure5.R
│   │   ├── Figure6.R
│   │   ├── Figure8.R
│   │   ├── Figure9.ipynb
│   │   ├── ImputeUsingDCA.sh
│   │   ├── ImputeUsingMAGIC.py
│   │   ├── ImputeUsingSAVER.R
│   │   └── ImputeUsingSCIMPUTE.R
│   └── download.sh
├── scripts
│   ├── seurat.R
│   └── simulate.R
├── setup.py
└── tutorial.ipynb
/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dist 3 | logs 4 | build 5 | *.egg-info 6 | .Rproj.user 7 | docs/build 8 | data/simulation/ 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Gokcen Eraslan 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Deep count autoencoder for denoising scRNA-seq data 2 | 3 | A deep count autoencoder network that denoises scRNA-seq data and removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account, using a zero-inflated negative binomial (ZINB) loss function. 
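For reference, the ZINB likelihood behind this loss can be written down compactly. Below is a minimal NumPy sketch of the element-wise ZINB negative log-likelihood (illustrative only; the loss actually used for training is implemented in `dca/loss.py` and may differ in details):

```
import numpy as np
from scipy.special import gammaln

def zinb_nll(x, mu, theta, pi, eps=1e-10):
    # Element-wise negative log-likelihood of ZINB(mu, theta, pi).
    # x: observed counts, mu: NB mean, theta: NB dispersion,
    # pi: dropout (zero-inflation) probability.
    # log NB(x | mu, theta) in the mean/dispersion parameterization:
    log_nb = (gammaln(x + theta) - gammaln(theta) - gammaln(x + 1.0)
              + theta * np.log(theta / (theta + mu) + eps)
              + x * np.log(mu / (theta + mu) + eps))
    # A zero can come from the dropout component (probability pi) or
    # from the NB itself; positive counts can only come from the NB.
    log_nb_zero = theta * np.log(theta / (theta + mu) + eps)  # log NB(0)
    zero_case = np.log(pi + (1.0 - pi) * np.exp(log_nb_zero) + eps)
    nonzero_case = np.log(1.0 - pi + eps) + log_nb
    return -np.where(x < 0.5, zero_case, nonzero_case)
```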
4 | 5 | See our [manuscript](https://www.nature.com/articles/s41467-018-07931-2) and [tutorial](https://nbviewer.ipython.org/github/theislab/dca/blob/master/tutorial.ipynb) for more details. 6 | 7 | ### Installation 8 | 9 | #### pip 10 | 11 | For a traditional Python installation of the count autoencoder and the required packages, use 12 | 13 | ``` 14 | $ pip install dca 15 | ``` 16 | 17 | #### conda 18 | 19 | Another way to install the count autoencoder and the required packages is to use [Conda](https://conda.io/docs/) (most easily obtained via the [Miniconda Python distribution](https://conda.io/miniconda.html)). Afterwards, run the following command. 20 | 21 | ``` 22 | $ conda install -c bioconda dca 23 | ``` 24 | 25 | ### Usage 26 | 27 | You can run the autoencoder from the command line: 28 | 29 | `dca matrix.csv results` 30 | 31 | where `matrix.csv` is a CSV/TSV-formatted raw count matrix with genes in rows and cells in columns. Cell and gene labels are mandatory. 32 | 33 | ### Results 34 | 35 | The output folder contains the main output file (representing the mean parameter of the ZINB distribution) as well as some additional matrices in TSV format; a short Python sketch for loading these files is given at the end of this README: 36 | 37 | - `mean.tsv` is the main output of the method and represents the mean parameter of the ZINB distribution. This file has the same dimensions as the input file (except that zero-expression genes or cells are excluded). It is formatted as a `gene x cell` matrix. Additionally, the `mean_norm.tsv` file contains the library size-normalized expression of each cell and gene. See the `normalize_total` function from [Scanpy](https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_total.html) for details about the default library size normalization method used in DCA. 38 | 39 | - The `pi.tsv` and `dispersion.tsv` files contain the dropout probability and dispersion estimate for each cell and gene. Matrix dimensions are the same as for `mean.tsv` and the input file. 40 | 41 | - The `reduced.tsv` file contains the hidden representation of each cell (in a 32-dimensional space by default), i.e. the activations of the bottleneck neurons. 42 | 43 | Use the `-h` option to see all available parameters and defaults. 44 | 45 | ### Hyperparameter optimization 46 | 47 | You can run the autoencoder with the `--hyper` option to perform a hyperparameter search. 
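### Loading the results in Python

The outputs described in the Results section are plain TSV matrices, so they are easy to inspect. A minimal sketch, assuming pandas is installed, `results` is the output folder from the usage example above, and the files carry gene/cell labels as the input does:

```
import pandas as pd

# Labeled gene x cell matrices, as described in the Results section
# (index_col=0 assumes the first column holds the gene labels).
mean = pd.read_csv('results/mean.tsv', sep='\t', index_col=0)
pi = pd.read_csv('results/pi.tsv', sep='\t', index_col=0)
dispersion = pd.read_csv('results/dispersion.tsv', sep='\t', index_col=0)

# Hidden representation: one 32-dimensional vector per cell by default.
reduced = pd.read_csv('results/reduced.tsv', sep='\t', index_col=0)

print(mean.shape, reduced.shape)
```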
48 | -------------------------------------------------------------------------------- /data/biochemists-nb-coef.tsv: -------------------------------------------------------------------------------- 1 | val coef 2 | 0.2561440246579784 intercept 3 | -0.2164184233058461 fem 4 | 0.15048945147514323 mar 5 | -0.17641524234590464 kid5 6 | 0.015271155545275089 phd 7 | 0.029082341647915382 ment 8 | 2.2643876948599235 theta 9 | -------------------------------------------------------------------------------- /data/biochemists-nb-predictions.tsv: -------------------------------------------------------------------------------- 1 | count 2 | 1.9130391956542903 3 | 1.2782929044159645 4 | 1.3119131406197488 5 | 1.3986188605079646 6 | 2.3469892052889474 7 | 0.9515840123801975 8 | 1.1920656151073028 9 | 1.2402772942718756 10 | 1.6506857562464894 11 | 1.232819715385078 12 | 2.0309413986296003 13 | 1.53622140026988 14 | 1.1704606628624212 15 | 1.4311328733091957 16 | 1.2430281022384175 17 | 1.12299116940048 18 | 1.5285716777907834 19 | 1.6444182751891445 20 | 1.3426067685941796 21 | 1.4433689545810553 22 | 1.4906643813523415 23 | 1.0098859412651533 24 | 1.5544832864428726 25 | 1.4773103504298528 26 | 0.9499161892728708 27 | 1.3507692350418918 28 | 1.350558801323357 29 | 1.4666465194713034 30 | 1.090802289250184 31 | 1.69775973817029 32 | 1.0792039269957219 33 | 2.023453016270353 34 | 1.553917453483086 35 | 1.3340420248011764 36 | 1.2884376881192057 37 | 1.072795558035042 38 | 1.0687076594085227 39 | 1.7921183505325313 40 | 1.3593596282612026 41 | 1.4812607664459911 42 | 1.7921183505325313 43 | 1.1557186273567475 44 | 1.5863228482269782 45 | 1.1645769831130584 46 | 1.5883686762018843 47 | 1.281765207859098 48 | 1.606236245712563 49 | 3.270696955250543 50 | 1.675663874964121 51 | 1.62050319775802 52 | 1.601413662937508 53 | 0.8758755581848123 54 | 1.5651688541739872 55 | 2.023453016270353 56 | 1.3811213317624642 57 | 1.4799468981257047 58 | 1.1582505203048081 59 | 1.359891227621199 60 | 1.375976287357663 61 | 1.3516778135768421 62 | 1.294203120133693 63 | 1.299900416675326 64 | 1.2377829072331636 65 | 1.3531559787470975 66 | 1.2944503035043777 67 | 1.432775751678407 68 | 2.1528400807185166 69 | 1.8345203662229255 70 | 1.3018870341740043 71 | 1.0566178639331978 72 | 1.4143690858831897 73 | 0.8840728133241387 74 | 1.2797090690833863 75 | 1.4620242742742222 76 | 1.3855421375184638 77 | 1.269488774699007 78 | 1.090000156097383 79 | 1.3405095860554894 80 | 1.5070329038492702 81 | 1.2638438092708362 82 | 1.1387825006673689 83 | 1.1017965140431174 84 | 2.2380888590256687 85 | 1.8494286995758793 86 | 2.699177836883008 87 | 1.1017885307505892 88 | 3.07443398911672 89 | 1.3183766679275646 90 | 2.7862907309045783 91 | 1.6507134150824274 92 | 1.3837730079424306 93 | 1.2615781559887052 94 | 1.6745852811807076 95 | 2.699177836883008 96 | 1.591661302045164 97 | 1.8840468135577844 98 | 1.7495445461388062 99 | 1.3479481861993374 100 | 1.011041277934036 101 | 2.023453016270353 102 | 1.2743090917174307 103 | 1.299900416675326 104 | 1.3214888271794254 105 | 1.4265506627771969 106 | 1.116865525389416 107 | 1.4000207507939337 108 | 1.569972773762755 109 | 1.2782929044159645 110 | 0.9983853333558727 111 | 0.9482475638299139 112 | 1.756128589424267 113 | 1.0747632979277084 114 | 1.1311961071483507 115 | 1.805493422827841 116 | 1.2250300127635236 117 | 1.4722565684306739 118 | 1.7308599799596198 119 | 1.0541568534833894 120 | 1.4799468981257047 121 | 1.4068790723226388 122 | 1.3129654808087163 123 | 1.4230148535020344 124 | 
1.7391848010061288 125 | 1.1061409853731698 126 | 1.1437601360707996 127 | 1.919306990559933 128 | 1.4064293857277659 129 | 1.6716623814650637 130 | 1.3340420248011764 131 | 1.438020552662828 132 | 1.1461099961103376 133 | 2.586804092654758 134 | 1.4562657664381922 135 | 1.2160919941663049 136 | 1.3963218428068862 137 | 2.709667619531886 138 | 1.609262566196966 139 | 1.2347622174202633 140 | 0.9513679278806935 141 | 1.1617827924083581 142 | 1.0533522518058107 143 | 1.2979262385950383 144 | 1.3422159115789083 145 | 1.1645769831130584 146 | 1.1370447770446959 147 | 1.035486037662375 148 | 1.5506945096538096 149 | 1.1370447770446959 150 | 1.6754959993528413 151 | 1.1454385625445833 152 | 1.4148011376036695 153 | 1.8112669987448937 154 | 1.4434658776015743 155 | 1.305550189160803 156 | 1.432629159082089 157 | 1.5058826374552357 158 | 1.291191279242996 159 | 1.2795136578284365 160 | 1.496766122948153 161 | 1.9797443348852555 162 | 1.3728126410218597 163 | 1.1343669888721053 164 | 1.4223869040705455 165 | 1.3958954393725276 166 | 1.8552717812142536 167 | 1.2702290005882346 168 | 1.73103339959018 169 | 1.4815864691092595 170 | 1.571194404199422 171 | 1.5685349160612598 172 | 1.356259171878278 173 | 1.3295102630895161 174 | 1.1132584490019601 175 | 1.4296451282982567 176 | 1.1588136364558406 177 | 1.166771869342331 178 | 1.655569763764054 179 | 1.2848610321322673 180 | 1.08459827352548 181 | 1.299900416675326 182 | 1.0539958839953787 183 | 2.262639335473195 184 | 1.747408447917363 185 | 1.84278084358781 186 | 1.546614802720863 187 | 1.6830164208804672 188 | 1.2192103036891195 189 | 1.5749554053823576 190 | 1.0525916105782345 191 | 1.3182774884328046 192 | 1.0693606742472472 193 | 1.28892316901959 194 | 1.0810183247897833 195 | 1.597430007039541 196 | 1.6222196458051712 197 | 1.6816726796459187 198 | 1.2654371983630648 199 | 0.8923467892223494 200 | 2.1209970081347924 201 | 0.8663640031857405 202 | 1.5936287891007983 203 | 1.5100277156326696 204 | 1.1874766643260461 205 | 1.222939747008075 206 | 1.3241286018856473 207 | 0.903790003843517 208 | 1.373441717573077 209 | 1.379411944758564 210 | 1.1327401896608185 211 | 1.2544699291902102 212 | 1.1432681866726222 213 | 1.3877252778336102 214 | 1.3515904238174947 215 | 1.1644121631447761 216 | 1.3153737444661586 217 | 1.7921183505325313 218 | 1.7266359862465284 219 | 1.9091169068903897 220 | 1.501071967601107 221 | 1.9616813397384274 222 | 1.7026929762159284 223 | 1.472654497379225 224 | 1.5969194138588296 225 | 1.396281515401538 226 | 1.3517102551089963 227 | 1.7429017086027023 228 | 1.379411944758564 229 | 1.1373157422679463 230 | 1.4847574325159862 231 | 1.7478595456922623 232 | 1.3580952904912151 233 | 2.832322364981433 234 | 1.3622461725677337 235 | 1.646076705554899 236 | 1.7391848010061288 237 | 1.5972094324766268 238 | 1.1951643500219205 239 | 2.264360655299461 240 | 1.299900416675326 241 | 1.595311459526821 242 | 1.1109759442233038 243 | 1.1185411728322991 244 | 1.5591048832897865 245 | 1.5556532996381962 246 | 1.4620242742742222 247 | 1.3516778135768421 248 | 1.2892209829560282 249 | 1.6693908333357095 250 | 1.1240960809377827 251 | 1.1173461136970986 252 | 2.308564373192447 253 | 1.6908539089563155 254 | 1.236649285226663 255 | 1.84404402118172 256 | 1.2122593901239413 257 | 1.1748175589553567 258 | 1.3295995404361716 259 | 1.5179856911754155 260 | 1.5511460818656453 261 | 1.7222407505898245 262 | 1.4967120456950531 263 | 1.4246072672273935 264 | 1.299900416675326 265 | 1.0924693453474235 266 | 1.1942052859147665 267 | 1.6994614659425926 268 | 
1.111516080553065 269 | 3.8172842445514825 270 | 1.2644582625768408 271 | 1.532976504690712 272 | 1.2581150804243435 273 | 1.73365355983202 274 | 1.3626809208662145 275 | 1.5559130690546394 276 | 1.6807288385485204 277 | 1.7537416255001923 278 | 1.1700031144007204 279 | 1.9616813397384274 280 | 0.9134352145463237 281 | 1.5067812897959885 282 | 1.30412293226417 283 | 1.5506945096538096 284 | 1.3160608037741297 285 | 1.2800510061707886 286 | 2.106547548193457 287 | 1.1370447770446959 288 | 1.7393389962099362 289 | 2.066958103460676 290 | 1.5598193262470104 291 | 2.27168986162654 292 | 2.012417495033639 293 | 1.7782577052264712 294 | 1.6435648710396282 295 | 1.461487825996827 296 | 1.3915983629984086 297 | 1.5886720430514658 298 | 2.4746307247566177 299 | 0.9445005646625634 300 | 1.5051676785420898 301 | 1.7764344400480543 302 | 1.6676072355518912 303 | 1.54287746806488 304 | 4.141626697136798 305 | 1.385660698934494 306 | 1.5379546574141412 307 | 1.5601958605206607 308 | 1.620216860604285 309 | 1.2099363795360183 310 | 1.9291009295014914 311 | 1.3365359631500826 312 | 1.1040724960797867 313 | 2.133774348268156 314 | 2.2079235218393802 315 | 2.1435776783456215 316 | 2.056043487566545 317 | 1.2944007748084345 318 | 2.351541954789073 319 | 1.320040070879489 320 | 1.355884162147352 321 | 1.537599385248513 322 | 1.6830456235042799 323 | 1.9827699472230462 324 | 1.209751622553321 325 | 2.0144621475335347 326 | 2.080708925096761 327 | 1.5043280897866023 328 | 1.1874766643260461 329 | 12.142858231772479 330 | 1.91418083020366 331 | 1.452257745280585 332 | 1.6423880836585882 333 | 1.2773172273979465 334 | 2.132991420382744 335 | 1.9091169068903897 336 | 1.3459427853993509 337 | 1.3203955959985 338 | 1.470855591043725 339 | 1.3837730079424306 340 | 1.4740563105808742 341 | 1.2773172273979465 342 | 1.1993695855691393 343 | 1.3007416974883115 344 | 1.4053759504632581 345 | 1.3072514650333393 346 | 1.2173838473392469 347 | 1.3415335324702822 348 | 1.477645892398769 349 | 2.151334693113531 350 | 1.1266739688653569 351 | 1.1854286407108858 352 | 1.42470293039027 353 | 1.6003600403781872 354 | 1.3512835063656823 355 | 1.389036183937616 356 | 1.6286785010399056 357 | 1.5972834586212834 358 | 1.159921399412275 359 | 1.2493081284469363 360 | 1.2562757306689412 361 | 1.589157333851994 362 | 2.3784701434714712 363 | 1.8581071640928828 364 | 2.0669730782564657 365 | 1.7856820212966644 366 | 1.0896672960104299 367 | 1.4421990577488608 368 | 1.0606595180532017 369 | 1.148770450520872 370 | 1.1696262250974012 371 | 1.5486379494446831 372 | 1.1971737091523267 373 | 1.686642552973629 374 | 1.1964760562306826 375 | 1.506958405689278 376 | 1.7872288700758432 377 | 1.4348945022623711 378 | 1.4620242742742222 379 | 1.818984761630097 380 | 1.659088830952156 381 | 2.0847102097800647 382 | 2.7200992338272054 383 | 2.121833004147098 384 | 1.3415293887662447 385 | 1.387089658993257 386 | 1.3227764620691815 387 | 1.7204665226383646 388 | 1.3153737444661586 389 | 2.1989548188461296 390 | 1.1471606218042851 391 | 1.1019533965775652 392 | 1.8919660752560719 393 | 1.804660879715179 394 | 1.2315952088734266 395 | 1.9443889240337113 396 | 1.7478595456922623 397 | 0.971829780970315 398 | 1.1406970683293147 399 | 1.425775256592143 400 | 1.591661302045164 401 | 1.458045960826828 402 | 3.271024661345266 403 | 1.6488206406128503 404 | 1.8994474716611474 405 | 1.3596683800451452 406 | 1.7305294645857356 407 | 1.3739788239596982 408 | 0.9370195163360168 409 | 2.981411684593304 410 | 1.7737860828664083 411 | 1.3634983275261823 412 | 
1.1767927102294096 413 | 1.5833387188815247 414 | 1.3605863320145757 415 | 1.9259367806445027 416 | 1.081348542880117 417 | 1.2099363795360183 418 | 1.448889964854583 419 | 1.0883213496173914 420 | 1.7059620520769656 421 | 2.248908580070489 422 | 2.190763119142914 423 | 0.9901860770929424 424 | 1.1158426454235635 425 | 1.2243730141049218 426 | 1.2781294586157481 427 | 1.7276856800466787 428 | 2.2143715401730417 429 | 1.760206756626474 430 | 1.6034326426911396 431 | 1.126726187020662 432 | 1.5377026358943673 433 | 1.4722565684306739 434 | 1.7478595456922623 435 | 2.2985934657886697 436 | 1.540305089489859 437 | 1.7921183505325313 438 | 1.4537866553228214 439 | 1.811569458188886 440 | 1.6044587362194564 441 | 1.2528412091594672 442 | 2.019569305468053 443 | 1.9135503510024814 444 | 1.3662079039710568 445 | 2.0726128639789727 446 | 1.7335052230702734 447 | 1.640410297845879 448 | 1.3659117819233757 449 | 1.5070329038492702 450 | 1.786841496070003 451 | 1.241000487246354 452 | 2.1462748468066413 453 | 1.230432888052957 454 | 1.1130884547150879 455 | 2.0293623070665054 456 | 4.994087204385433 457 | 1.3461872238242758 458 | 1.4934692282860007 459 | 1.6290716546828803 460 | 2.1500276019478806 461 | 1.2678434131942082 462 | 1.1575590401550242 463 | 1.291352406656757 464 | 1.2901086947508988 465 | 1.4545504993541192 466 | 1.65074891776069 467 | 1.4422878510632526 468 | 1.845002620550053 469 | 1.4094595938898957 470 | 1.5728146955520448 471 | 1.5669043337167583 472 | 3.2403243494780356 473 | 1.8626527818067853 474 | 1.3599597526741263 475 | 1.2422690358024981 476 | 1.3915983629984086 477 | 1.4716810439816361 478 | 1.5137980843115815 479 | 1.4638315141882428 480 | 1.159921399412275 481 | 2.190763119142914 482 | 2.156083852809556 483 | 2.156083852809556 484 | 4.876951361586901 485 | 1.6327451303031313 486 | 1.5679217551403173 487 | 1.5601958605206607 488 | 1.8214605259851877 489 | 2.1394646802958226 490 | 1.608034271416974 491 | 1.3173840096099623 492 | 1.5610894572784533 493 | 1.067158344969774 494 | 1.511079683907806 495 | 2.182779630157373 496 | 1.507028248953826 497 | 1.5438422347197596 498 | 1.34342714951139 499 | 1.4967120456950531 500 | 1.8439734704585764 501 | 1.2148742804601573 502 | 1.9379314440497082 503 | 1.4246072672273935 504 | 1.3938342744069776 505 | 1.1225916630518447 506 | 1.2825484100644697 507 | 1.5506945096538096 508 | 1.830999282952867 509 | 2.9441975417989306 510 | 1.9547814408248436 511 | 1.3665529655117288 512 | 1.651805290654875 513 | 2.447353651263764 514 | 1.7158345236830304 515 | 2.3108286246795116 516 | 1.1920656151073028 517 | 1.1503182851846718 518 | 1.324411574742509 519 | 1.2813698317415987 520 | 1.571194404199422 521 | 1.6095312933704564 522 | 1.712170050958161 523 | 1.7758406629804506 524 | 2.03497732647866 525 | 1.3904829986528187 526 | 1.7338935990268316 527 | 1.270120318177618 528 | 1.3438889409262853 529 | 1.236649285226663 530 | 1.1391303639496575 531 | 1.175255343592379 532 | 1.4246072672273935 533 | 1.1696458233157454 534 | 1.3932851077699513 535 | 1.251742886957726 536 | 1.804660879715179 537 | 1.5829123946131227 538 | 1.9616813397384274 539 | 1.6276334495136737 540 | 1.2776705024753328 541 | 1.2848610321322673 542 | 2.9114040968994663 543 | 1.0928030613748911 544 | 1.472109606964229 545 | 4.061375507652374 546 | 1.7380114185637785 547 | 4.207183107911281 548 | 6.3022920627776 549 | 2.055042401923337 550 | 1.7419971944819568 551 | 1.4218729499053318 552 | 1.2743090917174307 553 | 1.137913307143268 554 | 1.3861925359492449 555 | 1.78381147899951 556 | 
1.662839276071543 557 | 1.090802289250184 558 | 1.845002620550053 559 | 1.2914751866854917 560 | 1.477645892398769 561 | 1.709205923154766 562 | 2.547655499823954 563 | 1.489163416288754 564 | 1.580979745136837 565 | 1.2266225720690511 566 | 1.7185380917819992 567 | 1.4667450055931295 568 | 1.5290604149324027 569 | 1.1987732237461235 570 | 3.536185390992804 571 | 2.0334518893397915 572 | 1.6067039861851677 573 | 1.468647263384833 574 | 1.6592828650771068 575 | 1.9429048388285797 576 | 1.5872170596463864 577 | 1.4550297413552988 578 | 1.7775879134401187 579 | 1.5161115927539688 580 | 0.9727236505503528 581 | 1.2151210511990873 582 | 1.230432888052957 583 | 1.9724531839170805 584 | 1.6191458656013205 585 | 1.677712269904247 586 | 1.5851386168588182 587 | 1.4058996453825523 588 | 1.595311459526821 589 | 1.507028248953826 590 | 1.5139529824511828 591 | 3.21833124408263 592 | 1.4391189853160273 593 | 1.1437601360707996 594 | 1.5740538714969317 595 | 1.4173961899580547 596 | 2.019569305468053 597 | 1.4347258587984166 598 | 1.0875061817174327 599 | 1.7678367736010594 600 | 1.28977573598255 601 | 2.853983742257853 602 | 1.614351271995543 603 | 1.1586331113461001 604 | 1.1951643500219205 605 | 1.6469960059434845 606 | 1.1860733314346792 607 | 3.222388750130358 608 | 1.6570275357335575 609 | 1.28079616198825 610 | 1.9054526468880446 611 | 1.5473446697110143 612 | 2.919529697438214 613 | 1.665292494582132 614 | 1.3241286018856473 615 | 1.5052687513809706 616 | 1.2634578614726677 617 | 1.3531950655274996 618 | 1.684194104700233 619 | 3.0821621473342047 620 | 1.5426638730292128 621 | 1.848694319834159 622 | 2.001738055405492 623 | 1.6046229111354398 624 | 1.8520599141065015 625 | 1.2356469390673412 626 | 2.4344636159031587 627 | 1.8046664539356445 628 | 0.9646703648968143 629 | 1.7738997079148424 630 | 2.2163690190274346 631 | 2.2615650856479466 632 | 1.855811379489538 633 | 1.200958619242105 634 | 1.2688718768561633 635 | 1.2679387983238632 636 | 1.1509512718690886 637 | 1.8346071511861994 638 | 2.5336517654188704 639 | 1.2987568277187695 640 | 1.2110455141327818 641 | 2.0461664798401777 642 | 1.3426067685941796 643 | 1.8374812522737474 644 | 2.6288501384067686 645 | 1.5135453407444897 646 | 2.340155018184363 647 | 1.3052526347687385 648 | 1.3592011615001736 649 | 1.5601958605206607 650 | 1.459222647908934 651 | 1.4388026184376503 652 | 1.2480323918900023 653 | 2.1711440081336457 654 | 1.2813555142604682 655 | 1.3053683530236413 656 | 1.1351762677685773 657 | 1.7796160314231309 658 | 1.412406363975991 659 | 1.6435648710396282 660 | 2.3142458164432926 661 | 1.4184073872475786 662 | 1.3317358369857621 663 | 1.5604341384235236 664 | 1.7498410617420639 665 | 1.199675504589035 666 | 1.1597850714857532 667 | 2.340155018184363 668 | 1.5969194138588296 669 | 1.2362885337769147 670 | 6.85517513290001 671 | 1.7593497104069002 672 | 1.4472896666035948 673 | 1.2160919941663049 674 | 1.2272427114465063 675 | 1.490659777015688 676 | 1.758450877017632 677 | 1.2926590693109838 678 | 2.0336759049974873 679 | 3.679099469385176 680 | 1.6163016694314787 681 | 1.5426638730292128 682 | 3.1388054124308353 683 | 3.1378916382098354 684 | 1.3599825401147634 685 | 1.6073027092090386 686 | 1.3780637673334344 687 | 1.537599385248513 688 | 1.4612087510880485 689 | 1.0630107569677125 690 | 0.9460779264066411 691 | 1.8011724903302118 692 | 1.5157867644494982 693 | 1.1985767788380866 694 | 2.1770059270968964 695 | 2.6960466534310816 696 | 1.3493898910242819 697 | 1.1471606218042851 698 | 1.3320342001938383 699 | 1.6064765929738287 
700 | 1.7921183505325313 701 | 1.5749554053823576 702 | 1.1018520376897647 703 | 1.7495445461388062 704 | 2.7862222000555597 705 | 1.8207218688668838 706 | 1.9486752269414889 707 | 1.1686745957357658 708 | 1.5151094133672105 709 | 1.9135503510024814 710 | 3.300676775076133 711 | 2.214118123529967 712 | 1.2148742804601573 713 | 1.3632027927722234 714 | 4.720955250489674 715 | 1.9827699472230462 716 | 1.9135503510024814 717 | 1.7478595456922623 718 | 1.2914751866854917 719 | 1.4860616354149334 720 | 1.6347752455810902 721 | 1.485961852257869 722 | 1.86148811554261 723 | 2.173382996416875 724 | 1.126726187020662 725 | 1.5232292121615698 726 | 2.3311804946679517 727 | 2.2687151094464313 728 | 1.5581528016877275 729 | 1.3716397961165223 730 | 1.6891608002195535 731 | 2.117190247090065 732 | 1.5249718273342485 733 | 1.1862876041095358 734 | 1.4535890130529665 735 | 1.9767233466846406 736 | 1.672875267920636 737 | 2.866760710836066 738 | 1.3506645789244776 739 | 1.2147499821707957 740 | 1.231502557817369 741 | 1.7531810697883528 742 | 2.3784701434714712 743 | 4.523958062021655 744 | 1.7023678363569763 745 | 3.192997850429486 746 | 2.2036925546385135 747 | 1.2416525298356502 748 | 2.335663445980759 749 | 1.6745852811807076 750 | 1.3931460267422964 751 | 3.384224982806275 752 | 2.399425155391763 753 | 1.0738888527395116 754 | 1.4906643813523415 755 | 2.480305796358033 756 | 1.4256795266176017 757 | 1.562818919848765 758 | 1.216896386455207 759 | 1.741576791010962 760 | 1.5431351097386157 761 | 1.5386594085090133 762 | 1.594398273455865 763 | 1.7360131761854907 764 | 1.8074244909024912 765 | 1.1645769831130584 766 | 1.8140351298019544 767 | 2.3792306268278907 768 | 1.7195313575456406 769 | 2.7200992338272054 770 | 1.9980954741705295 771 | 1.160906732445729 772 | 4.8417475299086785 773 | 1.9332296815983918 774 | 1.16342385271845 775 | 1.442825300718281 776 | 1.9982726242575635 777 | 1.9501778484880097 778 | 2.8214824996343046 779 | 1.2240949850223426 780 | 1.3600696602445814 781 | 1.5262562040787233 782 | 1.0083449035023344 783 | 1.2848610321322673 784 | 1.127133456613148 785 | 1.8135070248958534 786 | 2.0644494171550782 787 | 3.0403062457988406 788 | 1.6908539089563155 789 | 2.0594406998539454 790 | 1.5985890336712016 791 | 1.159921399412275 792 | 1.5387738840395555 793 | 1.1166075260836446 794 | 1.6994614659425926 795 | 1.317084859707623 796 | 5.375265829827838 797 | 1.601413662937508 798 | 1.473364692028392 799 | 2.5066516058832273 800 | 6.9109998866989875 801 | 1.4765180561000602 802 | 1.1482442799932788 803 | 1.7052532261055533 804 | 7.710388580079479 805 | 1.780690526884969 806 | 1.6794098730963125 807 | 1.7788262788501346 808 | 1.6682831736678057 809 | 2.064932799514329 810 | 1.58911824737042 811 | 2.340065478380446 812 | 1.3661203881651482 813 | 5.080921489809112 814 | 1.2160919941663049 815 | 1.1806056444153608 816 | 3.8367167286317168 817 | 2.149699293270523 818 | 1.8836057749952264 819 | 1.4458926861394328 820 | 1.5171715823082605 821 | 1.6025382413322895 822 | 2.083112587863047 823 | 3.379110228650751 824 | 1.8903602436977847 825 | 1.3855421375184638 826 | 1.9086348452083035 827 | 1.2944007748084345 828 | 1.5077779604889854 829 | 1.3778006224660475 830 | 1.3604471253852821 831 | 2.3970091499005277 832 | 1.6754318970575488 833 | 2.4106317601862775 834 | 1.2183839666542644 835 | 1.450149546384091 836 | 3.122108430662958 837 | 1.5544832864428726 838 | 1.5566038539639102 839 | 1.1920656151073028 840 | 2.1121652720127613 841 | 1.8909646605776496 842 | 2.7092622222829355 843 | 
1.4313514403163254 844 | 1.4280217905363068 845 | 1.3961620452355947 846 | 1.2023036917783498 847 | 1.7779677928034017 848 | 1.8355180646103677 849 | 1.7262405160193222 850 | 1.3069505791191476 851 | 2.088001449121803 852 | 1.9938059115742035 853 | 1.608115306277354 854 | 1.731229870179861 855 | 1.5276553109391047 856 | 2.89662306954205 857 | 1.9105751901861643 858 | 2.27168986162654 859 | 1.8726618423113022 860 | 1.6515295054192958 861 | 2.7092622222829355 862 | 2.449223059004678 863 | 1.4795681007586423 864 | 2.3491063122079994 865 | 3.8367167286317168 866 | 1.541170073306789 867 | 2.005275029434266 868 | 1.1070248477747302 869 | 1.752670687198158 870 | 1.3273827468809165 871 | 2.5392449057547775 872 | 1.2979262385950383 873 | 6.3022920627776 874 | 1.622368476175161 875 | 2.2073660099745944 876 | 1.780690526884969 877 | 1.2886810215439344 878 | 1.199919183715295 879 | 1.7047691468766337 880 | 3.5581785737312677 881 | 2.640366940963742 882 | 2.0847550542241797 883 | 2.595306208514257 884 | 1.5098938492432687 885 | 1.9222402374485936 886 | 2.3283024295382457 887 | 2.0446338725519695 888 | 1.580979745136837 889 | 2.126185751689322 890 | 1.4184586404631023 891 | 1.725111647649377 892 | 1.6219497658666162 893 | 1.3295102630895161 894 | 1.4956450242050274 895 | 1.3033265577136575 896 | 1.6331967081491723 897 | 1.599077353809558 898 | 2.4348021467611254 899 | 1.2973223514436398 900 | 1.4184586404631023 901 | 2.144554602396006 902 | 4.697044210338071 903 | 2.0641636160101284 904 | 2.3988401107969 905 | 2.3970091499005277 906 | 2.916151901986888 907 | 2.3751464073919 908 | 2.1487146680037488 909 | 2.571050866200927 910 | 5.080921489809112 911 | 2.1565735648195674 912 | 1.3512835063656823 913 | 3.719590003770812 914 | 1.4978553098718994 915 | 2.8403703867346572 916 | 5.24091538504311 917 | -------------------------------------------------------------------------------- /data/biochemists-zinb-coef.tsv: -------------------------------------------------------------------------------- 1 | count zero coef 2 | 0.4167465258789788 -0.19168829397210915 intercept 3 | -0.19550683126146232 0.6359332030001064 fem 4 | 0.09758262897011574 -1.4994684859382412 mar 5 | -0.15173245821414386 0.6284272015007232 kid5 6 | -7.001340806705136e-4 -0.03771473930388164 phd 7 | 0.024786201372630465 -0.8822932239400029 ment 8 | 2.6547660033812437 2.6547660033812437 theta 9 | -------------------------------------------------------------------------------- /data/biochemists.R: -------------------------------------------------------------------------------- 1 | library(pscl) 2 | library(readr) 3 | 4 | 5 | # Load and save biochemists data ------------------------------------------ 6 | data("bioChemists", package = "pscl") 7 | head(bioChemists) 8 | 9 | #encode design matrix 10 | design <- cbind.data.frame(art=bioChemists$art, model.matrix(art~., bioChemists)[,-1]) 11 | colnames(design) <- colnames(bioChemists) 12 | head(design) 13 | write_tsv(design, 'biochemists.tsv') 14 | 15 | 16 | # NB fit ------------------------------------------------------------------ 17 | nb <- MASS::glm.nb(art ~ ., data = bioChemists) 18 | coef.df <- rbind.data.frame(data.frame(coef(nb)), theta=nb$theta) 19 | colnames(coef.df) <- 'val' 20 | coef.df$coef <- rownames(coef.df) 21 | coef.df 22 | coef.df$coef <- c('intercept', colnames(bioChemists)[-1], 'theta') 23 | coef.df 24 | write_tsv(coef.df, 'biochemists-nb-coef.tsv') 25 | pred.nb <- predict(nb, type='response') 26 | write_tsv(data.frame(count=pred.nb), 'biochemists-nb-predictions.tsv') 27 | 28 | 29 | # 
ZINB fit ---------------------------------------------------------------- 30 | zinb <- zeroinfl(art ~ . | ., data = bioChemists, dist = "negbin") 31 | coef(zinb) 32 | coef.df <- data.frame(count=zinb$coefficients$count, 33 | zero=zinb$coefficients$zero) 34 | coef.df <- rbind(coef.df, theta=zinb$theta) 35 | coef.df$coef <- c('intercept', colnames(bioChemists)[-1], 'theta') 36 | coef.df 37 | write_tsv(coef.df, 'biochemists-zinb-coef.tsv') 38 | 39 | pred.df <- cbind.data.frame(zero=predict(zinb, type='zero'), 40 | count=predict(zinb, type='count')) 41 | 42 | write_tsv(pred.df, 'biochemists-zinb-predictions.tsv') -------------------------------------------------------------------------------- /data/biochemists.tsv: -------------------------------------------------------------------------------- 1 | art fem mar kid5 phd ment 2 | 0 0 1 0 2.5199999809265137 7 3 | 0 1 0 0 2.049999952316284 6 4 | 0 1 0 0 3.75 6 5 | 0 0 1 1 1.1799999475479126 3 6 | 0 1 0 0 3.75 26 7 | 0 1 1 2 3.5899999141693115 2 8 | 0 1 0 0 3.190000057220459 3 9 | 0 0 1 2 2.9600000381469727 4 10 | 0 0 0 0 4.619999885559082 6 11 | 0 1 1 0 1.25 0 12 | 0 0 0 0 2.9600000381469727 14 13 | 0 1 0 0 0.7549999952316284 13 14 | 0 1 1 1 3.690000057220459 3 15 | 0 1 1 0 3.4000000953674316 4 16 | 0 1 1 0 1.7899999618530273 0 17 | 0 1 0 0 3.0899999141693115 1 18 | 0 1 1 0 2 7 19 | 0 0 1 2 4.289999961853027 13 20 | 0 1 0 0 3.359999895095825 7 21 | 0 1 0 0 4.289999961853027 9 22 | 0 1 1 0 2.259999990463257 6 23 | 0 0 1 3 2.9600000381469727 3 24 | 0 0 1 1 4.289999961853027 5 25 | 0 0 1 1 2.859999895095825 4 26 | 0 0 1 3 2.759999990463257 1 27 | 0 1 1 0 1.5199999809265137 3 28 | 0 1 1 1 3.5399999618530273 8 29 | 0 0 1 1 4.289999961853027 3 30 | 0 1 0 0 3.0899999141693115 0 31 | 0 0 1 0 2.319999933242798 3 32 | 0 1 0 0 2.390000104904175 0 33 | 0 0 1 0 4.289999961853027 8 34 | 0 1 0 0 1.5049999952316284 13 35 | 0 0 0 0 2.0999999046325684 0 36 | 0 0 1 1 1.5199999809265137 0 37 | 0 1 0 0 2 0 38 | 0 1 0 0 1.75 0 39 | 0 0 0 0 4.289999961853027 9 40 | 0 0 1 1 1.2200000286102295 2 41 | 0 1 1 0 3.75 5 42 | 0 0 0 0 4.289999961853027 9 43 | 0 1 1 1 2.859999895095825 3 44 | 0 0 0 0 3.9200000762939453 5 45 | 0 1 1 1 3.359999895095825 3 46 | 0 0 0 0 2.0999999046325684 6 47 | 0 0 1 1 1.1799999475479126 0 48 | 0 0 1 0 2.5 1 49 | 0 0 1 0 3.359999895095825 25 50 | 0 1 0 0 4.539999961853027 14 51 | 0 1 1 0 3.9200000762939453 8 52 | 0 0 0 0 4.539999961853027 5 53 | 0 1 1 2 1.9700000286102295 0 54 | 0 0 1 2 2.9600000381469727 12 55 | 0 0 1 0 4.289999961853027 8 56 | 0 0 1 1 2.259999990463257 2 57 | 0 1 0 0 2.119999885559082 11 58 | 0 1 0 0 3.2100000381469727 2 59 | 0 0 1 1 3.1500000953674316 1 60 | 0 0 1 1 3.9200000762939453 1 61 | 0 0 0 0 2.9600000381469727 0 62 | 0 1 0 0 2.859999895095825 6 63 | 0 0 1 1 2.0999999046325684 0 64 | 0 1 0 0 3.75 4 65 | 0 1 1 0 3.5399999618530273 2 66 | 0 1 1 0 2.5399999618530273 1 67 | 0 0 1 1 2.759999990463257 3 68 | 0 0 1 0 4.539999961853027 10 69 | 0 0 1 0 1.6799999475479126 6 70 | 0 0 1 1 2.200000047683716 0 71 | 0 1 0 0 1.0049999952316284 0 72 | 0 0 0 0 2.119999885559082 2 73 | 0 1 1 2 2.5799999237060547 0 74 | 0 1 1 0 1.7899999618530273 1 75 | 0 0 0 0 4.289999961853027 2 76 | 0 1 1 0 1.2799999713897705 4 77 | 0 0 1 2 2.5799999237060547 5 78 | 0 0 1 2 2.119999885559082 0 79 | 0 0 1 1 2.2100000381469727 1 80 | 0 0 1 1 2.259999990463257 5 81 | 0 1 0 0 3.2100000381469727 5 82 | 0 1 0 0 2.0999999046325684 2 83 | 0 1 1 1 3.5399999618530273 1 84 | 0 0 1 1 3.4000000953674316 18 85 | 0 0 1 0 2.2100000381469727 6 86 | 0 0 1 0 
2.2100000381469727 19 87 | 0 0 1 2 0.9200000166893005 1 88 | 0 1 0 0 4.289999961853027 35 89 | 0 0 1 2 3.1500000953674316 6 90 | 0 0 1 0 4.289999961853027 19 91 | 0 0 1 1 2.509999990463257 8 92 | 0 0 1 1 4.289999961853027 1 93 | 0 1 1 0 2.759999990463257 0 94 | 0 0 1 0 1.4199999570846558 3 95 | 0 0 1 0 2.2100000381469727 19 96 | 0 0 0 0 4.139999866485596 5 97 | 0 0 1 0 1.5199999809265137 7 98 | 0 0 0 0 4.619999885559082 8 99 | 0 1 0 0 3.619999885559082 7 100 | 0 1 1 2 3.75 4 101 | 0 0 1 0 4.289999961853027 8 102 | 0 1 0 0 3.75 5 103 | 0 0 1 1 2.0999999046325684 0 104 | 0 0 1 2 1.399999976158142 7 105 | 0 1 1 0 3.190000057220459 4 106 | 0 0 1 2 1.809999942779541 1 107 | 0 0 1 1 3.1500000953674316 2 108 | 0 1 1 0 3.75 7 109 | 0 1 0 0 2.049999952316284 6 110 | 0 0 1 3 2.2100000381469727 3 111 | 0 1 1 2 3.359999895095825 2 112 | 0 1 1 0 3.4700000286102295 11 113 | 0 1 0 0 2.119999885559082 0 114 | 0 1 1 1 3.359999895095825 2 115 | 0 0 1 0 2.5399999618530273 5 116 | 0 0 1 2 2.1500000953674316 4 117 | 0 0 1 1 4.539999961853027 3 118 | 0 0 1 0 1.6799999475479126 4 119 | 0 1 1 1 2.549999952316284 0 120 | 0 1 0 0 2.119999885559082 11 121 | 0 0 1 1 3.4700000286102295 2 122 | 0 1 1 0 3.4700000286102295 1 123 | 0 1 0 0 3.359999895095825 9 124 | 0 0 1 1 2.119999885559082 10 125 | 0 1 0 0 2.0999999046325684 1 126 | 0 1 0 0 4.289999961853027 1 127 | 0 0 1 1 2.859999895095825 13 128 | 0 1 1 1 4.289999961853027 9 129 | 0 0 1 0 3.2100000381469727 2 130 | 0 0 0 0 2.0999999046325684 0 131 | 0 1 1 0 1.809999942779541 5 132 | 0 1 0 0 2.5199999809265137 2 133 | 0 0 1 1 3.359999895095825 23 134 | 0 1 1 0 4.539999961853027 4 135 | 0 1 1 1 4.289999961853027 4 136 | 0 1 0 0 2.119999885559082 9 137 | 0 0 1 2 4.619999885559082 30 138 | 0 1 1 1 3.5899999141693115 14 139 | 0 1 0 0 3.5899999141693115 4 140 | 0 0 1 3 2.859999895095825 1 141 | 0 1 0 0 1.5049999952316284 3 142 | 0 1 1 1 2.5 0 143 | 0 1 1 0 4.619999885559082 0 144 | 0 0 0 0 2.5 0 145 | 0 1 1 1 3.359999895095825 3 146 | 0 1 0 0 2 2 147 | 0 1 1 2 3.4100000858306885 5 148 | 0 0 1 0 2.0999999046325684 0 149 | 0 1 0 0 2 2 150 | 0 0 1 0 3.359999895095825 2 151 | 0 0 1 3 3.5899999141693115 7 152 | 0 0 0 0 2.140000104904175 2 153 | 0 1 1 0 3.5899999141693115 12 154 | 0 1 0 0 2.390000104904175 10 155 | 0 0 1 3 4.539999961853027 11 156 | 0 0 0 0 2.9600000381469727 2 157 | 0 0 1 1 2.2100000381469727 5 158 | 0 0 1 2 3.690000057220459 5 159 | 0 1 1 0 1.7799999713897705 1 160 | 0 1 0 0 2.859999895095825 11 161 | 0 0 1 0 2.859999895095825 8 162 | 0 1 1 0 2.5799999237060547 3 163 | 0 1 0 0 3.75 1 164 | 0 1 1 1 1.2200000286102295 11 165 | 0 1 0 0 2.0999999046325684 9 166 | 0 1 0 0 3.5899999141693115 18 167 | 0 1 0 0 3.5399999618530273 5 168 | 0 1 0 0 2.859999895095825 16 169 | 0 1 1 0 1.8600000143051147 6 170 | 0 0 1 0 2.9600000381469727 0 171 | 0 1 1 0 3.690000057220459 7 172 | 0 1 1 0 3.690000057220459 2 173 | 0 1 1 0 4.289999961853027 1 174 | 0 1 0 0 2.5199999809265137 1 175 | 0 1 0 0 1.7599999904632568 10 176 | 0 0 1 2 2.319999933242798 2 177 | 0 1 0 0 3.690000057220459 2 178 | 0 1 0 0 3.75 14 179 | 0 1 0 0 4.289999961853027 5 180 | 0 1 1 2 4.539999961853027 6 181 | 0 0 1 1 2.0999999046325684 0 182 | 0 1 1 1 2.5399999618530273 0 183 | 0 0 1 1 2.2100000381469727 19 184 | 0 0 0 0 4.539999961853027 8 185 | 0 0 1 1 2.0999999046325684 12 186 | 0 0 0 0 2.259999990463257 5 187 | 0 1 1 1 4.619999885559082 15 188 | 0 1 0 0 2.759999990463257 4 189 | 0 1 0 0 4.289999961853027 12 190 | 0 1 0 0 0.7549999952316284 0 191 | 0 1 1 0 1.8300000429153442 2 192 | 0 1 0 0 1.7899999618530273 0 
193 | 0 1 1 1 4.289999961853027 6 194 | 0 1 0 0 2.5 0 195 | 0 0 1 0 2.140000104904175 1 196 | 0 0 1 2 3.4000000953674316 13 197 | 0 1 0 0 2.869999885559082 15 198 | 0 1 1 0 2.9600000381469727 0 199 | 0 1 1 2 3.190000057220459 0 200 | 0 0 1 1 3.690000057220459 16 201 | 0 1 1 2 1.2549999952316284 0 202 | 0 0 1 2 4.139999866485596 12 203 | 0 0 1 1 2.390000104904175 5 204 | 0 0 1 2 3.9200000762939453 2 205 | 0 1 0 0 2.9600000381469727 4 206 | 0 1 1 0 2.119999885559082 2 207 | 0 1 1 2 2.119999885559082 1 208 | 0 1 1 0 2.609999895095825 3 209 | 0 0 0 0 4.289999961853027 0 210 | 0 0 1 3 2.859999895095825 7 211 | 0 1 1 0 2.390000104904175 0 212 | 0 0 1 2 3.3399999141693115 1 213 | 0 1 0 0 3.619999885559082 8 214 | 0 1 1 1 3.5899999141693115 8 215 | 0 0 1 2 4.539999961853027 1 216 | 0 1 1 0 3.5899999141693115 1 217 | 0 0 0 0 4.289999961853027 9 218 | 0 0 1 0 1.5199999809265137 4 219 | 0 0 1 0 4.289999961853027 6 220 | 0 1 1 0 4.619999885559082 5 221 | 0 0 1 1 4.289999961853027 13 222 | 0 0 1 0 2.509999990463257 3 223 | 0 0 0 0 2.859999895095825 3 224 | 0 1 1 0 2.9600000381469727 8 225 | 0 1 1 0 3.690000057220459 3 226 | 0 1 1 0 3.4700000286102295 2 227 | 0 0 1 2 4.289999961853027 15 228 | 0 0 0 0 4.289999961853027 0 229 | 0 1 0 0 3.9200000762939453 1 230 | 0 1 1 0 2 6 231 | 0 0 1 0 2.319999933242798 4 232 | 0 1 1 1 2 9 233 | 0 0 1 1 1.6799999475479126 27 234 | 0 0 0 0 3.4700000286102295 0 235 | 0 0 1 0 2.200000047683716 2 236 | 0 0 1 1 2.119999885559082 10 237 | 0 1 0 0 1.399999976158142 14 238 | 0 1 0 0 3.359999895095825 3 239 | 0 0 1 2 4.289999961853027 24 240 | 0 0 1 1 2.0999999046325684 0 241 | 0 0 0 0 4.289999961853027 5 242 | 0 1 0 0 4.289999961853027 0 243 | 0 1 0 0 2.8299999237060547 1 244 | 0 0 1 1 2.5799999237060547 6 245 | 0 1 1 0 3.1500000953674316 7 246 | 0 0 0 0 4.289999961853027 2 247 | 0 0 0 0 2.9600000381469727 0 248 | 0 0 1 2 3.5899999141693115 5 249 | 0 1 0 0 2.390000104904175 15 250 | 0 1 0 0 1.25 2 251 | 0 1 0 0 2.759999990463257 1 252 | 0 0 1 0 3.4000000953674316 13 253 | 0 0 0 0 4.289999961853027 7 254 | 0 1 0 0 3.690000057220459 4 255 | 0 1 1 0 2.859999895095825 13 256 | 0 1 0 0 4.289999961853027 3 257 | 0 1 0 0 4.139999866485596 2 258 | 0 1 1 0 2.390000104904175 2 259 | 0 0 1 2 2.859999895095825 11 260 | 0 1 1 0 2.9600000381469727 7 261 | 0 0 0 0 3.5899999141693115 8 262 | 0 0 1 1 1.809999942779541 5 263 | 0 0 1 1 4.289999961853027 2 264 | 0 0 1 1 2.0999999046325684 0 265 | 0 1 0 0 3.190000057220459 0 266 | 0 0 1 2 4.289999961853027 2 267 | 0 0 1 0 4.289999961853027 2 268 | 0 0 1 2 3.4000000953674316 0 269 | 0 0 0 0 4.289999961853027 35 270 | 0 0 1 2 2.319999933242798 5 271 | 0 0 0 0 1.6799999475479126 5 272 | 0 1 1 0 2.5799999237060547 0 273 | 0 0 1 0 3.690000057220459 3 274 | 0 0 1 2 3.4100000858306885 7 275 | 0 0 1 0 2.319999933242798 0 276 | 0 0 1 1 3.690000057220459 8 277 | 1 0 1 0 2.5399999618530273 4 278 | 1 1 1 1 1.7599999904632568 4 279 | 1 0 1 1 4.289999961853027 13 280 | 1 0 1 3 2.0999999046325684 0 281 | 1 1 1 1 3.0899999141693115 12 282 | 1 1 0 0 3.359999895095825 6 283 | 1 0 1 0 2.0999999046325684 0 284 | 1 1 1 1 3.75 7 285 | 1 1 0 0 2.140000104904175 6 286 | 1 1 0 0 4.289999961853027 22 287 | 1 1 0 0 2 2 288 | 1 0 1 0 2 4 289 | 1 0 1 1 2 16 290 | 1 0 1 1 2.609999895095825 6 291 | 1 0 1 0 4.25 12 292 | 1 0 0 0 2.359999895095825 14 293 | 1 1 1 0 4.289999961853027 11 294 | 1 0 1 0 2.0999999046325684 2 295 | 1 1 1 0 2.869999885559082 5 296 | 1 1 1 0 3.4700000286102295 3 297 | 1 0 1 0 1.7799999713897705 1 298 | 1 0 1 0 4.139999866485596 15 299 | 1 0 1 3 
4.289999961853027 0 300 | 1 0 0 0 4.289999961853027 3 301 | 1 0 0 0 1.809999942779541 10 302 | 1 1 0 0 2.319999933242798 15 303 | 1 1 1 0 2.609999895095825 7 304 | 1 0 1 0 1.6799999475479126 34 305 | 1 1 1 0 3.190000057220459 3 306 | 1 0 1 1 3.5899999141693115 5 307 | 1 0 1 0 2.5 0 308 | 1 0 0 0 3.4000000953674316 6 309 | 1 1 0 0 2.259999990463257 4 310 | 1 0 0 0 3.4000000953674316 12 311 | 1 0 1 1 3.9200000762939453 0 312 | 1 0 1 2 2.9600000381469727 0 313 | 1 0 0 0 4.289999961853027 15 314 | 1 0 1 0 4.289999961853027 11 315 | 1 1 1 1 3.319999933242798 24 316 | 1 0 0 0 1.8600000143051147 15 317 | 1 1 0 0 2.869999885559082 6 318 | 1 1 1 0 1.6399999856948853 22 319 | 1 1 0 0 2.25 7 320 | 1 1 0 0 2.0999999046325684 8 321 | 1 1 1 0 4.289999961853027 6 322 | 1 0 1 0 1.75 3 323 | 1 0 1 0 2.9600000381469727 8 324 | 1 1 0 0 2.25 4 325 | 1 0 1 2 4.25 20 326 | 1 0 1 2 2.559999942779541 22 327 | 1 1 0 0 3.190000057220459 11 328 | 1 0 1 2 3.9200000762939453 2 329 | 1 0 1 1 1.7799999713897705 77 330 | 1 1 1 0 3.4000000953674316 14 331 | 1 0 1 1 1.7400000095367432 4 332 | 1 0 0 0 4.289999961853027 6 333 | 1 1 0 0 2 6 334 | 1 1 1 0 2.869999885559082 18 335 | 1 0 1 0 4.289999961853027 6 336 | 1 1 1 0 3.190000057220459 2 337 | 1 0 1 1 1.2200000286102295 1 338 | 1 1 1 2 3.5399999618530273 17 339 | 1 0 1 1 4.289999961853027 1 340 | 1 0 1 1 4.619999885559082 3 341 | 1 1 0 0 2 6 342 | 1 1 0 0 3.5899999141693115 3 343 | 1 1 0 0 3.190000057220459 6 344 | 1 0 1 1 3.4000000953674316 2 345 | 1 1 1 0 1.2799999713897705 2 346 | 1 0 1 2 1.7400000095367432 4 347 | 1 0 1 1 2.259999990463257 1 348 | 1 1 1 0 3.5899999141693115 5 349 | 1 0 1 1 4.619999885559082 16 350 | 1 1 0 0 1.399999976158142 2 351 | 1 1 0 0 0.9200000166893005 4 352 | 1 0 1 1 2.390000104904175 3 353 | 1 0 1 0 2.259999990463257 1 354 | 1 0 1 2 2.859999895095825 7 355 | 1 0 1 2 2.759999990463257 8 356 | 1 0 1 1 1.6299999952316284 8 357 | 1 0 1 2 4.289999961853027 12 358 | 1 1 0 0 1.399999976158142 3 359 | 1 1 1 0 2.119999885559082 0 360 | 1 1 1 1 2.609999895095825 6 361 | 1 0 1 0 1.7999999523162842 1 362 | 1 1 1 0 4.289999961853027 21 363 | 1 1 0 0 3.690000057220459 18 364 | 1 1 1 0 4.619999885559082 16 365 | 1 0 0 0 2.1500000953674316 10 366 | 1 0 1 2 2.0999999046325684 0 367 | 1 1 1 0 2 5 368 | 1 1 0 0 1.2549999952316284 0 369 | 1 0 1 2 1.75 2 370 | 1 1 0 0 3.8499999046325684 2 371 | 1 0 0 0 4.25 4 372 | 1 1 0 0 3.4700000286102295 3 373 | 1 0 1 1 3.9200000762939453 8 374 | 1 0 1 2 2.509999990463257 3 375 | 1 1 0 0 1.399999976158142 12 376 | 1 0 1 1 2 11 377 | 1 1 0 0 2 10 378 | 1 0 0 0 4.289999961853027 2 379 | 1 0 0 0 3.359999895095825 10 380 | 1 0 1 0 4.619999885559082 1 381 | 1 0 1 1 2.559999942779541 16 382 | 1 0 1 0 4.619999885559082 18 383 | 1 0 1 0 3.5899999141693115 10 384 | 1 0 1 2 4.289999961853027 6 385 | 1 1 0 0 3.5899999141693115 8 386 | 1 1 0 0 4.289999961853027 6 387 | 1 0 1 0 3.190000057220459 3 388 | 1 1 1 0 3.5899999141693115 1 389 | 1 1 1 0 2.9600000381469727 19 390 | 1 1 0 0 2.5799999237060547 2 391 | 1 0 1 3 2.9600000381469727 6 392 | 1 1 1 0 4.539999961853027 13 393 | 1 0 1 1 4.539999961853027 10 394 | 1 0 1 2 2.5 4 395 | 1 0 1 0 1.6799999475479126 8 396 | 1 0 1 0 2.319999933242798 4 397 | 1 1 1 3 3.190000057220459 9 398 | 1 1 0 0 2.2100000381469727 2 399 | 1 1 1 0 1.25 5 400 | 1 0 0 0 4.139999866485596 5 401 | 1 1 1 0 4.619999885559082 4 402 | 1 1 0 0 4.539999961853027 37 403 | 1 1 1 0 3.1500000953674316 9 404 | 1 0 0 0 4.289999961853027 11 405 | 1 1 1 0 1.9500000476837158 3 406 | 1 0 0 0 2 9 407 | 1 1 1 0 4.539999961853027 2 408 
| 1 1 1 2 2.5799999237060547 2 409 | 1 1 1 0 3.8499999046325684 29 410 | 1 0 1 1 3.4100000858306885 10 411 | 1 1 1 1 2.259999990463257 9 412 | 1 1 0 0 4.25 2 413 | 1 0 1 1 3.5899999141693115 6 414 | 1 1 1 1 2.119999885559082 9 415 | 1 0 1 0 2.9600000381469727 7 416 | 1 1 0 0 2.5199999809265137 0 417 | 1 1 0 0 2.259999990463257 4 418 | 1 1 0 0 4.539999961853027 9 419 | 1 1 1 2 2.859999895095825 7 420 | 1 0 1 0 4.539999961853027 2 421 | 1 0 1 0 3.5899999141693115 12 422 | 1 1 1 0 4.619999885559082 18 423 | 1 0 1 3 1.6699999570846558 3 424 | 1 0 1 2 1.75 1 425 | 1 1 1 1 2.8299999237060547 5 426 | 1 0 1 3 3.1500000953674316 11 427 | 1 0 1 1 3.5899999141693115 9 428 | 1 1 0 0 3.75 24 429 | 1 1 0 0 2.049999952316284 17 430 | 1 0 1 0 4.289999961853027 0 431 | 1 0 1 2 4.289999961853027 0 432 | 1 1 1 0 2.390000104904175 7 433 | 1 0 1 1 4.539999961853027 3 434 | 1 0 1 0 2.319999933242798 4 435 | 1 1 0 0 4.289999961853027 25 436 | 1 0 1 1 3.690000057220459 5 437 | 1 0 0 0 4.289999961853027 9 438 | 1 0 0 0 3.9200000762939453 2 439 | 1 0 1 0 2.759999990463257 5 440 | 1 0 0 0 2.759999990463257 6 441 | 1 0 1 2 3.619999885559082 4 442 | 1 0 1 1 4.289999961853027 14 443 | 1 0 0 0 2.869999885559082 12 444 | 1 1 1 1 2.390000104904175 9 445 | 1 0 0 0 4.289999961853027 14 446 | 1 0 1 0 1.7799999713897705 4 447 | 1 0 1 1 2.0999999046325684 8 448 | 1 1 1 0 2.25 3 449 | 1 0 1 1 2.259999990463257 5 450 | 1 0 1 0 1.8600000143051147 5 451 | 1 1 0 0 3.9200000762939453 4 452 | 1 0 1 0 4.340000152587891 10 453 | 1 1 0 0 3.359999895095825 4 454 | 1 1 0 0 2.509999990463257 1 455 | 1 1 0 0 3.75 21 456 | 1 0 1 1 4.539999961853027 45 457 | 1 1 0 0 1.6299999952316284 8 458 | 1 1 0 0 4.619999885559082 10 459 | 1 0 1 0 1.5199999809265137 2 460 | 1 0 1 0 2.549999952316284 11 461 | 1 1 1 0 1.1799999475479126 1 462 | 1 1 1 2 3.0899999141693115 9 463 | 1 1 0 0 4.619999885559082 5 464 | 1 1 1 0 2.319999933242798 1 465 | 1 0 0 0 2.049999952316284 3 466 | 1 0 1 0 4.289999961853027 1 467 | 1 0 0 0 3.4000000953674316 2 468 | 1 0 0 0 4.289999961853027 10 469 | 1 0 1 1 3.5899999141693115 2 470 | 1 0 0 0 3.359999895095825 5 471 | 1 1 0 0 2.049999952316284 13 472 | 1 1 1 0 3.5899999141693115 32 473 | 1 1 0 0 3.8499999046325684 18 474 | 1 0 0 0 3.359999895095825 0 475 | 1 1 1 0 1.75 0 476 | 1 1 1 0 3.4700000286102295 3 477 | 1 0 1 1 2.609999895095825 4 478 | 1 0 0 0 2.759999990463257 4 479 | 1 0 1 2 4.289999961853027 9 480 | 1 1 0 0 1.399999976158142 3 481 | 1 1 1 0 4.619999885559082 18 482 | 1 0 1 1 2.859999895095825 17 483 | 1 0 1 1 2.859999895095825 17 484 | 1 0 1 0 2.859999895095825 39 485 | 1 0 0 0 2 7 486 | 1 1 1 0 1.7599999904632568 8 487 | 1 0 1 0 2.5 0 488 | 1 1 0 0 4.289999961853027 17 489 | 1 0 0 0 2.559999942779541 16 490 | 1 1 1 1 3.5399999618530273 14 491 | 1 1 1 0 3.690000057220459 1 492 | 1 0 0 0 2.869999885559082 5 493 | 1 1 0 0 1.6549999713897705 0 494 | 1 0 1 1 4.340000152587891 4 495 | 1 0 1 0 3.5399999618530273 11 496 | 1 0 1 2 4.289999961853027 10 497 | 1 0 1 0 1.809999942779541 0 498 | 1 1 0 0 3.4000000953674316 7 499 | 1 0 1 1 1.809999942779541 5 500 | 1 1 0 0 3.190000057220459 18 501 | 1 1 1 1 2.319999933242798 5 502 | 1 1 0 0 4.539999961853027 19 503 | 1 0 1 1 4.289999961853027 2 504 | 1 0 1 1 2.859999895095825 2 505 | 1 1 1 1 2.859999895095825 2 506 | 1 0 1 1 1.2200000286102295 0 507 | 1 0 1 0 2.0999999046325684 0 508 | 1 0 1 1 1.6799999475479126 12 509 | 1 1 0 0 3.359999895095825 34 510 | 1 1 1 0 2.869999885559082 15 511 | 1 0 1 1 3.4700000286102295 1 512 | 1 0 0 0 2.759999990463257 7 513 | 1 0 1 1 
3.5399999618530273 21 514 | 1 1 1 0 1.9500000476837158 11 515 | 1 0 1 1 3.5899999141693115 19 516 | 1 1 0 0 3.190000057220459 3 517 | 1 1 0 0 2.759999990463257 2 518 | 1 1 1 2 4.289999961853027 13 519 | 1 0 1 2 3.190000057220459 5 520 | 1 0 1 0 2.9600000381469727 0 521 | 1 0 1 1 2.759999990463257 7 522 | 1 1 1 0 1.809999942779541 11 523 | 2 0 1 0 3.359999895095825 4 524 | 2 0 0 0 3.0899999141693115 14 525 | 2 1 0 0 3.75 8 526 | 2 1 1 0 4.539999961853027 10 527 | 2 1 0 0 1.6299999952316284 6 528 | 2 1 1 0 3.0899999141693115 2 529 | 2 1 0 0 3.690000057220459 4 530 | 2 1 0 0 2.119999885559082 2 531 | 2 1 0 0 2.259999990463257 3 532 | 2 0 1 1 4.289999961853027 2 533 | 2 1 1 1 1.7400000095367432 4 534 | 2 0 1 2 2.9600000381469727 8 535 | 2 1 0 0 2.5799999237060547 5 536 | 2 0 1 1 4.539999961853027 10 537 | 2 1 0 0 4.619999885559082 12 538 | 2 0 1 1 4.289999961853027 13 539 | 2 1 0 0 4.539999961853027 13 540 | 2 1 1 0 3.5899999141693115 0 541 | 2 1 0 0 4.289999961853027 5 542 | 2 0 0 0 3.690000057220459 26 543 | 2 1 0 0 3.2100000381469727 0 544 | 2 1 1 1 3.4700000286102295 11 545 | 2 0 0 0 4.539999961853027 37 546 | 2 0 1 0 1.9500000476837158 4 547 | 2 0 1 2 2.9600000381469727 46 548 | 2 0 1 1 4.539999961853027 53 549 | 2 0 1 0 3.4000000953674316 9 550 | 2 0 1 0 2.0999999046325684 4 551 | 2 0 1 2 4.289999961853027 8 552 | 2 1 0 0 3.75 5 553 | 2 1 0 0 2.049999952316284 2 554 | 2 0 1 1 2.5 2 555 | 2 1 1 1 4.619999885559082 17 556 | 2 1 1 0 1.7999999523162842 10 557 | 2 1 0 0 3.0899999141693115 0 558 | 2 0 0 0 4.289999961853027 10 559 | 2 0 1 2 1.7999999523162842 6 560 | 2 1 1 0 3.5899999141693115 5 561 | 2 0 1 0 2.759999990463257 3 562 | 2 0 1 0 4.139999866485596 16 563 | 2 0 0 0 3.5899999141693115 3 564 | 2 1 0 0 4.539999961853027 12 565 | 2 1 1 0 0.9200000166893005 0 566 | 2 1 0 0 4.289999961853027 15 567 | 2 0 1 1 2.390000104904175 4 568 | 2 0 1 0 1.1799999475479126 0 569 | 2 0 1 2 4.539999961853027 2 570 | 2 1 1 2 3.8499999046325684 47 571 | 2 0 1 2 2.9600000381469727 21 572 | 2 1 1 0 3.359999895095825 8 573 | 2 1 1 0 3.190000057220459 5 574 | 2 1 1 1 3.690000057220459 15 575 | 2 0 1 0 1.6299999952316284 8 576 | 2 0 1 0 1.7200000286102295 1 577 | 2 1 1 0 2.5799999237060547 5 578 | 2 0 1 0 1.5199999809265137 5 579 | 2 0 0 0 2.859999895095825 4 580 | 2 1 1 2 1.2200000286102295 4 581 | 2 1 0 0 2.5399999618530273 4 582 | 2 1 0 0 3.359999895095825 4 583 | 2 0 1 2 2.869999885559082 20 584 | 2 0 1 1 3.1500000953674316 7 585 | 2 1 0 0 4.619999885559082 14 586 | 2 0 1 1 1.7599999904632568 7 587 | 2 0 1 1 1.5199999809265137 3 588 | 2 0 0 0 4.289999961853027 5 589 | 2 0 1 2 4.289999961853027 10 590 | 2 0 1 1 2.559999942779541 5 591 | 2 0 0 0 4.539999961853027 29 592 | 2 1 1 0 1.8600000143051147 5 593 | 2 1 0 0 4.289999961853027 1 594 | 2 1 1 0 3.9200000762939453 7 595 | 2 0 0 0 2.259999990463257 2 596 | 2 0 1 1 4.289999961853027 14 597 | 2 1 1 1 3.690000057220459 10 598 | 2 0 1 2 1.9700000286102295 0 599 | 2 0 1 1 3.190000057220459 10 600 | 2 1 0 0 4.539999961853027 5 601 | 2 0 0 0 4.289999961853027 25 602 | 2 0 1 0 2.8299999237060547 1 603 | 2 0 1 3 4.340000152587891 7 604 | 2 1 0 0 3.359999895095825 3 605 | 2 1 0 0 3.4100000858306885 14 606 | 2 1 0 0 2.859999895095825 3 607 | 2 0 1 0 4.289999961853027 24 608 | 2 0 1 1 2.759999990463257 8 609 | 2 1 1 0 3.75 0 610 | 2 0 1 1 4.289999961853027 12 611 | 2 0 1 2 2.2100000381469727 12 612 | 2 0 1 0 3.5399999618530273 21 613 | 2 0 1 0 2.9600000381469727 2 614 | 2 1 1 0 2.119999885559082 2 615 | 2 0 0 0 2.390000104904175 4 616 | 2 1 0 0 3.190000057220459 5 617 
| 2 1 0 0 1.9700000286102295 8 618 | 2 1 1 0 4.539999961853027 9 619 | 2 1 0 0 2.549999952316284 36 620 | 2 0 1 0 1.7599999904632568 0 621 | 2 0 1 2 4.340000152587891 17 622 | 2 1 1 0 2.5199999809265137 16 623 | 2 0 1 1 2.559999942779541 7 624 | 2 0 0 0 4.539999961853027 10 625 | 2 1 1 0 1.399999976158142 0 626 | 2 0 1 2 3.319999933242798 27 627 | 2 0 1 0 2.509999990463257 5 628 | 2 1 1 2 2.5799999237060547 3 629 | 2 0 1 2 3.5399999618530273 16 630 | 2 0 1 0 4.539999961853027 11 631 | 2 0 0 0 4.289999961853027 17 632 | 2 0 1 0 4.340000152587891 5 633 | 2 1 1 1 3.4700000286102295 4 634 | 2 1 0 0 3.4700000286102295 5 635 | 2 0 1 2 2.5 5 636 | 2 0 1 3 2 8 637 | 2 0 0 0 3.9200000762939453 10 638 | 2 1 1 0 4.619999885559082 23 639 | 2 1 0 0 3.0899999141693115 6 640 | 2 1 0 0 2.319999933242798 4 641 | 2 1 0 0 4.289999961853027 21 642 | 2 1 0 0 3.359999895095825 7 643 | 2 0 1 0 3.690000057220459 5 644 | 2 0 1 0 4.289999961853027 17 645 | 2 1 0 0 3.5899999141693115 11 646 | 2 0 1 0 4.289999961853027 13 647 | 2 1 1 1 3.2100000381469727 7 648 | 2 1 0 0 2.259999990463257 8 649 | 2 0 1 0 2.5 0 650 | 2 0 0 0 2.259999990463257 3 651 | 2 1 1 0 3.75 4 652 | 2 1 0 0 4.289999961853027 4 653 | 2 0 1 0 3.190000057220459 11 654 | 2 1 1 1 2 7 655 | 2 1 1 0 3.0899999141693115 1 656 | 2 1 1 1 3.5899999141693115 2 657 | 2 1 1 0 4.340000152587891 11 658 | 2 1 0 0 2.869999885559082 9 659 | 2 0 1 0 2.0999999046325684 2 660 | 2 1 0 0 2.8299999237060547 26 661 | 2 0 1 1 2.0999999046325684 3 662 | 2 0 1 1 1.7799999713897705 1 663 | 2 0 1 0 2.509999990463257 0 664 | 2 0 1 1 2.5199999809265137 10 665 | 2 1 1 1 3.4000000953674316 4 666 | 2 1 1 1 3.0899999141693115 3 667 | 2 0 1 0 4.289999961853027 13 668 | 2 1 1 0 2.9600000381469727 8 669 | 2 1 1 2 3.5899999141693115 11 670 | 2 0 0 0 4.539999961853027 55 671 | 2 1 1 0 3.5899999141693115 11 672 | 2 0 1 1 3.4200000762939453 3 673 | 2 1 1 1 4.289999961853027 4 674 | 2 1 0 0 3.190000057220459 4 675 | 2 1 1 1 4.289999961853027 11 676 | 2 0 1 0 4.619999885559082 3 677 | 2 0 1 2 1.8600000143051147 6 678 | 2 0 1 0 4.619999885559082 8 679 | 2 1 1 2 4.539999961853027 48 680 | 2 1 1 0 3.75 8 681 | 2 0 1 0 1.7599999904632568 0 682 | 2 1 1 0 3.4100000858306885 31 683 | 2 0 1 0 2.549999952316284 24 684 | 2 0 1 1 1.25 2 685 | 2 1 1 0 1.4800000190734863 9 686 | 2 1 1 0 2.8299999237060547 3 687 | 2 1 1 0 4.289999961853027 6 688 | 2 1 0 0 3.190000057220459 10 689 | 2 1 0 0 1.399999976158142 0 690 | 2 1 1 2 3.2100000381469727 2 691 | 2 0 0 0 4.619999885559082 9 692 | 2 1 1 0 1.4500000476837158 7 693 | 2 1 1 1 3.3399999141693115 4 694 | 2 1 0 0 4.539999961853027 23 695 | 2 0 1 2 4.289999961853027 30 696 | 2 1 0 0 3.690000057220459 7 697 | 2 1 0 0 2.5799999237060547 2 698 | 2 1 1 1 4.539999961853027 7 699 | 2 0 1 1 4.539999961853027 6 700 | 2 0 0 0 4.289999961853027 9 701 | 3 1 0 0 4.289999961853027 12 702 | 3 1 0 0 3.75 0 703 | 3 0 0 0 4.619999885559082 8 704 | 3 0 1 2 4.539999961853027 31 705 | 3 0 1 0 3.0899999141693115 5 706 | 3 0 1 1 1.9500000476837158 14 707 | 3 1 1 1 3.5899999141693115 3 708 | 3 0 1 1 2.609999895095825 5 709 | 3 0 0 0 2.869999885559082 12 710 | 3 0 0 0 4.289999961853027 30 711 | 3 1 1 0 3.4100000858306885 19 712 | 3 1 1 1 2.319999933242798 5 713 | 3 1 1 0 2.119999885559082 3 714 | 3 0 1 0 4.539999961853027 37 715 | 3 0 1 0 2.9600000381469727 8 716 | 3 0 0 0 2.869999885559082 12 717 | 3 0 1 0 2.319999933242798 4 718 | 3 0 1 2 1.7999999523162842 6 719 | 3 1 0 0 2.390000104904175 11 720 | 3 1 1 1 4.619999885559082 14 721 | 3 1 0 0 4.289999961853027 10 722 | 3 0 1 0 
4.539999961853027 5 723 | 3 0 0 0 3.5899999141693115 16 724 | 3 0 1 2 4.289999961853027 0 725 | 3 0 1 1 2.9600000381469727 5 726 | 3 0 1 2 4.289999961853027 25 727 | 3 0 1 1 4.289999961853027 18 728 | 3 0 1 1 2.5399999618530273 6 729 | 3 0 0 0 3.9200000762939453 0 730 | 3 0 0 0 2.319999933242798 8 731 | 3 1 0 0 4.619999885559082 22 732 | 3 1 1 0 3.75 6 733 | 3 0 1 2 1.9500000476837158 3 734 | 3 0 1 1 1.7999999523162842 4 735 | 3 0 1 0 2.759999990463257 8 736 | 3 0 0 0 3.5899999141693115 7 737 | 3 0 1 0 4.25 20 738 | 3 0 1 2 2.8299999237060547 7 739 | 3 1 0 0 2.5199999809265137 4 740 | 3 1 1 0 1.1799999475479126 0 741 | 3 1 1 0 3.359999895095825 11 742 | 3 1 1 0 4.289999961853027 21 743 | 3 1 1 1 4.619999885559082 49 744 | 3 0 0 0 2.8299999237060547 8 745 | 3 0 1 0 3.690000057220459 24 746 | 3 0 1 0 2.259999990463257 12 747 | 3 1 0 0 2.049999952316284 5 748 | 3 0 1 1 4.289999961853027 19 749 | 3 0 1 0 1.4199999570846558 3 750 | 3 1 1 2 1.8899999856948853 16 751 | 3 0 1 0 3.690000057220459 26 752 | 3 1 1 0 2.9600000381469727 22 753 | 3 1 1 1 1.8600000143051147 1 754 | 3 1 1 0 2.259999990463257 6 755 | 3 0 1 0 4.289999961853027 15 756 | 3 1 1 0 3.1500000953674316 4 757 | 3 0 1 0 2.609999895095825 0 758 | 3 1 0 0 4.539999961853027 3 759 | 3 0 1 1 2.2100000381469727 10 760 | 3 0 1 0 1.7799999713897705 0 761 | 3 0 1 1 3.619999885559082 5 762 | 3 0 1 0 3.9200000762939453 0 763 | 3 1 1 0 4.619999885559082 10 764 | 3 0 1 0 2.609999895095825 5 765 | 3 1 1 1 3.359999895095825 3 766 | 3 1 1 0 3.690000057220459 12 767 | 3 0 1 0 3.4700000286102295 14 768 | 3 0 1 0 1.25 4 769 | 3 0 1 0 4.619999885559082 18 770 | 3 0 1 1 3.5899999141693115 14 771 | 3 1 0 0 3.359999895095825 2 772 | 3 0 1 0 4.289999961853027 38 773 | 3 0 0 0 3.5399999618530273 12 774 | 3 0 1 2 2.5799999237060547 2 775 | 3 0 0 0 1.5199999809265137 3 776 | 3 0 1 0 3.4700000286102295 8 777 | 3 1 1 0 4.619999885559082 14 778 | 3 0 0 0 3.5399999618530273 25 779 | 3 0 1 2 2.0999999046325684 4 780 | 3 0 1 2 1.3799999952316284 8 781 | 3 0 1 1 3.0899999141693115 5 782 | 3 0 1 3 2.859999895095825 3 783 | 3 1 0 0 4.289999961853027 5 784 | 3 1 1 1 1.2200000286102295 3 785 | 4 0 1 0 2.8299999237060547 5 786 | 4 1 1 0 4.539999961853027 16 787 | 4 0 1 0 4.289999961853027 22 788 | 4 0 0 0 4.289999961853027 7 789 | 4 0 1 0 3.5399999618530273 9 790 | 4 0 0 0 2.5199999809265137 6 791 | 4 1 0 0 1.399999976158142 3 792 | 4 1 1 0 4.340000152587891 6 793 | 4 1 1 1 2.509999990463257 2 794 | 4 0 1 0 4.289999961853027 2 795 | 4 0 1 1 2.9600000381469727 0 796 | 4 0 0 0 3.8499999046325684 47 797 | 4 0 0 0 4.539999961853027 5 798 | 4 1 1 0 3.4000000953674316 5 799 | 4 0 0 0 3.4100000858306885 21 800 | 4 0 1 1 2.9600000381469727 57 801 | 4 1 1 0 3.5399999618530273 5 802 | 4 0 1 2 1.7200000286102295 2 803 | 4 0 1 2 2.859999895095825 15 804 | 4 0 1 2 4.539999961853027 66 805 | 4 0 1 1 1.7599999904632568 11 806 | 4 0 1 2 1.8600000143051147 15 807 | 4 0 1 0 3.4700000286102295 4 808 | 4 0 0 0 3.4100000858306885 7 809 | 4 0 1 0 1.809999942779541 10 810 | 4 0 1 2 2.049999952316284 13 811 | 4 0 0 0 4.619999885559082 18 812 | 4 1 1 0 2.259999990463257 3 813 | 4 0 1 1 1.8600000143051147 47 814 | 4 1 1 1 4.289999961853027 4 815 | 4 0 1 2 3.5399999618530273 2 816 | 4 0 1 0 4.289999961853027 30 817 | 4 0 1 0 2.5399999618530273 11 818 | 4 1 1 0 4.25 13 819 | 4 1 0 0 2.5 10 820 | 4 1 1 1 3.5399999618530273 12 821 | 4 1 1 0 3.190000057220459 8 822 | 4 0 1 2 4.539999961853027 21 823 | 4 1 0 0 2.859999895095825 39 824 | 4 1 1 0 2.5799999237060547 14 825 | 4 1 1 0 1.2799999713897705 4 
826 | 4 1 1 0 3.2100000381469727 14 827 | 4 1 0 0 2.869999885559082 6 828 | 4 1 0 0 3.3399999141693115 11 829 | 4 1 0 0 3.1500000953674316 8 830 | 4 1 0 0 2.319999933242798 8 831 | 4 0 0 0 4.289999961853027 19 832 | 4 0 0 0 3.690000057220459 7 833 | 4 0 1 1 2.549999952316284 21 834 | 4 1 0 0 4.619999885559082 3 835 | 4 1 1 0 2.359999895095825 5 836 | 4 0 1 1 4.25 29 837 | 4 0 1 1 4.289999961853027 5 838 | 4 1 1 0 3.190000057220459 7 839 | 4 1 0 0 3.190000057220459 3 840 | 4 1 0 0 2.559999942779541 23 841 | 4 0 1 0 1.7599999904632568 7 842 | 4 0 1 1 2.5799999237060547 25 843 | 4 1 1 0 3.4100000858306885 4 844 | 4 1 0 0 3.5899999141693115 9 845 | 4 1 1 0 1.7799999713897705 4 846 | 4 1 0 0 3.75 3 847 | 4 0 1 2 3.690000057220459 16 848 | 4 0 1 0 3.619999885559082 5 849 | 4 0 1 0 1.5049999952316284 4 850 | 4 0 1 2 2.5799999237060547 6 851 | 4 0 0 0 2.869999885559082 15 852 | 5 1 1 0 2.259999990463257 16 853 | 5 1 0 0 3.75 13 854 | 5 0 1 2 3.8499999046325684 15 855 | 5 0 1 1 3.1500000953674316 5 856 | 5 0 1 1 3.1500000953674316 27 857 | 5 0 1 0 4.340000152587891 6 858 | 5 0 1 0 4.25 12 859 | 5 0 0 0 3.359999895095825 11 860 | 5 1 0 0 3.5899999141693115 14 861 | 5 0 1 1 2.5799999237060547 25 862 | 5 0 1 1 3.5899999141693115 21 863 | 5 0 1 1 2.9600000381469727 4 864 | 5 0 1 0 4.539999961853027 13 865 | 5 0 1 0 4.289999961853027 30 866 | 5 1 0 0 2.869999885559082 12 867 | 5 1 1 0 4.539999961853027 15 868 | 5 1 1 1 3.8499999046325684 1 869 | 5 0 1 0 2.5 4 870 | 5 0 1 1 3.4700000286102295 0 871 | 5 1 1 0 2.859999895095825 24 872 | 5 1 1 0 4.619999885559082 0 873 | 5 0 1 1 4.539999961853027 53 874 | 5 0 1 0 1.25 2 875 | 5 1 1 0 3.2100000381469727 19 876 | 5 0 1 1 1.7599999904632568 11 877 | 5 1 0 0 2.5799999237060547 6 878 | 5 1 0 0 3.619999885559082 3 879 | 6 0 1 1 4.619999885559082 8 880 | 6 1 1 0 2.0999999046325684 36 881 | 6 1 1 2 1.8600000143051147 38 882 | 6 0 1 0 4.340000152587891 9 883 | 6 1 1 0 4.289999961853027 24 884 | 6 0 1 2 2.509999990463257 11 885 | 6 0 1 1 2.9600000381469727 13 886 | 6 0 0 0 4.289999961853027 18 887 | 6 0 0 0 3.4000000953674316 14 888 | 6 1 0 0 4.539999961853027 12 889 | 6 0 1 1 3.8499999046325684 16 890 | 6 1 0 0 3.1500000953674316 9 891 | 6 1 0 0 4.539999961853027 15 892 | 6 0 0 0 3.4700000286102295 6 893 | 6 1 1 0 4.289999961853027 1 894 | 6 0 0 0 1.9700000286102295 4 895 | 6 1 0 0 3.319999933242798 6 896 | 7 0 1 0 3.5899999141693115 1 897 | 7 0 0 0 2.5399999618530273 6 898 | 7 0 0 0 3.4100000858306885 20 899 | 7 0 1 1 1.9700000286102295 0 900 | 7 1 0 0 3.1500000953674316 9 901 | 7 0 0 0 4.619999885559082 15 902 | 7 0 0 0 4.539999961853027 42 903 | 7 0 1 0 3.690000057220459 9 904 | 7 0 0 0 4.340000152587891 19 905 | 7 0 0 0 4.289999961853027 19 906 | 7 0 1 1 3.5899999141693115 27 907 | 7 0 0 0 3.690000057220459 19 908 | 8 0 1 0 2.509999990463257 11 909 | 9 0 1 1 2.9600000381469727 23 910 | 9 0 1 1 1.8600000143051147 47 911 | 10 1 1 0 3.5899999141693115 18 912 | 11 0 1 2 2.859999895095825 7 913 | 12 0 1 1 4.289999961853027 35 914 | 12 0 1 1 1.8600000143051147 5 915 | 16 0 1 0 1.7400000095367432 21 916 | 19 0 1 0 1.8600000143051147 42 917 | -------------------------------------------------------------------------------- /data/test-biochemists-nb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from autoencoder.io import read_text 3 | from autoencoder.network import MLP 4 | from keras.callbacks import TensorBoard 5 | 6 | count = read_text('biochemists.tsv', header='infer') 7 | y = count[:, 0].astype(int) 8 
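# note: the first column of biochemists.tsv holds the article counts (the regression target below); the remaining columns are the covariates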
| x = count[:, 1:] 9 | 10 | net = MLP(x.shape[1], output_size=1, hidden_size=(), masking=False, loss_type='nb') 11 | net.build() 12 | model = net.model 13 | tb = TensorBoard(log_dir='./logs', histogram_freq=1) 14 | 15 | model.summary() 16 | model.compile(loss=net.loss, optimizer='Adam') 17 | model.fit(x, y, epochs=700, batch_size=32, callbacks=[tb]) 18 | 19 | 20 | print('Theta: %f' % net.extra_models['dispersion']()) 21 | -------------------------------------------------------------------------------- /data/test-biochemists-zinb-ae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from autoencoder.io import read_text, preprocess 4 | from autoencoder.api import autoencode 5 | import keras.backend as K 6 | 7 | # for full reproducibility 8 | np.random.seed(1) 9 | tf.set_random_seed(1) 10 | sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, 11 | inter_op_parallelism_threads=1)) 12 | K.set_session(sess) 13 | 14 | x = read_text('biochemists.tsv', header='infer') 15 | print(x.shape) 16 | 17 | # test API 18 | result = autoencode(x, 'test-ae', type='zinb-conddisp', hidden_size=(1,), epochs=3) 19 | -------------------------------------------------------------------------------- /data/test-biochemists-zinb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from autoencoder.io import read_text 3 | from autoencoder.network import MLP 4 | from keras.callbacks import TensorBoard 5 | 6 | count = read_text('biochemists.tsv', header='infer') 7 | y = count[:, 0].astype(int) 8 | x = count[:, 1:] 9 | 10 | net = MLP(x.shape[1], output_size=1, hidden_size=(), masking=False, loss_type='zinb') 11 | net.build() 12 | model = net.model 13 | tb = TensorBoard(log_dir='./logs', histogram_freq=1) 14 | 15 | model.summary() 16 | model.compile(loss=net.loss, optimizer='Adam') 17 | model.fit(x, y, epochs=700, batch_size=32, callbacks=[tb]) 18 | 19 | print('Theta: %f' % net.extra_models['dispersion']()) 20 | -------------------------------------------------------------------------------- /dca/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KERAS_BACKEND'] = 'tensorflow' 3 | -------------------------------------------------------------------------------- /dca/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os, sys, argparse 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Autoencoder') 20 | 21 | parser.add_argument('input', type=str, help='Input is raw count data in TSV/CSV ' 22 | 'or H5AD (anndata) format. ' 23 | 'Row/col names are mandatory. 
Note that TSV/CSV files must be in ' 24 | 'gene x cell layout where rows are genes and cols are cells (scRNA-seq ' 25 | 'convention). ' 26 | 'Use the -t/--transpose option if your count matrix is in cell x gene layout. ' 27 | 'H5AD files must be in cell x gene format (anndata and scanpy convention).') 28 | parser.add_argument('outputdir', type=str, help='The path of the output directory') 29 | 30 | # IO and norm options 31 | parser.add_argument('--normtype', type=str, default='zheng', 32 | help='Type of size factor estimation. Possible values: deseq, zheng.' 33 | ' (default: zheng)') 34 | parser.add_argument('-t', '--transpose', dest='transpose', 35 | action='store_true', help='Transpose input matrix (default: False)') 36 | parser.add_argument('--testsplit', dest='testsplit', 37 | action='store_true', help="Use one fold as a test set (default: False)") 38 | 39 | # training options 40 | parser.add_argument('--type', type=str, default='nb-conddisp', 41 | help="Type of autoencoder. Possible values: normal, poisson, nb, " 42 | "nb-shared, nb-conddisp (default), nb-fork, zinb, " 43 | "zinb-shared, zinb-conddisp, zinb-fork") 44 | parser.add_argument('--threads', type=int, default=None, 45 | help='Number of threads for training (default is all cores)') 46 | parser.add_argument('-b', '--batchsize', type=int, default=32, 47 | help="Batch size (default: 32)") 48 | parser.add_argument('--sizefactors', dest='sizefactors', 49 | action='store_true', help="Normalize means by library size (default: True)") 50 | parser.add_argument('--nosizefactors', dest='sizefactors', 51 | action='store_false', help="Do not normalize means by library size") 52 | parser.add_argument('--norminput', dest='norminput', 53 | action='store_true', help="Zero-mean normalize input (default: True)") 54 | parser.add_argument('--nonorminput', dest='norminput', 55 | action='store_false', help="Do not zero-mean normalize inputs") 56 | parser.add_argument('--loginput', dest='loginput', 57 | action='store_true', help="Log-transform input (default: True)") 58 | parser.add_argument('--nologinput', dest='loginput', 59 | action='store_false', help="Do not log-transform inputs") 60 | parser.add_argument('-d', '--dropoutrate', type=str, default='0.0', 61 | help="Dropout rate (default: 0.0)") 62 | parser.add_argument('--batchnorm', dest='batchnorm', action='store_true', 63 | help="Batchnorm (default: True)") 64 | parser.add_argument('--nobatchnorm', dest='batchnorm', action='store_false', 65 | help="Do not use batchnorm") 66 | parser.add_argument('--l2', type=float, default=0.0, 67 | help="L2 regularization coefficient (default: 0.0)") 68 | parser.add_argument('--l1', type=float, default=0.0, 69 | help="L1 regularization coefficient (default: 0.0)") 70 | parser.add_argument('--l2enc', type=float, default=0.0, 71 | help="Encoder-specific L2 regularization coefficient (default: 0.0)") 72 | parser.add_argument('--l1enc', type=float, default=0.0, 73 | help="Encoder-specific L1 regularization coefficient (default: 0.0)") 74 | parser.add_argument('--ridge', type=float, default=0.0, 75 | help="L2 regularization coefficient for dropout probabilities (default: 0.0)") 76 | parser.add_argument('--gradclip', type=float, default=5.0, 77 | help="Clip grad values (default: 5.0)") 78 | parser.add_argument('--activation', type=str, default='relu', 79 | help="Activation function of hidden units (default: relu)") 80 | parser.add_argument('--optimizer', type=str, default='RMSprop', 81 | help="Optimization method (default: RMSprop)") 82 | parser.add_argument('--init', 
type=str, default='glorot_uniform', 83 | help="Initialization method for weights (default: glorot_uniform)") 84 | parser.add_argument('-e', '--epochs', type=int, default=300, 85 | help="Maximum number of epochs to train " 86 | "(default: 300)") 87 | parser.add_argument('--earlystop', type=int, default=15, 88 | help="Stop training if validation loss does not improve " 89 | "for this many epochs (default: 15)") 90 | parser.add_argument('--reducelr', type=int, default=10, 91 | help="Reduce the learning rate if validation loss does not improve " 92 | "for this many epochs (default: 10)") 93 | parser.add_argument('-s', '--hiddensize', type=str, default='64,32,64', 94 | help="Size of hidden layers (default: 64,32,64)") 95 | parser.add_argument('--inputdropout', type=float, default=0.0, 96 | help="Input layer dropout probability (default: 0.0)") 97 | parser.add_argument('-r', '--learningrate', type=float, default=None, 98 | help="Learning rate (default: 0.001)") 99 | parser.add_argument('--saveweights', dest='saveweights', 100 | action='store_true', help="Save weights (default: False)") 101 | parser.add_argument('--no-saveweights', dest='saveweights', 102 | action='store_false', help="Do not save weights") 103 | parser.add_argument('--hyper', dest='hyper', 104 | action='store_true', help="Optimize hyperparameters (default: False)") 105 | parser.add_argument('--hypern', dest='hypern', type=int, default=1000, 106 | help="Number of samples drawn from hyperparameter distributions during optimization. " 107 | "(default: 1000)") 108 | parser.add_argument('--hyperepoch', dest='hyperepoch', type=int, default=100, 109 | help="Number of epochs used in each hyperpar optimization iteration. " 110 | "(default: 100)") 111 | parser.add_argument('--debug', dest='debug', 112 | action='store_true', help="Enable debugging. Checks whether every term in " 113 | "loss functions is finite. (default: False)") 114 | parser.add_argument('--tensorboard', dest='tensorboard', 115 | action='store_true', help="Use tensorboard for saving weight distributions and " 116 | "visualization. (default: False)") 117 | parser.add_argument('--checkcounts', dest='checkcounts', action='store_true', 118 | help="Check if the expression matrix has raw (unnormalized) counts (default: True)") 119 | parser.add_argument('--nocheckcounts', dest='checkcounts', action='store_false', 120 | help="Do not check if the expression matrix has raw (unnormalized) counts") 121 | parser.add_argument('--denoisesubset', dest='denoisesubset', type=str, 122 | help='Perform denoising only for the subset of genes ' 123 | 'in the given file. Gene names should be listed ' 124 | 'one per line.') 125 | 126 | parser.set_defaults(transpose=False, 127 | testsplit=False, 128 | saveweights=False, 129 | sizefactors=True, 130 | batchnorm=True, 131 | checkcounts=True, 132 | norminput=True, 133 | hyper=False, 134 | debug=False, 135 | tensorboard=False, 136 | loginput=True) 137 | 138 | return parser.parse_args() 139 | 140 | 141 | def main(): 142 | args = parse_args() 143 | 144 | try: 145 | import tensorflow as tf 146 | except ImportError: 147 | raise ImportError('DCA requires TensorFlow v2+. Please follow instructions' 148 | ' at https://www.tensorflow.org/install/ to install' 149 | ' it.') 150 | 151 | # import tf and the rest after parse_args() to make argparse help faster 152 | from . import train 153 | 154 | train.train_with_args(args) 155 |
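For orientation, two hypothetical invocations of this entry point; the file and directory names are placeholders, and the bare `dca` command assumes that setup.py registers `main()` as a console script (otherwise `python -m dca` behaves analogously):

    # genes x cells TSV, default nb-conddisp autoencoder
    dca counts.tsv results/

    # cell x gene input, ZINB loss, four training threads
    dca -t --type zinb-conddisp --threads 4 counts.tsv results/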
-------------------------------------------------------------------------------- /dca/api.py: -------------------------------------------------------------------------------- 1 | import os, tempfile, shutil, random 2 | import anndata 3 | import numpy as np 4 | import scanpy as sc 5 | 6 | try: 7 | import tensorflow as tf 8 | except ImportError: 9 | raise ImportError('DCA requires TensorFlow v2+. Please follow instructions' 10 | ' at https://www.tensorflow.org/install/ to install' 11 | ' it.') 12 | 13 | 14 | from .io import read_dataset, normalize 15 | from .train import train 16 | from .network import AE_types 17 | 18 | 19 | def dca(adata, 20 | mode='denoise', 21 | ae_type='nb-conddisp', 22 | normalize_per_cell=True, 23 | scale=True, 24 | log1p=True, 25 | hidden_size=(64, 32, 64), # network args 26 | hidden_dropout=0., 27 | batchnorm=True, 28 | activation='relu', 29 | init='glorot_uniform', 30 | network_kwds={}, 31 | epochs=300, # training args 32 | reduce_lr=10, 33 | early_stop=15, 34 | batch_size=32, 35 | optimizer='RMSprop', 36 | learning_rate=None, 37 | random_state=0, 38 | threads=None, 39 | verbose=False, 40 | training_kwds={}, 41 | return_model=False, 42 | return_info=False, 43 | copy=False, 44 | check_counts=True, 45 | ): 46 | """Deep count autoencoder (DCA) API. 47 | 48 | Fits a count autoencoder to the count data given in the anndata object 49 | in order to denoise the data and to capture a hidden representation of 50 | cells in low dimensions. The type of the autoencoder and the return values 51 | are determined by the parameters. 52 | 53 | Parameters 54 | ---------- 55 | adata : :class:`~scanpy.api.AnnData` 56 | An anndata object with raw counts in `adata.X` (preserved in `.raw`). 57 | mode : `str`, optional. `denoise` (default) or `latent`. 58 | `denoise` overwrites `adata.X` with denoised expression values. 59 | In `latent` mode DCA adds `adata.obsm['X_dca']` to the given adata 60 | object. This matrix represents the latent representation of cells via DCA. 61 | ae_type : `str`, optional. `nb-conddisp` (default), `zinb`, `zinb-conddisp` or `nb`. 62 | Type of the autoencoder. Return values and the architecture are 63 | determined by the type, e.g. `nb` does not provide dropout 64 | probabilities. 65 | normalize_per_cell : `bool`, optional. Default: `True`. 66 | If true, library size normalization is performed using 67 | the `sc.pp.normalize_per_cell` function in Scanpy and saved into the adata 68 | object. The mean layer then re-introduces library size differences by 69 | scaling the mean value of each cell in the output layer. See the 70 | manuscript for more details. 71 | scale : `bool`, optional. Default: `True`. 72 | If true, the input of the autoencoder is centered using the 73 | `sc.pp.scale` function of Scanpy. Note that the output is kept as raw 74 | counts as the loss functions are designed for count data. 75 | log1p : `bool`, optional. Default: `True`. 76 | If true, the input of the autoencoder is log transformed with a 77 | pseudocount of one using the `sc.pp.log1p` function of Scanpy. 78 | hidden_size : `tuple` or `list`, optional. Default: (64, 32, 64). 79 | Width of hidden layers. 80 | hidden_dropout : `float`, `tuple` or `list`, optional. Default: 0.0. 81 | Probability of weight dropout in the autoencoder (per layer if list 82 | or tuple). 83 | batchnorm : `bool`, optional. Default: `True`. 84 | If true, batch normalization is performed. 85 | activation : `str`, optional. Default: `relu`. 86 | Activation function of hidden layers. 
87 | init : `str`, optional. Default: `glorot_uniform`. 88 | Initialization method used to initialize weights. 89 | network_kwds : `dict`, optional. 90 | Additional keyword arguments for the autoencoder. 91 | epochs : `int`, optional. Default: 300. 92 | Number of total epochs in training. 93 | reduce_lr : `int`, optional. Default: 10. 94 | Reduces learning rate if validation loss does not improve in the given number of epochs. 95 | early_stop : `int`, optional. Default: 15. 96 | Stops training if validation loss does not improve in the given number of epochs. 97 | batch_size : `int`, optional. Default: 32. 98 | Number of samples in the batch used for SGD. 99 | learning_rate : `float`, optional. Default: None. 100 | Learning rate to use in the training. 101 | optimizer : `str`, optional. Default: "RMSprop". 102 | Type of optimization method used for training. 103 | random_state : `int`, optional. Default: 0. 104 | Seed for Python, NumPy and TensorFlow. 105 | threads : `int` or None, optional. Default: None. 106 | Number of threads to use in training. All cores are used by default. 107 | verbose : `bool`, optional. Default: `False`. 108 | If true, prints additional information about training and architecture. 109 | training_kwds : `dict`, optional. 110 | Additional keyword arguments for the training process. 111 | return_model : `bool`, optional. Default: `False`. 112 | If true, the trained autoencoder object is returned. See "Returns". 113 | return_info : `bool`, optional. Default: `False`. 114 | If true, all additional parameters of DCA are stored in `adata.obsm` such as dropout 115 | probabilities (obsm['X_dca_dropout']) and estimated dispersion values 116 | (obsm['X_dca_dispersion']), in case the autoencoder is of type 117 | zinb or zinb-conddisp. 118 | copy : `bool`, optional. Default: `False`. 119 | If true, a copy of the anndata object is returned. 120 | check_counts : `bool`. Default: `True`. 121 | Check if the counts are unnormalized (raw) counts. 122 | 123 | Returns 124 | ------- 125 | If `copy` is true and `return_model` is false, an AnnData object is returned. 126 | 127 | In "denoise" mode, `adata.X` is overwritten with the denoised values. In "latent" mode, the latent 128 | low-dimensional representation of cells is stored in `adata.obsm['X_dca']` and `adata.X` 129 | is not modified. Note that these values are not corrected for library size effects. 130 | 131 | If `return_info` is true, all estimated distribution parameters are stored in AnnData, such as: 132 | 133 | - `.obsm["X_dca_dropout"]`, which is the mixture coefficient (pi) of the zero component 134 | in ZINB, i.e. the dropout probability. (Only if ae_type is zinb or zinb-conddisp) 135 | 136 | - `.obsm["X_dca_dispersion"]`, which is the dispersion parameter of the NB. 137 | 138 | - `.uns["dca_loss_history"]`, which stores the loss history of the training. 139 | 140 | Finally, the raw counts are stored as `.raw`. 141 | 142 | If `return_model` is true, the trained model is returned. When both `copy` and `return_model` 143 | are true, a tuple of the anndata object and the model is returned, in that order. 144 | """ 145 | 146 | assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance' 147 | assert mode in ('denoise', 'latent'), '%s is not a valid mode.' 
% mode 148 | 149 | # set seed for reproducibility 150 | random.seed(random_state) 151 | np.random.seed(random_state) 152 | tf.random.set_seed(random_state) 153 | os.environ['PYTHONHASHSEED'] = '0' 154 | 155 | # this creates adata.raw with raw counts and copies adata if copy==True 156 | adata = read_dataset(adata, 157 | transpose=False, 158 | test_split=False, 159 | copy=copy, 160 | check_counts=check_counts) 161 | 162 | # check for zero genes 163 | nonzero_genes, _ = sc.pp.filter_genes(adata.X, min_counts=1) 164 | assert nonzero_genes.all(), 'Please remove all-zero genes before using DCA.' 165 | 166 | adata = normalize(adata, 167 | filter_min_counts=False, # no filtering, keep cell and gene idxs same 168 | size_factors=normalize_per_cell, 169 | normalize_input=scale, 170 | logtrans_input=log1p) 171 | 172 | network_kwds = {**network_kwds, 173 | 'hidden_size': hidden_size, 174 | 'hidden_dropout': hidden_dropout, 175 | 'batchnorm': batchnorm, 176 | 'activation': activation, 177 | 'init': init 178 | } 179 | 180 | from tensorflow.python.framework.ops import disable_eager_execution 181 | disable_eager_execution() 182 | 183 | input_size = output_size = adata.n_vars 184 | net = AE_types[ae_type](input_size=input_size, 185 | output_size=output_size, 186 | **network_kwds) 187 | net.save() 188 | net.build() 189 | 190 | training_kwds = {**training_kwds, 191 | 'epochs': epochs, 192 | 'reduce_lr': reduce_lr, 193 | 'early_stop': early_stop, 194 | 'batch_size': batch_size, 195 | 'optimizer': optimizer, 196 | 'verbose': verbose, 197 | 'threads': threads, 198 | 'learning_rate': learning_rate 199 | } 200 | 201 | hist = train(adata[adata.obs.dca_split == 'train'], net, **training_kwds) 202 | res = net.predict(adata, mode, return_info, copy) 203 | adata = res if copy else adata 204 | 205 | if return_info: 206 | adata.uns['dca_loss_history'] = hist.history 207 | 208 | if return_model: 209 | return (adata, net) if copy else net 210 | else: 211 | return adata if copy else None 212 |
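A minimal usage sketch of the `dca()` function defined above; the toy count matrix and all variable names are illustrative assumptions, not part of the package:

    import anndata
    import numpy as np
    import scanpy as sc
    from dca.api import dca

    # toy raw-count matrix: 200 cells x 100 genes of overdispersed counts
    counts = np.random.negative_binomial(2, 0.3, size=(200, 100)).astype('float32')
    adata = anndata.AnnData(counts)
    sc.pp.filter_genes(adata, min_counts=1)   # dca() asserts all-zero genes were removed

    dca(adata, threads=1)                     # default 'denoise' mode overwrites adata.X in place
    denoised = adata.X                        # denoised expression, same shape as the input

    # 'latent' mode instead stores a low-dimensional representation of each cell
    adata2 = anndata.AnnData(counts)
    sc.pp.filter_genes(adata2, min_counts=1)
    dca(adata2, mode='latent')
    z = adata2.obsm['X_dca']                  # 32-dimensional under the default (64, 32, 64) sizes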
-------------------------------------------------------------------------------- /dca/hyper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import json 4 | 5 | import numpy as np 6 | from kopt import CompileFN, test_fn 7 | from hyperopt import fmin, tpe, hp, Trials 8 | import keras.optimizers as opt 9 | 10 | from . import io 11 | from .network import AE_types 12 | 13 | 14 | def hyper(args): 15 | adata = io.read_dataset(args.input, 16 | transpose=args.transpose, 17 | test_split=False) 18 | 19 | hyper_params = { 20 | "data": { 21 | "norm_input_log": hp.choice('d_norm_log', (True, False)), 22 | "norm_input_zeromean": hp.choice('d_norm_zeromean', (True, False)), 23 | "norm_input_sf": hp.choice('d_norm_sf', (True, False)), 24 | }, 25 | "model": { 26 | "lr": hp.loguniform("m_lr", np.log(1e-3), np.log(1e-2)), 27 | "ridge": hp.loguniform("m_ridge", np.log(1e-7), np.log(1e-1)), 28 | "l1_enc_coef": hp.loguniform("m_l1_enc_coef", np.log(1e-7), np.log(1e-1)), 29 | "hidden_size": hp.choice("m_hiddensize", ((64,32,64), (32,16,32), 30 | (64,64), (32,32), (16,16), 31 | (16,), (32,), (64,), (128,))), 32 | "activation": hp.choice("m_activation", ('relu', 'selu', 'elu', 33 | 'PReLU', 'linear', 'LeakyReLU')), 34 | "aetype": hp.choice("m_aetype", ('zinb', 'zinb-conddisp')), 35 | "batchnorm": hp.choice("m_batchnorm", (True, False)), 36 | "dropout": hp.uniform("m_do", 0, 0.7), 37 | "input_dropout": hp.uniform("m_input_do", 0, 0.8), 38 | }, 39 | "fit": { 40 | "epochs": args.hyperepoch 41 | } 42 | } 43 | 44 | def data_fn(norm_input_log, norm_input_zeromean, norm_input_sf): 45 | 46 | ad = adata.copy() 47 | ad = io.normalize(ad, 48 | size_factors=norm_input_sf, 49 | logtrans_input=norm_input_log, 50 | normalize_input=norm_input_zeromean) 51 | 52 | x_train = {'count': ad.X, 'size_factors': ad.obs.size_factors} 53 | y_train = ad.raw.X 54 | 55 | return (x_train, y_train), 56 | 57 | def model_fn(train_data, lr, hidden_size, activation, aetype, batchnorm, 58 | dropout, input_dropout, ridge, l1_enc_coef): 59 | 60 | net = AE_types[aetype](train_data[1].shape[1], 61 | hidden_size=hidden_size, 62 | l2_coef=0.0, 63 | l1_coef=0.0, 64 | l2_enc_coef=0.0, 65 | l1_enc_coef=l1_enc_coef, 66 | ridge=ridge, 67 | hidden_dropout=dropout, 68 | input_dropout=input_dropout, 69 | batchnorm=batchnorm, 70 | activation=activation, 71 | init='glorot_uniform', 72 | debug=args.debug) 73 | net.build() 74 | net.model.summary() 75 | 76 | optimizer = opt.__dict__['RMSprop'](lr=lr, clipvalue=5.0) 77 | net.model.compile(loss=net.loss, optimizer=optimizer) 78 | 79 | return net.model 80 | 81 | output_dir = os.path.join(args.outputdir, 'hyperopt_results') 82 | objective = CompileFN('autoencoder_hyperpar_db', 'myexp1', 83 | data_fn=data_fn, 84 | model_fn=model_fn, 85 | loss_metric='loss', 86 | loss_metric_mode='min', 87 | valid_split=.2, 88 | save_model=None, 89 | save_results=True, 90 | use_tensorboard=False, 91 | save_dir=output_dir) 92 | 93 | test_fn(objective, hyper_params, save_model=None) 94 | 95 | trials = Trials() 96 | best = fmin(objective, 97 | hyper_params, 98 | trials=trials, 99 | algo=tpe.suggest, 100 | max_evals=args.hypern, 101 | catch_eval_exceptions=True) 102 | 103 | with open(os.path.join(output_dir, 'trials.pickle'), 'wb') as f: 104 | pickle.dump(trials, f) 105 | 106 | #TODO: map indices in "best" back to choice-based hyperpars before saving 107 | with open(os.path.join(output_dir, 'best.json'), 'wt') as f: 108 | json.dump(best, f, sort_keys=True, indent=4) 109 | 110 | print(best) 111 | 112 | #TODO: not just save the best conf but also train the model with these params 113 | -------------------------------------------------------------------------------- /dca/io.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import pickle, os, numbers 22 | 23 | import numpy as np 24 | import scipy as sp 25 | import pandas as pd 26 | import scanpy as sc 27 | from sklearn.model_selection import train_test_split 28 | from sklearn.preprocessing import scale 29 | 30 | 31 | #TODO: Fix this 32 | class AnnSequence: 33 | def __init__(self, matrix, batch_size, sf=None): 34 | self.matrix = matrix 35 | if sf is None: 36 | self.size_factors = np.ones((self.matrix.shape[0], 1), 37 | dtype=np.float32) 38 | else: 39 | self.size_factors = sf 40 | self.batch_size = batch_size 41 | 42 | def __len__(self): 43 | return len(self.matrix) // self.batch_size 44 | 45 | def __getitem__(self, idx): 46 | batch = self.matrix[idx*self.batch_size:(idx+1)*self.batch_size] 47 | batch_sf = self.size_factors[idx*self.batch_size:(idx+1)*self.batch_size] 48 | 49 | # return an (X, Y) pair 50 | return {'count': batch, 'size_factors': batch_sf}, batch 51 | 52 | 53 | def read_dataset(adata, transpose=False, test_split=False, copy=False, check_counts=True): 54 | 55 | if isinstance(adata, sc.AnnData): 56 | if copy: 57 | adata = adata.copy() 58 | elif isinstance(adata, str): 59 | adata = sc.read(adata, first_column_names=True) 60 | else: 61 | raise NotImplementedError 62 | 63 | if check_counts: 64 | # check if observations are unnormalized using first 10 65 | X_subset = adata.X[:10] 66 | norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.' 
67 | if sp.sparse.issparse(X_subset): 68 | assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error 69 | else: 70 | assert np.all(X_subset.astype(int) == X_subset), norm_error 71 | 72 | if transpose: adata = adata.transpose() 73 | 74 | if test_split: 75 | train_idx, test_idx = train_test_split(np.arange(adata.n_obs), test_size=0.1, random_state=42) 76 | spl = pd.Series(['train'] * adata.n_obs) 77 | spl.iloc[test_idx] = 'test' 78 | adata.obs['dca_split'] = spl.values 79 | else: 80 | adata.obs['dca_split'] = 'train' 81 | 82 | adata.obs['dca_split'] = adata.obs['dca_split'].astype('category') 83 | print('dca: Successfully preprocessed {} genes and {} cells.'.format(adata.n_vars, adata.n_obs)) 84 | 85 | return adata 86 | 87 | 88 | def normalize(adata, filter_min_counts=True, size_factors=True, normalize_input=True, logtrans_input=True): 89 | 90 | if filter_min_counts: 91 | sc.pp.filter_genes(adata, min_counts=1) 92 | sc.pp.filter_cells(adata, min_counts=1) 93 | 94 | if size_factors or normalize_input or logtrans_input: 95 | adata.raw = adata.copy() 96 | else: 97 | adata.raw = adata 98 | 99 | if size_factors: 100 | sc.pp.normalize_per_cell(adata) 101 | adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts) 102 | else: 103 | adata.obs['size_factors'] = 1.0 104 | 105 | if logtrans_input: 106 | sc.pp.log1p(adata) 107 | 108 | if normalize_input: 109 | sc.pp.scale(adata) 110 | 111 | return adata 112 | 113 | def read_genelist(filename): 114 | genelist = list(set(open(filename, 'rt').read().strip().split('\n'))) 115 | assert len(genelist) > 0, 'No genes detected in genelist file' 116 | print('dca: Subset of {} genes will be denoised.'.format(len(genelist))) 117 | 118 | return genelist 119 | 120 | def write_text_matrix(matrix, filename, rownames=None, colnames=None, transpose=False): 121 | if transpose: 122 | matrix = matrix.T 123 | rownames, colnames = colnames, rownames 124 | 125 | pd.DataFrame(matrix, index=rownames, columns=colnames).to_csv(filename, 126 | sep='\t', 127 | index=(rownames is not None), 128 | header=(colnames is not None), 129 | float_format='%.6f') 130 | def read_pickle(inputfile): 131 | return pickle.load(open(inputfile, "rb")) 132 | -------------------------------------------------------------------------------- /dca/layers.py: -------------------------------------------------------------------------------- 1 | from keras.engine.topology import Layer 2 | from keras.layers import Lambda, Dense 3 | from keras.engine.base_layer import InputSpec 4 | from keras import backend as K 5 | import tensorflow as tf 6 | 7 | 8 | class ConstantDispersionLayer(Layer): 9 | ''' 10 | An identity layer which allows us to inject extra parameters 11 | such as dispersion to Keras models 12 | ''' 13 | def __init__(self, **kwargs): 14 | super().__init__(**kwargs) 15 | 16 | def build(self, input_shape): 17 | self.theta = self.add_weight(shape=(1, input_shape[1]), 18 | initializer='zeros', 19 | trainable=True, 20 | name='theta') 21 | self.theta_exp = tf.clip_by_value(K.exp(self.theta), 1e-3, 1e4) 22 | super().build(input_shape) 23 | 24 | def call(self, x): 25 | return tf.identity(x) 26 | 27 | def compute_output_shape(self, input_shape): 28 | return input_shape 29 | 30 | 31 | class SliceLayer(Layer): 32 | def __init__(self, index, **kwargs): 33 | self.index = index 34 | super().__init__(**kwargs) 35 | 36 | def build(self, input_shape): 37 | if not isinstance(input_shape, list): 38 | raise ValueError('Input should be a list') 39 | 40 | super().build(input_shape) 41 | 42 | def 
call(self, x): 43 | assert isinstance(x, list), 'SliceLayer input is not a list' 44 | return x[self.index] 45 | 46 | def compute_output_shape(self, input_shape): 47 | return input_shape[self.index] 48 | 49 | 50 | class ElementwiseDense(Dense): 51 | def build(self, input_shape): 52 | assert len(input_shape) >= 2 53 | input_dim = input_shape[-1] 54 | assert (input_dim == self.units) or (self.units == 1), \ 55 | "Input and output dims are not compatible" 56 | 57 | # shape=(self.units,) makes this elementwise because of broadcasting 58 | self.kernel = self.add_weight(shape=(self.units,), 59 | initializer=self.kernel_initializer, 60 | name='kernel', 61 | regularizer=self.kernel_regularizer, 62 | constraint=self.kernel_constraint) 63 | if self.use_bias: 64 | self.bias = self.add_weight(shape=(self.units,), 65 | initializer=self.bias_initializer, 66 | name='bias', 67 | regularizer=self.bias_regularizer, 68 | constraint=self.bias_constraint) 69 | else: 70 | self.bias = None 71 | self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) 72 | self.built = True 73 | 74 | def call(self, inputs): 75 | # use * instead of tf.matmul, we need broadcasting here 76 | output = inputs * self.kernel 77 | if self.use_bias: 78 | output = output + self.bias 79 | if self.activation is not None: 80 | output = self.activation(output) 81 | return output 82 | 83 | 84 | nan2zeroLayer = Lambda(lambda x: tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)) 85 | ColwiseMultLayer = Lambda(lambda l: l[0]*tf.reshape(l[1], (-1,1))) 86 | -------------------------------------------------------------------------------- /dca/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from keras import backend as K 4 | 5 | 6 | def _nan2zero(x): 7 | return tf.where(tf.math.is_nan(x), tf.zeros_like(x), x) 8 | 9 | def _nan2inf(x): 10 | return tf.where(tf.math.is_nan(x), tf.zeros_like(x)+np.inf, x) 11 | 12 | def _nelem(x): 13 | nelem = tf.reduce_sum(tf.cast(~tf.math.is_nan(x), tf.float32)) 14 | return tf.cast(tf.where(tf.equal(nelem, 0.), 1., nelem), x.dtype) 15 | 16 | 17 | def _reduce_mean(x): 18 | nelem = _nelem(x) 19 | x = _nan2zero(x) 20 | return tf.divide(tf.reduce_sum(x), nelem) 21 | 22 | 23 | def mse_loss(y_true, y_pred): 24 | ret = tf.square(y_pred - y_true) 25 | 26 | return _reduce_mean(ret) 27 | 28 | 29 | # In the implementations, I try to keep the function signature 30 | # similar to those of Keras objective functions so that 31 | # later on we can use them in Keras smoothly: 32 | # https://github.com/fchollet/keras/blob/master/keras/objectives.py#L7 33 | def poisson_loss(y_true, y_pred): 34 | y_pred = tf.cast(y_pred, tf.float32) 35 | y_true = tf.cast(y_true, tf.float32) 36 | 37 | # we can use the Poisson PMF from TensorFlow as well 38 | # dist = tf.contrib.distributions 39 | # return -tf.reduce_mean(dist.Poisson(y_pred).log_pmf(y_true)) 40 | 41 | nelem = _nelem(y_true) 42 | y_true = _nan2zero(y_true) 43 | 44 | # last term can be avoided since it doesn't depend on y_pred 45 | # however keeping it gives a nice lower bound to zero 46 | ret = y_pred - y_true*tf.math.log(y_pred+1e-10) + tf.math.lgamma(y_true+1.0) 47 | 48 | return tf.divide(tf.reduce_sum(ret), nelem) 49 | 50 | 51 | # We need a class (or closure) here, 52 | # because it's not possible to 53 | # pass extra arguments to Keras loss functions 54 | # See https://github.com/fchollet/keras/issues/2121 55 | 56 | # dispersion (theta) parameter is a scalar by default. 
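# [illustrative aside, not part of the original module: NB.loss below computes,
#  up to the eps stabilizers, the negative log-likelihood of NB(mean=mu,
#  dispersion=theta),
#      -log NB(y; mu, theta) = lgamma(theta) + lgamma(y+1) - lgamma(y+theta)
#                              + (theta+y)*log(1 + mu/theta)
#                              + y*(log(theta) - log(mu)),
#  split into t1 (the lgamma terms) and t2. A quick NumPy/SciPy sanity check,
#  with p = theta/(theta+mu) in scipy's parameterization:
#      import numpy as np
#      from scipy.special import gammaln
#      from scipy.stats import nbinom
#      y, mu, theta = 3.0, 2.0, 5.0
#      t1 = gammaln(theta) + gammaln(y + 1) - gammaln(y + theta)
#      t2 = (theta + y)*np.log1p(mu/theta) + y*(np.log(theta) - np.log(mu))
#      assert np.isclose(t1 + t2, -nbinom.logpmf(y, theta, theta/(theta + mu)))
# ]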
57 | # scale_factor scales the nbinom mean before the 58 | # calculation of the loss to balance the 59 | # learning rates of theta and network weights 60 | class NB(object): 61 | def __init__(self, theta=None, masking=False, scope='nbinom_loss/', 62 | scale_factor=1.0, debug=False): 63 | 64 | # for numerical stability 65 | self.eps = 1e-10 66 | self.scale_factor = scale_factor 67 | self.debug = debug 68 | self.scope = scope 69 | self.masking = masking 70 | self.theta = theta 71 | 72 | def loss(self, y_true, y_pred, mean=True): 73 | scale_factor = self.scale_factor 74 | eps = self.eps 75 | 76 | with tf.name_scope(self.scope): 77 | y_true = tf.cast(y_true, tf.float32) 78 | y_pred = tf.cast(y_pred, tf.float32) * scale_factor 79 | 80 | if self.masking: 81 | nelem = _nelem(y_true) 82 | y_true = _nan2zero(y_true) 83 | 84 | # Clip theta 85 | theta = tf.minimum(self.theta, 1e6) 86 | 87 | t1 = tf.math.lgamma(theta+eps) + tf.math.lgamma(y_true+1.0) - tf.math.lgamma(y_true+theta+eps) 88 | t2 = (theta+y_true) * tf.math.log(1.0 + (y_pred/(theta+eps))) + (y_true * (tf.math.log(theta+eps) - tf.math.log(y_pred+eps))) 89 | 90 | if self.debug: 91 | assert_ops = [ 92 | tf.verify_tensor_all_finite(y_pred, 'y_pred has inf/nans'), 93 | tf.verify_tensor_all_finite(t1, 't1 has inf/nans'), 94 | tf.verify_tensor_all_finite(t2, 't2 has inf/nans')] 95 | 96 | tf.summary.histogram('t1', t1) 97 | tf.summary.histogram('t2', t2) 98 | 99 | with tf.control_dependencies(assert_ops): 100 | final = t1 + t2 101 | 102 | else: 103 | final = t1 + t2 104 | 105 | final = _nan2inf(final) 106 | 107 | if mean: 108 | if self.masking: 109 | final = tf.divide(tf.reduce_sum(final), nelem) 110 | else: 111 | final = tf.reduce_mean(final) 112 | 113 | 114 | return final 115 | 116 | class ZINB(NB): 117 | def __init__(self, pi, ridge_lambda=0.0, scope='zinb_loss/', **kwargs): 118 | super().__init__(scope=scope, **kwargs) 119 | self.pi = pi 120 | self.ridge_lambda = ridge_lambda 121 | 122 | def loss(self, y_true, y_pred, mean=True): 123 | scale_factor = self.scale_factor 124 | eps = self.eps 125 | 126 | with tf.name_scope(self.scope): 127 | # reuse existing NB neg.log.lik. 128 | # mean is always False here, because everything is calculated 129 | # element-wise. 
we take the mean only in the end 130 | nb_case = super().loss(y_true, y_pred, mean=False) - tf.math.log(1.0-self.pi+eps) 131 | 132 | y_true = tf.cast(y_true, tf.float32) 133 | y_pred = tf.cast(y_pred, tf.float32) * scale_factor 134 | theta = tf.minimum(self.theta, 1e6) 135 | 136 | zero_nb = tf.pow(theta/(theta+y_pred+eps), theta) 137 | zero_case = -tf.math.log(self.pi + ((1.0-self.pi)*zero_nb)+eps) 138 | result = tf.where(tf.less(y_true, 1e-8), zero_case, nb_case) 139 | ridge = self.ridge_lambda*tf.square(self.pi) 140 | result += ridge 141 | 142 | if mean: 143 | if self.masking: 144 | result = _reduce_mean(result) 145 | else: 146 | result = tf.reduce_mean(result) 147 | 148 | result = _nan2inf(result) 149 | 150 | if self.debug: 151 | tf.summary.histogram('nb_case', nb_case) 152 | tf.summary.histogram('zero_nb', zero_nb) 153 | tf.summary.histogram('zero_case', zero_case) 154 | tf.summary.histogram('ridge', ridge) 155 | 156 | return result 157 | -------------------------------------------------------------------------------- /dca/network.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os 17 | import pickle 18 | from abc import ABCMeta, abstractmethod 19 | 20 | import numpy as np 21 | import scanpy as sc 22 | 23 | import keras 24 | from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda 25 | from keras.models import Model 26 | from keras.regularizers import l1_l2 27 | from keras.objectives import mean_squared_error 28 | from keras.initializers import Constant 29 | from keras import backend as K 30 | 31 | import tensorflow as tf 32 | 33 | from .loss import poisson_loss, NB, ZINB 34 | from .layers import ConstantDispersionLayer, SliceLayer, ColwiseMultLayer, ElementwiseDense 35 | from .io import write_text_matrix 36 | 37 | 38 | MeanAct = lambda x: tf.clip_by_value(K.exp(x), 1e-5, 1e6) 39 | DispAct = lambda x: tf.clip_by_value(tf.nn.softplus(x), 1e-4, 1e4) 40 | 41 | advanced_activations = ('PReLU', 'LeakyReLU') 42 | 43 | class Autoencoder(): 44 | def __init__(self, 45 | input_size, 46 | output_size=None, 47 | hidden_size=(64, 32, 64), 48 | l2_coef=0., 49 | l1_coef=0., 50 | l2_enc_coef=0., 51 | l1_enc_coef=0., 52 | ridge=0., 53 | hidden_dropout=0., 54 | input_dropout=0., 55 | batchnorm=True, 56 | activation='relu', 57 | init='glorot_uniform', 58 | file_path=None, 59 | debug=False): 60 | 61 | self.input_size = input_size 62 | self.output_size = output_size 63 | self.hidden_size = hidden_size 64 | self.l2_coef = l2_coef 65 | self.l1_coef = l1_coef 66 | self.l2_enc_coef = l2_enc_coef 67 | self.l1_enc_coef = l1_enc_coef 68 | self.ridge = ridge 69 | self.hidden_dropout = hidden_dropout 70 | self.input_dropout = input_dropout 71 | self.batchnorm = batchnorm 72 | self.activation = activation 73 | self.init = init 74 | 
self.loss = None 75 | self.file_path = file_path 76 | self.extra_models = {} 77 | self.model = None 78 | self.encoder = None 79 | self.decoder = None 80 | self.input_layer = None 81 | self.sf_layer = None 82 | self.debug = debug 83 | 84 | if self.output_size is None: 85 | self.output_size = input_size 86 | 87 | if isinstance(self.hidden_dropout, list): 88 | assert len(self.hidden_dropout) == len(self.hidden_size) 89 | else: 90 | self.hidden_dropout = [self.hidden_dropout]*len(self.hidden_size) 91 | 92 | def build(self): 93 | 94 | self.input_layer = Input(shape=(self.input_size,), name='count') 95 | self.sf_layer = Input(shape=(1,), name='size_factors') 96 | last_hidden = self.input_layer 97 | 98 | if self.input_dropout > 0.0: 99 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 100 | 101 | for i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 102 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 103 | if i == center_idx: 104 | layer_name = 'center' 105 | stage = 'center' # let downstream know where we are 106 | elif i < center_idx: 107 | layer_name = 'enc%s' % i 108 | stage = 'encoder' 109 | else: 110 | layer_name = 'dec%s' % (i-center_idx) 111 | stage = 'decoder' 112 | 113 | # use encoder-specific l1/l2 reg coefs if given 114 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 115 | l1 = self.l1_enc_coef 116 | else: 117 | l1 = self.l1_coef 118 | 119 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 120 | l2 = self.l2_enc_coef 121 | else: 122 | l2 = self.l2_coef 123 | 124 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 125 | kernel_regularizer=l1_l2(l1, l2), 126 | name=layer_name)(last_hidden) 127 | if self.batchnorm: 128 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 129 | 130 | # Use separate act. 
layers to give the user the option to get pre-activations 131 | # of layers when requested 132 | if self.activation in advanced_activations: 133 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 134 | else: 135 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 136 | 137 | if hid_drop > 0.0: 138 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 139 | 140 | self.decoder_output = last_hidden 141 | self.build_output() 142 | 143 | def build_output(self): 144 | 145 | self.loss = mean_squared_error 146 | mean = Dense(self.output_size, kernel_initializer=self.init, 147 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 148 | name='mean')(self.decoder_output) 149 | output = ColwiseMultLayer([mean, self.sf_layer]) 150 | 151 | # keep unscaled output as an extra model 152 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 153 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 154 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 155 | 156 | self.encoder = self.get_encoder() 157 | 158 | def save(self): 159 | if self.file_path: 160 | os.makedirs(self.file_path, exist_ok=True) 161 | with open(os.path.join(self.file_path, 'model.pickle'), 'wb') as f: 162 | pickle.dump(self, f) 163 | 164 | def load_weights(self, filename): 165 | self.model.load_weights(filename) 166 | self.encoder = self.get_encoder() 167 | self.decoder = None # get_decoder() 168 | 169 | def get_decoder(self): 170 | i = 0 171 | for l in self.model.layers: 172 | if l.name == 'center_drop': 173 | break 174 | i += 1 175 | 176 | return Model(inputs=self.model.get_layer(index=i+1).input, 177 | outputs=self.model.output) 178 | 179 | def get_encoder(self, activation=False): 180 | if activation: 181 | ret = Model(inputs=self.model.input, 182 | outputs=self.model.get_layer('center_act').output) 183 | else: 184 | ret = Model(inputs=self.model.input, 185 | outputs=self.model.get_layer('center').output) 186 | return ret 187 | 188 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 189 | 190 | assert mode in ('denoise', 'latent', 'full'), 'Unknown mode' 191 | 192 | adata = adata.copy() if copy else adata 193 | 194 | if mode in ('latent', 'full'): 195 | print('dca: Calculating low dimensional representations...') 196 | 197 | adata.obsm['X_dca'] = self.encoder.predict({'count': adata.X, 198 | 'size_factors': adata.obs.size_factors}) 199 | if mode in ('denoise', 'full'): 200 | print('dca: Calculating reconstructions...') 201 | 202 | adata.X = self.model.predict({'count': adata.X, 203 | 'size_factors': adata.obs.size_factors}) 204 | 205 | #adata.uns['dca_loss'] = self.model.test_on_batch({'count': adata.X, 206 | # 'size_factors': adata.obs.size_factors}, 207 | # adata.raw.X) 208 | if mode == 'latent': 209 | adata.X = adata.raw.X.copy() # restore the original counts kept in adata.raw 210 | 211 | return adata if copy else None 212 | 213 | def write(self, adata, file_path, mode='denoise', colnames=None): 214 | 215 | colnames = adata.var_names.values if colnames is None else colnames 216 | rownames = adata.obs_names.values 217 | 218 | print('dca: Saving output(s)...') 219 | os.makedirs(file_path, exist_ok=True) 220 | 221 | if mode in ('denoise', 'full'): 222 | print('dca: Saving denoised expression...') 223 | write_text_matrix(adata.X, 224 | os.path.join(file_path, 'mean.tsv'), 225 | rownames=rownames, colnames=colnames, transpose=True) 226 | 227 | 
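# in 'full' mode, both the denoised matrix above and the latent matrix below are written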
if mode in ('latent', 'full'): 228 | print('dca: Saving latent representations...') 229 | write_text_matrix(adata.obsm['X_dca'], 230 | os.path.join(file_path, 'latent.tsv'), 231 | rownames=rownames, transpose=False) 232 | 233 | class PoissonAutoencoder(Autoencoder): 234 | 235 | def build_output(self): 236 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 237 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 238 | name='mean')(self.decoder_output) 239 | output = ColwiseMultLayer([mean, self.sf_layer]) 240 | self.loss = poisson_loss 241 | 242 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 243 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 244 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 245 | 246 | self.encoder = self.get_encoder() 247 | 248 | 249 | class NBConstantDispAutoencoder(Autoencoder): 250 | 251 | def build_output(self): 252 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 253 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 254 | name='mean')(self.decoder_output) 255 | 256 | # Plug in dispersion parameters via fake dispersion layer 257 | disp = ConstantDispersionLayer(name='dispersion') 258 | mean = disp(mean) 259 | 260 | output = ColwiseMultLayer([mean, self.sf_layer]) 261 | 262 | nb = NB(disp.theta_exp) 263 | self.loss = nb.loss 264 | self.extra_models['dispersion'] = lambda :K.function([], [nb.theta])([])[0].squeeze() 265 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 266 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 267 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 268 | 269 | self.encoder = self.get_encoder() 270 | 271 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 272 | colnames = adata.var_names.values 273 | rownames = adata.obs_names.values 274 | res = super().predict(adata, mode, return_info, copy) 275 | adata = res if copy else adata 276 | 277 | if return_info: 278 | adata.var['X_dca_dispersion'] = self.extra_models['dispersion']() 279 | 280 | return adata if copy else None 281 | 282 | def write(self, adata, file_path, mode='denoise', colnames=None): 283 | colnames = adata.var_names.values if colnames is None else colnames 284 | rownames = adata.obs_names.values 285 | 286 | super().write(adata, file_path, mode, colnames=colnames) 287 | if 'X_dca_dispersion' in adata.var_keys(): 288 | write_text_matrix(adata.var['X_dca_dispersion'].reshape(1, -1), 289 | os.path.join(file_path, 'dispersion.tsv'), 290 | colnames=colnames, transpose=True) 291 | 292 | 293 | class NBAutoencoder(Autoencoder): 294 | 295 | def build_output(self): 296 | disp = Dense(self.output_size, activation=DispAct, 297 | kernel_initializer=self.init, 298 | kernel_regularizer=l1_l2(self.l1_coef, 299 | self.l2_coef), 300 | name='dispersion')(self.decoder_output) 301 | 302 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 303 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 304 | name='mean')(self.decoder_output) 305 | output = ColwiseMultLayer([mean, self.sf_layer]) 306 | output = SliceLayer(0, name='slice')([output, disp]) 307 | 308 | nb = NB(theta=disp, debug=self.debug) 309 | self.loss = nb.loss 310 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 311 | self.extra_models['mean_norm'] = 
Model(inputs=self.input_layer, outputs=mean) 312 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 313 | 314 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 315 | 316 | self.encoder = self.get_encoder() 317 | 318 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 319 | colnames = adata.var_names.values 320 | rownames = adata.obs_names.values 321 | 322 | res = super().predict(adata, mode, return_info, copy) 323 | adata = res if copy else adata 324 | 325 | if return_info: 326 | adata.obsm['X_dca_dispersion'] = self.extra_models['dispersion'].predict(adata.X) 327 | 328 | return adata if copy else None 329 | 330 | def write(self, adata, file_path, mode='denoise', colnames=None): 331 | colnames = adata.var_names.values if colnames is None else colnames 332 | rownames = adata.obs_names.values 333 | 334 | super().write(adata, file_path, mode, colnames=colnames) 335 | 336 | if 'X_dca_dispersion' in adata.obsm_keys(): 337 | write_text_matrix(adata.obsm['X_dca_dispersion'], 338 | os.path.join(file_path, 'dispersion.tsv'), 339 | colnames=colnames, transpose=True) 340 | 341 | class NBSharedAutoencoder(NBAutoencoder): 342 | 343 | def build_output(self): 344 | disp = Dense(1, activation=DispAct, 345 | kernel_initializer=self.init, 346 | kernel_regularizer=l1_l2(self.l1_coef, 347 | self.l2_coef), 348 | name='dispersion')(self.decoder_output) 349 | 350 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 351 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 352 | name='mean')(self.decoder_output) 353 | output = ColwiseMultLayer([mean, self.sf_layer]) 354 | output = SliceLayer(0, name='slice')([output, disp]) 355 | 356 | nb = NB(theta=disp, debug=self.debug) 357 | self.loss = nb.loss 358 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 359 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 360 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 361 | 362 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 363 | self.encoder = self.get_encoder() 364 | 365 | 366 | class ZINBAutoencoder(Autoencoder): 367 | 368 | def build_output(self): 369 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 370 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 371 | name='pi')(self.decoder_output) 372 | 373 | disp = Dense(self.output_size, activation=DispAct, 374 | kernel_initializer=self.init, 375 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 376 | name='dispersion')(self.decoder_output) 377 | 378 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 379 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 380 | name='mean')(self.decoder_output) 381 | output = ColwiseMultLayer([mean, self.sf_layer]) 382 | output = SliceLayer(0, name='slice')([output, disp, pi]) 383 | 384 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 385 | self.loss = zinb.loss 386 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 387 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 388 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 389 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 390 | 391 | self.model = Model(inputs=[self.input_layer, self.sf_layer], 
outputs=output) 392 | 393 | self.encoder = self.get_encoder() 394 | 395 | def predict(self, adata, mode='denoise', return_info=False, copy=False, colnames=None): 396 | 397 | adata = adata.copy() if copy else adata 398 | 399 | if return_info: 400 | adata.obsm['X_dca_dispersion'] = self.extra_models['dispersion'].predict(adata.X) 401 | adata.obsm['X_dca_dropout'] = self.extra_models['pi'].predict(adata.X) 402 | 403 | # warning! this may overwrite adata.X 404 | super().predict(adata, mode, return_info, copy=False) 405 | return adata if copy else None 406 | 407 | def write(self, adata, file_path, mode='denoise', colnames=None): 408 | colnames = adata.var_names.values if colnames is None else colnames 409 | rownames = adata.obs_names.values 410 | 411 | super().write(adata, file_path, mode, colnames=colnames) 412 | 413 | if 'X_dca_dispersion' in adata.obsm_keys(): 414 | write_text_matrix(adata.obsm['X_dca_dispersion'], 415 | os.path.join(file_path, 'dispersion.tsv'), 416 | colnames=colnames, transpose=True) 417 | 418 | if 'X_dca_dropout' in adata.obsm_keys(): 419 | write_text_matrix(adata.obsm['X_dca_dropout'], 420 | os.path.join(file_path, 'dropout.tsv'), 421 | colnames=colnames, transpose=True) 422 | 423 | 424 | class ZINBAutoencoderElemPi(ZINBAutoencoder): 425 | def __init__(self, sharedpi=False, **kwds): 426 | super().__init__(**kwds) 427 | self.sharedpi = sharedpi 428 | 429 | def build_output(self): 430 | disp = Dense(self.output_size, activation=DispAct, 431 | kernel_initializer=self.init, 432 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 433 | name='dispersion')(self.decoder_output) 434 | 435 | mean_no_act = Dense(self.output_size, activation=None, kernel_initializer=self.init, 436 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 437 | name='mean_no_act')(self.decoder_output) 438 | 439 | minus = Lambda(lambda x: -x) 440 | mean_no_act = minus(mean_no_act) 441 | pidim = self.output_size if not self.sharedpi else 1 442 | 443 | pi = ElementwiseDense(pidim, activation='sigmoid', kernel_initializer=self.init, 444 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 445 | name='pi')(mean_no_act) 446 | 447 | mean = Activation(MeanAct, name='mean')(mean_no_act) 448 | 449 | output = ColwiseMultLayer([mean, self.sf_layer]) 450 | output = SliceLayer(0, name='slice')([output, disp, pi]) 451 | 452 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 453 | self.loss = zinb.loss 454 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 455 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 456 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 457 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 458 | 459 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 460 | 461 | self.encoder = self.get_encoder() 462 | 463 | 464 | 465 | class ZINBSharedAutoencoder(ZINBAutoencoder): 466 | 467 | def build_output(self): 468 | pi = Dense(1, activation='sigmoid', kernel_initializer=self.init, 469 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 470 | name='pi')(self.decoder_output) 471 | 472 | disp = Dense(1, activation=DispAct, 473 | kernel_initializer=self.init, 474 | kernel_regularizer=l1_l2(self.l1_coef, 475 | self.l2_coef), 476 | name='dispersion')(self.decoder_output) 477 | 478 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 479 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 480 | 
name='mean')(self.decoder_output) 481 | output = ColwiseMultLayer([mean, self.sf_layer]) 482 | output = SliceLayer(0, name='slice')([output, disp, pi]) 483 | 484 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 485 | self.loss = zinb.loss 486 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 487 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 488 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 489 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 490 | 491 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 492 | 493 | self.encoder = self.get_encoder() 494 | 495 | 496 | class ZINBConstantDispAutoencoder(Autoencoder): 497 | 498 | def build_output(self): 499 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 500 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 501 | name='pi')(self.decoder_output) 502 | 503 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 504 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 505 | name='mean')(self.decoder_output) 506 | 507 | # NB dispersion layer 508 | disp = ConstantDispersionLayer(name='dispersion') 509 | mean = disp(mean) 510 | 511 | output = ColwiseMultLayer([mean, self.sf_layer]) 512 | 513 | zinb = ZINB(pi, theta=disp.theta_exp, ridge_lambda=self.ridge, debug=self.debug) 514 | self.loss = zinb.loss 515 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 516 | self.extra_models['dispersion'] = lambda :K.function([], [zinb.theta])([])[0].squeeze() 517 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 518 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 519 | 520 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 521 | 522 | self.encoder = self.get_encoder() 523 | 524 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 525 | colnames = adata.var_names.values 526 | rownames = adata.obs_names.values 527 | adata = adata.copy() if copy else adata 528 | 529 | if return_info: 530 | adata.var['X_dca_dispersion'] = self.extra_models['dispersion']() 531 | adata.obsm['X_dca_dropout'] = self.extra_models['pi'].predict(adata.X) 532 | 533 | super().predict(adata, mode, return_info, copy=False) 534 | return adata if copy else None 535 | 536 | def write(self, adata, file_path, mode='denoise', colnames=None): 537 | colnames = adata.var_names.values if colnames is None else colnames 538 | rownames = adata.obs_names.values 539 | 540 | super().write(adata, file_path, mode) 541 | 542 | if 'X_dca_dispersion' in adata.var_keys(): 543 | write_text_matrix(adata.var['X_dca_dispersion'].values.reshape(1, -1), 544 | os.path.join(file_path, 'dispersion.tsv'), 545 | colnames=colnames, transpose=True) 546 | 547 | if 'X_dca_dropout' in adata.obsm_keys(): 548 | write_text_matrix(adata.obsm['X_dca_dropout'], 549 | os.path.join(file_path, 'dropout.tsv'), 550 | colnames=colnames, transpose=True) 551 | 552 | 553 | class ZINBForkAutoencoder(ZINBAutoencoder): 554 | 555 | def build(self): 556 | 557 | self.input_layer = Input(shape=(self.input_size,), name='count') 558 | self.sf_layer = Input(shape=(1,), name='size_factors') 559 | last_hidden = self.input_layer 560 | 561 | if self.input_dropout > 0.0: 562 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 563 | 564 | for 
i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 565 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 566 | if i == center_idx: 567 | layer_name = 'center' 568 | stage = 'center' # let downstream know where we are 569 | elif i < center_idx: 570 | layer_name = 'enc%s' % i 571 | stage = 'encoder' 572 | else: 573 | layer_name = 'dec%s' % (i-center_idx) 574 | stage = 'decoder' 575 | 576 | # use encoder-specific l1/l2 reg coefs if given 577 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 578 | l1 = self.l1_enc_coef 579 | else: 580 | l1 = self.l1_coef 581 | 582 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 583 | l2 = self.l2_enc_coef 584 | else: 585 | l2 = self.l2_coef 586 | 587 | if i > center_idx: 588 | self.last_hidden_mean = Dense(hid_size, activation=None, kernel_initializer=self.init, 589 | kernel_regularizer=l1_l2(l1, l2), 590 | name='%s_last_mean'%layer_name)(last_hidden) 591 | self.last_hidden_disp = Dense(hid_size, activation=None, kernel_initializer=self.init, 592 | kernel_regularizer=l1_l2(l1, l2), 593 | name='%s_last_disp'%layer_name)(last_hidden) 594 | self.last_hidden_pi = Dense(hid_size, activation=None, kernel_initializer=self.init, 595 | kernel_regularizer=l1_l2(l1, l2), 596 | name='%s_last_pi'%layer_name)(last_hidden) 597 | 598 | if self.batchnorm: 599 | self.last_hidden_mean = BatchNormalization(center=True, scale=False)(self.last_hidden_mean) 600 | self.last_hidden_disp = BatchNormalization(center=True, scale=False)(self.last_hidden_disp) 601 | self.last_hidden_pi = BatchNormalization(center=True, scale=False)(self.last_hidden_pi) 602 | 603 | # Use separate act. layers to give user the option to get pre-activations 604 | # of layers when requested 605 | self.last_hidden_mean = Activation(self.activation, name='%s_mean_act'%layer_name)(self.last_hidden_mean) 606 | self.last_hidden_disp = Activation(self.activation, name='%s_disp_act'%layer_name)(self.last_hidden_disp) 607 | self.last_hidden_pi = Activation(self.activation, name='%s_pi_act'%layer_name)(self.last_hidden_pi) 608 | 609 | if hid_drop > 0.0: 610 | self.last_hidden_mean = Dropout(hid_drop, name='%s_mean_drop'%layer_name)(self.last_hidden_mean) 611 | self.last_hidden_disp = Dropout(hid_drop, name='%s_disp_drop'%layer_name)(self.last_hidden_disp) 612 | self.last_hidden_pi = Dropout(hid_drop, name='%s_pi_drop'%layer_name)(self.last_hidden_pi) 613 | 614 | else: 615 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 616 | kernel_regularizer=l1_l2(l1, l2), 617 | name=layer_name)(last_hidden) 618 | 619 | if self.batchnorm: 620 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 621 | 622 | # Use separate act. 
layers to give user the option to get pre-activations 623 | # of layers when requested 624 | if self.activation in advanced_activations: 625 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 626 | else: 627 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 628 | 629 | if hid_drop > 0.0: 630 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 631 | 632 | self.build_output() 633 | 634 | 635 | def build_output(self): 636 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 637 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 638 | name='pi')(self.last_hidden_pi) 639 | 640 | disp = Dense(self.output_size, activation=DispAct, 641 | kernel_initializer=self.init, 642 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 643 | name='dispersion')(self.last_hidden_disp) 644 | 645 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 646 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 647 | name='mean')(self.last_hidden_mean) 648 | 649 | output = ColwiseMultLayer([mean, self.sf_layer]) 650 | output = SliceLayer(0, name='slice')([output, disp, pi]) 651 | 652 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 653 | self.loss = zinb.loss 654 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 655 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 656 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 657 | 658 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 659 | 660 | self.encoder = self.get_encoder() 661 | 662 | 663 | class NBForkAutoencoder(NBAutoencoder): 664 | 665 | def build(self): 666 | 667 | self.input_layer = Input(shape=(self.input_size,), name='count') 668 | self.sf_layer = Input(shape=(1,), name='size_factors') 669 | last_hidden = self.input_layer 670 | 671 | if self.input_dropout > 0.0: 672 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 673 | 674 | for i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 675 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 676 | if i == center_idx: 677 | layer_name = 'center' 678 | stage = 'center' # let downstream know where we are 679 | elif i < center_idx: 680 | layer_name = 'enc%s' % i 681 | stage = 'encoder' 682 | else: 683 | layer_name = 'dec%s' % (i-center_idx) 684 | stage = 'decoder' 685 | 686 | # use encoder-specific l1/l2 reg coefs if given 687 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 688 | l1 = self.l1_enc_coef 689 | else: 690 | l1 = self.l1_coef 691 | 692 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 693 | l2 = self.l2_enc_coef 694 | else: 695 | l2 = self.l2_coef 696 | 697 | if i > center_idx: 698 | self.last_hidden_mean = Dense(hid_size, activation=None, kernel_initializer=self.init, 699 | kernel_regularizer=l1_l2(l1, l2), 700 | name='%s_last_mean'%layer_name)(last_hidden) 701 | self.last_hidden_disp = Dense(hid_size, activation=None, kernel_initializer=self.init, 702 | kernel_regularizer=l1_l2(l1, l2), 703 | name='%s_last_disp'%layer_name)(last_hidden) 704 | 705 | if self.batchnorm: 706 | self.last_hidden_mean = BatchNormalization(center=True, scale=False)(self.last_hidden_mean) 707 | self.last_hidden_disp = BatchNormalization(center=True, scale=False)(self.last_hidden_disp) 708 | 709 | # Use separate act. 
layers to give user the option to get pre-activations 710 | # of layers when requested 711 | self.last_hidden_mean = Activation(self.activation, name='%s_mean_act'%layer_name)(self.last_hidden_mean) 712 | self.last_hidden_disp = Activation(self.activation, name='%s_disp_act'%layer_name)(self.last_hidden_disp) 713 | 714 | if hid_drop > 0.0: 715 | self.last_hidden_mean = Dropout(hid_drop, name='%s_mean_drop'%layer_name)(self.last_hidden_mean) 716 | self.last_hidden_disp = Dropout(hid_drop, name='%s_disp_drop'%layer_name)(self.last_hidden_disp) 717 | 718 | else: 719 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 720 | kernel_regularizer=l1_l2(l1, l2), 721 | name=layer_name)(last_hidden) 722 | 723 | if self.batchnorm: 724 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 725 | 726 | # Use separate act. layers to give user the option to get pre-activations 727 | # of layers when requested 728 | if self.activation in advanced_activations: 729 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 730 | else: 731 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 732 | 733 | if hid_drop > 0.0: 734 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 735 | 736 | self.build_output() 737 | 738 | 739 | def build_output(self): 740 | 741 | disp = Dense(self.output_size, activation=DispAct, 742 | kernel_initializer=self.init, 743 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 744 | name='dispersion')(self.last_hidden_disp) 745 | 746 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 747 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 748 | name='mean')(self.last_hidden_mean) 749 | 750 | output = ColwiseMultLayer([mean, self.sf_layer]) 751 | output = SliceLayer(0, name='slice')([output, disp]) 752 | 753 | nb = NB(theta=disp, debug=self.debug) 754 | self.loss = nb.loss 755 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 756 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 757 | 758 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 759 | 760 | self.encoder = self.get_encoder() 761 | 762 | 763 | AE_types = {'normal': Autoencoder, 'poisson': PoissonAutoencoder, 764 | 'nb': NBConstantDispAutoencoder, 'nb-conddisp': NBAutoencoder, 765 | 'nb-shared': NBSharedAutoencoder, 'nb-fork': NBForkAutoencoder, 766 | 'zinb': ZINBConstantDispAutoencoder, 'zinb-conddisp': ZINBAutoencoder, 767 | 'zinb-shared': ZINBSharedAutoencoder, 'zinb-fork': ZINBForkAutoencoder, 768 | 'zinb-elempi': ZINBAutoencoderElemPi} 769 | 770 | -------------------------------------------------------------------------------- /dca/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | 4 | from .api import dca 5 | 6 | def test_api(): 7 | adata = sc.datasets.paul15() 8 | epochs = 1 9 | 10 | # simple tests for denoise 11 | ret = dca(adata, mode='denoise', copy=True, epochs=epochs, verbose=True) 12 | assert not np.allclose(ret.X[:10], adata.X[:10]) 13 | 14 | ret, _ = dca(adata, mode='denoise', ae_type='nb-conddisp', copy=True, epochs=epochs, 15 | return_model=True, return_info=True) 16 | assert not np.allclose(ret.X[:10], adata.X[:10]) 17 | assert 'X_dca_dispersion' in ret.obsm_keys() 18 | assert _ is not None 19 | 20 | ret = dca(adata, mode='denoise', ae_type='nb', copy=True, 
epochs=epochs, 21 | return_model=False, return_info=True) 22 | assert not np.allclose(ret.X[:10], adata.X[:10]) 23 | assert 'X_dca_dispersion' in ret.var_keys() 24 | 25 | ret = dca(adata, mode='denoise', ae_type='zinb', copy=True, epochs=epochs, 26 | return_model=False, return_info=True) 27 | assert not np.allclose(ret.X[:10], adata.X[:10]) 28 | assert 'X_dca_dropout' in ret.obsm_keys() 29 | assert 'dca_loss_history' in ret.uns_keys() 30 | 31 | ret = dca(adata, mode='denoise', ae_type='zinb-elempi', copy=True, epochs=epochs, 32 | return_model=False, return_info=True) 33 | assert not np.allclose(ret.X[:10], adata.X[:10]) 34 | assert 'X_dca_dropout' in ret.obsm_keys() 35 | assert 'dca_loss_history' in ret.uns_keys() 36 | 37 | ret = dca(adata, mode='denoise', ae_type='zinb-elempi', copy=True, epochs=epochs, 38 | return_model=False, return_info=True, network_kwds={'sharedpi': True}) 39 | assert not np.allclose(ret.X[:10], adata.X[:10]) 40 | assert 'X_dca_dropout' in ret.obsm_keys() 41 | assert 'dca_loss_history' in ret.uns_keys() 42 | 43 | # simple tests for latent 44 | hid_size = (10, 2, 10) 45 | ret = dca(adata, mode='latent', hidden_size=hid_size, copy=True, epochs=epochs) 46 | assert 'X_dca' in ret.obsm_keys() 47 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 48 | 49 | ret = dca(adata, mode='latent', ae_type='nb-conddisp', hidden_size=hid_size, copy=True, epochs=epochs) 50 | assert 'X_dca' in ret.obsm_keys() 51 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 52 | 53 | ret = dca(adata, mode='latent', ae_type='nb', hidden_size=hid_size, copy=True, epochs=epochs, return_info=True) 54 | assert 'X_dca' in ret.obsm_keys() 55 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 56 | 57 | ret = dca(adata, mode='latent', ae_type='zinb', hidden_size=hid_size, copy=True, epochs=epochs) 58 | assert 'X_dca' in ret.obsm_keys() 59 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 60 | -------------------------------------------------------------------------------- /dca/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import os 21 | import random 22 | 23 | from . 
import io 24 | from .network import AE_types 25 | from .hyper import hyper 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | import keras.optimizers as opt 30 | from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 31 | from keras import backend as K 32 | from keras.preprocessing.image import Iterator 33 | 34 | 35 | def train(adata, network, output_dir=None, optimizer='RMSprop', learning_rate=None, 36 | epochs=300, reduce_lr=10, output_subset=None, use_raw_as_output=True, 37 | early_stop=15, batch_size=32, clip_grad=5., save_weights=False, 38 | validation_split=0.1, tensorboard=False, verbose=True, threads=None, 39 | **kwds): 40 | 41 | tf.compat.v1.keras.backend.set_session( 42 | tf.compat.v1.Session( 43 | config=tf.compat.v1.ConfigProto( 44 | intra_op_parallelism_threads=threads, 45 | inter_op_parallelism_threads=threads, 46 | ) 47 | ) 48 | ) 49 | model = network.model 50 | loss = network.loss 51 | if output_dir is not None: 52 | os.makedirs(output_dir, exist_ok=True) 53 | 54 | if learning_rate is None: 55 | optimizer = opt.__dict__[optimizer](clipvalue=clip_grad) 56 | else: 57 | optimizer = opt.__dict__[optimizer](lr=learning_rate, clipvalue=clip_grad) 58 | 59 | model.compile(loss=loss, optimizer=optimizer) 60 | 61 | # Callbacks 62 | callbacks = [] 63 | 64 | if save_weights and output_dir is not None: 65 | checkpointer = ModelCheckpoint(filepath="%s/weights.hdf5" % output_dir, 66 | verbose=verbose, 67 | save_weights_only=True, 68 | save_best_only=True) 69 | callbacks.append(checkpointer) 70 | if reduce_lr: 71 | lr_cb = ReduceLROnPlateau(monitor='val_loss', patience=reduce_lr, verbose=verbose) 72 | callbacks.append(lr_cb) 73 | if early_stop: 74 | es_cb = EarlyStopping(monitor='val_loss', patience=early_stop, verbose=verbose) 75 | callbacks.append(es_cb) 76 | if tensorboard: 77 | tb_log_dir = os.path.join(output_dir, 'tb') 78 | tb_cb = TensorBoard(log_dir=tb_log_dir, histogram_freq=1, write_grads=True) 79 | callbacks.append(tb_cb) 80 | 81 | if verbose: model.summary() 82 | 83 | inputs = {'count': adata.X, 'size_factors': adata.obs.size_factors} 84 | 85 | if output_subset: 86 | gene_idx = [np.where(adata.raw.var_names == x)[0][0] for x in output_subset] 87 | output = adata.raw.X[:, gene_idx] if use_raw_as_output else adata.X[:, gene_idx] 88 | else: 89 | output = adata.raw.X if use_raw_as_output else adata.X 90 | 91 | loss = model.fit(inputs, output, 92 | epochs=epochs, 93 | batch_size=batch_size, 94 | shuffle=True, 95 | callbacks=callbacks, 96 | validation_split=validation_split, 97 | verbose=verbose, 98 | **kwds) 99 | 100 | return loss 101 | 102 | 103 | def train_with_args(args): 104 | 105 | tf.compat.v1.keras.backend.set_session( 106 | tf.compat.v1.Session( 107 | config=tf.compat.v1.ConfigProto( 108 | intra_op_parallelism_threads=args.threads, 109 | inter_op_parallelism_threads=args.threads, 110 | ) 111 | ) 112 | ) 113 | # set seed for reproducibility 114 | random.seed(42) 115 | np.random.seed(42) 116 | tf.random.set_seed(42) 117 | os.environ['PYTHONHASHSEED'] = '0' 118 | 119 | # do hyperpar optimization and exit 120 | if args.hyper: 121 | hyper(args) 122 | return 123 | 124 | adata = io.read_dataset(args.input, 125 | transpose=(not args.transpose), # assume gene x cell by default 126 | check_counts=args.checkcounts, 127 | test_split=args.testsplit) 128 | 129 | adata = io.normalize(adata, 130 | size_factors=args.sizefactors, 131 | logtrans_input=args.loginput, 132 | normalize_input=args.norminput) 133 | 134 | if args.denoisesubset: 135 | 
genelist = list(set(io.read_genelist(args.denoisesubset))) 136 | assert len(set(genelist) - set(adata.var_names.values)) == 0, \ 137 | 'Gene list does not overlap with genes from the dataset' 138 | output_size = len(genelist) 139 | else: 140 | genelist = None 141 | output_size = adata.n_vars 142 | 143 | hidden_size = [int(x) for x in args.hiddensize.split(',')] 144 | hidden_dropout = [float(x) for x in args.dropoutrate.split(',')] 145 | if len(hidden_dropout) == 1: 146 | hidden_dropout = hidden_dropout[0] 147 | 148 | assert args.type in AE_types, 'loss type not supported' 149 | input_size = adata.n_vars 150 | 151 | from tensorflow.python.framework.ops import disable_eager_execution 152 | disable_eager_execution() 153 | 154 | net = AE_types[args.type](input_size=input_size, 155 | output_size=output_size, 156 | hidden_size=hidden_size, 157 | l2_coef=args.l2, 158 | l1_coef=args.l1, 159 | l2_enc_coef=args.l2enc, 160 | l1_enc_coef=args.l1enc, 161 | ridge=args.ridge, 162 | hidden_dropout=hidden_dropout, 163 | input_dropout=args.inputdropout, 164 | batchnorm=args.batchnorm, 165 | activation=args.activation, 166 | init=args.init, 167 | debug=args.debug, 168 | file_path=args.outputdir) 169 | 170 | net.save() 171 | net.build() 172 | 173 | losses = train(adata[adata.obs.dca_split == 'train'], net, 174 | output_dir=args.outputdir, 175 | learning_rate=args.learningrate, 176 | epochs=args.epochs, batch_size=args.batchsize, 177 | early_stop=args.earlystop, 178 | reduce_lr=args.reducelr, 179 | output_subset=genelist, 180 | optimizer=args.optimizer, 181 | clip_grad=args.gradclip, 182 | save_weights=args.saveweights, 183 | tensorboard=args.tensorboard) 184 | 185 | if genelist: 186 | predict_columns = adata.var_names[[np.where(adata.var_names==x)[0][0] for x in genelist]] 187 | else: 188 | predict_columns = adata.var_names 189 | 190 | net.predict(adata, mode='full', return_info=True) 191 | net.write(adata, args.outputdir, mode='full', colnames=predict_columns) 192 | -------------------------------------------------------------------------------- /dca/utils.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import scipy as sp 7 | import tensorflow as tf 8 | from tensorflow.contrib.opt import ScipyOptimizerInterface 9 | 10 | 11 | nb_zero = lambda t, mu: (t/(mu+t))**t 12 | zinb_zero = lambda t, mu, p: p + ((1.-p)*((t/(mu+t))**t)) 13 | sigmoid = lambda x: 1. / (1.+np.exp(-x)) 14 | logit = lambda x: np.log(x + 1e-7) - np.log(1. - x + 1e-7) 15 | tf_logit = lambda x: tf.cast(tf.log(x + 1e-7) - tf.log(1. - x + 1e-7), 'float32') 16 | log_loss = lambda pred, label: np.sum(-(label*np.log(pred+1e-7)) - ((1.-label)*np.log(1.-pred+1e-7))) 17 | 18 | 19 | def _lrt(ll_full, ll_reduced, df_full, df_reduced): 20 | # Compute the difference in degrees of freedom. 21 | delta_df = df_full - df_reduced 22 | # Compute the deviance test statistic. 23 | delta_dev = 2 * (ll_full - ll_reduced) 24 | # Compute the p-values based on the deviance and its expectation under the chi-square distribution. 25 | pvals = 1.
- sp.stats.chi2(delta_df).cdf(delta_dev) 26 | 27 | return pvals 28 | 29 | 30 | def _fitquad(x, y): 31 | coef, res, _, _ = np.linalg.lstsq((x**2)[:, np.newaxis] , y-x, rcond=None) 32 | ss_exp = res[0] 33 | ss_tot = (np.var(y-x)*len(x)) 34 | r2 = 1 - (ss_exp / ss_tot) 35 | #print('Coefs:', coef) 36 | return np.array([coef[0], 1, 0]), r2 37 | 38 | 39 | def _tf_zinb_zero(mu, t=None): 40 | a, b = tf.Variable([-1.0], dtype='float32'), tf.Variable([0.0], dtype='float32') 41 | 42 | if t is None: 43 | t_log = tf.Variable([-10.], dtype='float32') 44 | t = tf.exp(t_log) 45 | 46 | p = tf.sigmoid((tf.log(mu+1e-7)*a) + b) 47 | pred = p + ((1.-p)*((t/(mu+t))**t)) 48 | pred = tf.cast(pred, 'float32') 49 | return pred, a, b, t 50 | 51 | 52 | def _optimize_zinb(mu, dropout, theta=None): 53 | pred, a, b, t = _tf_zinb_zero(mu, theta) 54 | #loss = tf.reduce_mean(tf.abs(tf_logit(pred) - tf_logit(dropout))) 55 | loss = tf.losses.log_loss(labels=dropout.astype('float32'), 56 | predictions=pred) 57 | 58 | optimizer = ScipyOptimizerInterface(loss, options={'maxiter': 100}) 59 | 60 | with tf.Session() as sess: 61 | sess.run(tf.global_variables_initializer()) 62 | optimizer.minimize(sess) 63 | ret_a = sess.run(a) 64 | ret_b = sess.run(b) 65 | if theta is None: 66 | ret_t = sess.run(t) 67 | else: 68 | ret_t = t 69 | 70 | return ret_a, ret_b, ret_t 71 | 72 | 73 | def plot_mean_dropout(ad, title, ax, opt_zinb_theta=False, legend_out=False): 74 | expr = ad.X 75 | mu = expr.mean(0) 76 | do = np.mean(expr == 0, 0) 77 | v = expr.var(axis=0) 78 | 79 | coefs, r2 = _fitquad(mu, v) 80 | theta = 1.0/coefs[0] 81 | 82 | # zinb fit 83 | coefs = _optimize_zinb(mu, do, theta=theta if not opt_zinb_theta else None) 84 | print(coefs) 85 | 86 | #pois_pred = np.exp(-mu) 87 | nb_pred = nb_zero(theta, mu) 88 | zinb_pred = zinb_zero(coefs[2], 89 | mu, 90 | sigmoid((np.log(mu+1e-7)*coefs[0])+coefs[1])) 91 | 92 | # calculate log loss for all distr. 93 | #pois_ll = log_loss(pois_pred, do) 94 | nb_ll = log_loss(nb_pred, do) 95 | zinb_ll = log_loss(zinb_pred, do) 96 | 97 | ax.plot(mu, do, 'o', c='black', markersize=1) 98 | ax.set(xscale="log") 99 | 100 | #sns.lineplot(mu, pois_pred, ax=ax, color='blue') 101 | sns.lineplot(mu, nb_pred, ax=ax, color='red') 102 | sns.lineplot(mu, zinb_pred, ax=ax, color='green') 103 | 104 | ax.set_title(title) 105 | ax.set_ylabel('Empirical dropout rate') 106 | ax.set_xlabel(r'Mean expression') 107 | 108 | 109 | leg_loc = 'best' if not legend_out else 'upper left' 110 | leg_bbox = None if not legend_out else (1.02, 1.) 
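# The red (NB) and green (ZINB) curves plotted above compare each model's
# implied zero fraction at a given mean, using the closed forms defined at
# the top of this module:
#   NB:   P(zero) = (theta / (mu + theta)) ** theta                  (nb_zero)
#   ZINB: P(zero) = pi + (1 - pi) * (theta / (mu + theta)) ** theta  (zinb_zero)
# with pi = sigmoid(a * log(mu) + b), where (a, b) are fitted by
# _optimize_zinb. The legend reports each fit's dispersion (as 1/theta) and
# its log loss against the empirical per-gene dropout rates.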
111 | ax.legend(['Genes', 112 | #r'Poisson $L=%.4f$' % pois_ll, 113 | r'NB($\theta=%.2f)\ L=%.4f$' % ((1./theta), nb_ll), 114 | r'ZINB($\theta=%.2f,\pi=\sigma(%.2f\mu%+.2f)) \ L=%.4f$' % (1.0/coefs[2], coefs[0], coefs[1], zinb_ll)], 115 | loc=leg_loc, bbox_to_anchor=leg_bbox) 116 | zinb_pval = _lrt(-zinb_ll, -nb_ll, 3, 1) 117 | print('p-value: %e' % zinb_pval) 118 | 119 | 120 | def plot_mean_var(ad, title, ax): 121 | ad = ad.copy() 122 | 123 | sc.pp.filter_cells(ad, min_counts=1) 124 | sc.pp.filter_genes(ad, min_counts=1) 125 | 126 | m = ad.X.mean(axis=0) 127 | v = ad.X.var(axis=0) 128 | 129 | coefs, r2 = _fitquad(m, v) 130 | 131 | ax.set(xscale="log", yscale="log") 132 | ax.plot(m, v, 'o', c='black', markersize=1) 133 | 134 | poly = np.poly1d(coefs) 135 | sns.lineplot(m, poly(m), ax=ax, color='red') 136 | 137 | ax.set_title(title) 138 | ax.set_ylabel('Variance') 139 | ax.set_xlabel(r'$\mu$') 140 | 141 | sns.lineplot(m, m, ax=ax, color='blue') 142 | ax.legend(['Genes', r'NB ($\theta=%.2f)\ r^2=%.3f$' % (coefs[0], r2), 'Poisson']) 143 | 144 | return coefs[0] 145 | 146 | 147 | def plot_zeroinf(ad, title, mean_var_plot=False, opt_theta=True): 148 | if mean_var_plot: 149 | f, axs = plt.subplots(1, 2, figsize=(15, 5)) 150 | plot_mean_var(ad, title, ax=axs[0]) 151 | plot_mean_dropout(ad, title, axs[1], opt_zinb_theta=opt_theta, legend_out=True) 152 | plt.tight_layout() 153 | else: 154 | f, ax = plt.subplots(1, 1, figsize=(10, 5)) 155 | plot_mean_dropout(ad, title, ax, opt_zinb_theta=opt_theta, legend_out=True) 156 | plt.tight_layout() 157 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make <target>' where <target> is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp."
87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/autoencoder.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/autoencoder.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/autoencoder" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/autoencoder" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
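# Example invocation (assuming sphinx-build is installed and on the PATH):
#
#   make html        # build the HTML docs into build/html
#   make linkcheck   # verify all external links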
177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # autoencoder documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Apr 9 12:35:06 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../autoencoder')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.githubpages', 40 | 'sphinx.ext.napoleon', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 
47 | # You can specify multiple suffixes as a list of strings: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The encoding of source files. 53 | # 54 | # source_encoding = 'utf-8-sig' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'autoencoder' 61 | copyright = '2017, Gokcen Eraslan' 62 | author = 'Gokcen Eraslan' 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = '0.1' 70 | # The full version, including alpha/beta/rc tags. 71 | release = '0.1' 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # There are two options for replacing |today|: either, you set today to some 81 | # non-false value, then it is used: 82 | # 83 | # today = '' 84 | # 85 | # Else, today_fmt is used as the format for a strftime call. 86 | # 87 | # today_fmt = '%B %d, %Y' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # These patterns also affect html_static_path and html_extra_path 92 | exclude_patterns = [] 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. 96 | # 97 | # default_role = None 98 | 99 | # If true, '()' will be appended to :func: etc. cross-reference text. 100 | # 101 | # add_function_parentheses = True 102 | 103 | # If true, the current module name will be prepended to all description 104 | # unit titles (such as .. function::). 105 | # 106 | # add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | # 111 | # show_authors = False 112 | 113 | # The name of the Pygments (syntax highlighting) style to use. 114 | pygments_style = 'sphinx' 115 | 116 | # A list of ignored prefixes for module index sorting. 117 | # modindex_common_prefix = [] 118 | 119 | # If true, keep warnings as "system message" paragraphs in the built documents. 120 | # keep_warnings = False 121 | 122 | # If true, `todo` and `todoList` produce output, else they produce nothing. 123 | todo_include_todos = True 124 | 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | html_theme = 'alabaster' 132 | 133 | # Theme options are theme-specific and customize the look and feel of a theme 134 | # further. For a list of options available for each theme, see the 135 | # documentation. 136 | # 137 | # html_theme_options = {} 138 | 139 | # Add any paths that contain custom themes here, relative to this directory. 140 | # html_theme_path = [] 141 | 142 | # The name for this set of Sphinx documents. 143 | # "<project> v<release> documentation" by default. 144 | # 145 | # html_title = 'autoencoder v0.1' 146 | 147 | # A shorter title for the navigation bar. Default is the same as html_title.
148 | # 149 | # html_short_title = None 150 | 151 | # The name of an image file (relative to this directory) to place at the top 152 | # of the sidebar. 153 | # 154 | # html_logo = None 155 | 156 | # The name of an image file (relative to this directory) to use as a favicon of 157 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 158 | # pixels large. 159 | # 160 | # html_favicon = None 161 | 162 | # Add any paths that contain custom static files (such as style sheets) here, 163 | # relative to this directory. They are copied after the builtin static files, 164 | # so a file named "default.css" will overwrite the builtin "default.css". 165 | html_static_path = ['_static'] 166 | 167 | # Add any extra paths that contain custom files (such as robots.txt or 168 | # .htaccess) here, relative to this directory. These files are copied 169 | # directly to the root of the documentation. 170 | # 171 | # html_extra_path = [] 172 | 173 | # If not None, a 'Last updated on:' timestamp is inserted at every page 174 | # bottom, using the given strftime format. 175 | # The empty string is equivalent to '%b %d, %Y'. 176 | # 177 | # html_last_updated_fmt = None 178 | 179 | # If true, SmartyPants will be used to convert quotes and dashes to 180 | # typographically correct entities. 181 | # 182 | # html_use_smartypants = True 183 | 184 | # Custom sidebar templates, maps document names to template names. 185 | # 186 | # html_sidebars = {} 187 | 188 | # Additional templates that should be rendered to pages, maps page names to 189 | # template names. 190 | # 191 | # html_additional_pages = {} 192 | 193 | # If false, no module index is generated. 194 | # 195 | # html_domain_indices = True 196 | 197 | # If false, no index is generated. 198 | # 199 | # html_use_index = True 200 | 201 | # If true, the index is split into individual pages for each letter. 202 | # 203 | # html_split_index = False 204 | 205 | # If true, links to the reST sources are added to the pages. 206 | # 207 | # html_show_sourcelink = True 208 | 209 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 210 | # 211 | # html_show_sphinx = True 212 | 213 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 214 | # 215 | # html_show_copyright = True 216 | 217 | # If true, an OpenSearch description file will be output, and all pages will 218 | # contain a <link> tag referring to it. The value of this option must be the 219 | # base URL from which the finished HTML is served. 220 | # 221 | # html_use_opensearch = '' 222 | 223 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 224 | # html_file_suffix = None 225 | 226 | # Language to be used for generating the HTML full-text search index. 227 | # Sphinx supports the following languages: 228 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 229 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 230 | # 231 | # html_search_language = 'en' 232 | 233 | # A dictionary with options for the search language support, empty by default. 234 | # 'ja' uses this config value. 235 | # For 'zh', the user can customize the `jieba` dictionary path. 236 | # 237 | # html_search_options = {'type': 'default'} 238 | 239 | # The name of a javascript file (relative to the configuration directory) that 240 | # implements a search results scorer. If empty, the default will be used. 241 | # 242 | # html_search_scorer = 'scorer.js' 243 | 244 | # Output file base name for HTML help builder.
245 | htmlhelp_basename = 'autoencoderdoc' 246 | 247 | # -- Options for LaTeX output --------------------------------------------- 248 | 249 | latex_elements = { 250 | # The paper size ('letterpaper' or 'a4paper'). 251 | # 252 | # 'papersize': 'letterpaper', 253 | 254 | # The font size ('10pt', '11pt' or '12pt'). 255 | # 256 | # 'pointsize': '10pt', 257 | 258 | # Additional stuff for the LaTeX preamble. 259 | # 260 | # 'preamble': '', 261 | 262 | # Latex figure (float) alignment 263 | # 264 | # 'figure_align': 'htbp', 265 | } 266 | 267 | # Grouping the document tree into LaTeX files. List of tuples 268 | # (source start file, target name, title, 269 | # author, documentclass [howto, manual, or own class]). 270 | latex_documents = [ 271 | (master_doc, 'autoencoder.tex', 'autoencoder Documentation', 272 | 'Gokcen Eraslan', 'manual'), 273 | ] 274 | 275 | # The name of an image file (relative to this directory) to place at the top of 276 | # the title page. 277 | # 278 | # latex_logo = None 279 | 280 | # For "manual" documents, if this is true, then toplevel headings are parts, 281 | # not chapters. 282 | # 283 | # latex_use_parts = False 284 | 285 | # If true, show page references after internal links. 286 | # 287 | # latex_show_pagerefs = False 288 | 289 | # If true, show URL addresses after external links. 290 | # 291 | # latex_show_urls = False 292 | 293 | # Documents to append as an appendix to all manuals. 294 | # 295 | # latex_appendices = [] 296 | 297 | # If false, will not define \strong, \code, \titleref, \crossref ... but only 298 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clashes with user-added 299 | # packages. 300 | # 301 | # latex_keep_old_macro_names = True 302 | 303 | # If false, no module index is generated. 304 | # 305 | # latex_domain_indices = True 306 | 307 | 308 | # -- Options for manual page output --------------------------------------- 309 | 310 | # One entry per manual page. List of tuples 311 | # (source start file, name, description, authors, manual section). 312 | man_pages = [ 313 | (master_doc, 'autoencoder', 'autoencoder Documentation', 314 | [author], 1) 315 | ] 316 | 317 | # If true, show URL addresses after external links. 318 | # 319 | # man_show_urls = False 320 | 321 | 322 | # -- Options for Texinfo output ------------------------------------------- 323 | 324 | # Grouping the document tree into Texinfo files. List of tuples 325 | # (source start file, target name, title, author, 326 | # dir menu entry, description, category) 327 | texinfo_documents = [ 328 | (master_doc, 'autoencoder', 'autoencoder Documentation', 329 | author, 'autoencoder', 'One line description of project.', 330 | 'Miscellaneous'), 331 | ] 332 | 333 | # Documents to append as an appendix to all manuals. 334 | # 335 | # texinfo_appendices = [] 336 | 337 | # If false, no module index is generated. 338 | # 339 | # texinfo_domain_indices = True 340 | 341 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 342 | # 343 | # texinfo_show_urls = 'footnote' 344 | 345 | # If true, do not generate a @detailmenu in the "Top" node's menu. 346 | # 347 | # texinfo_no_detailmenu = False 348 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. autoencoder documentation master file, created by 2 | sphinx-quickstart on Sun Apr 9 12:35:06 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to autoencoder's documentation!
7 | =======================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 |
15 | API documentation
16 | =================
17 |
18 |
19 | .. automodule:: dca.api
20 | :members:
21 | .. automodule:: dca.loss
22 | :members:
23 | .. automodule:: dca.network
24 | :members:
25 | .. automodule:: dca.train
26 | :members:
27 | .. automodule:: dca.io
28 | :members:
29 |
30 |
31 |
32 | Indices and tables
33 | ==================
34 |
35 | * :ref:`genindex`
36 | * :ref:`modindex`
37 | * :ref:`search`
38 |
39 |
-------------------------------------------------------------------------------- /pytest.ini: --------------------------------------------------------------------------------
1 | [pytest]
2 | python_files=dca/test*.py
-------------------------------------------------------------------------------- /reproducibility/code/Figure4.R: --------------------------------------------------------------------------------
1 | # Load matrices ####
2 | withoutDropout <- read.csv("../data/francesconi/francesconi_withoutDropout.csv", row.names = 1)
3 | withDropout <- read.csv("../data/francesconi/francesconi_withDropout.csv", row.names = 1)
4 | dca <- read.csv("../data/francesconi/francesconi_dca.csv", row.names = 1)
5 | magic <- read.csv("../data/francesconi/francesconi_magic.csv", row.names = 1)
6 | scimpute <- read.csv("../data/francesconi/francesconi_scimpute.csv", row.names = 1)
7 | saver <- read.csv("../data/francesconi/francesconi_saver.csv", row.names = 1)
8 |
9 | # Generate heatmaps ####
10 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
11 | pvals <- unlist(lapply(cors, function(x) x$p.value))
12 | coefs <- unlist(lapply(cors, function(x) x$estimate))
13 | genes.up <- names(head(sort(pvals[coefs > 0]), 100))
14 | genes.down <- names(head(sort(pvals[coefs < 0]), 100))
15 |
16 | genHeatmap <- function(matr){
17 | library(gplots)
18 | load("../data/BlueYellowColormaps_V1.RData")
19 | genes <- c(genes.up, genes.down)
20 | rowOrd <- order(unlist(lapply(cors[genes], function(x) x$estimate)))
21 | matr <- matr[match(genes, rownames(withoutDropout)),]
22 | tmp <- data.matrix(matr)
23 | tmp <- t(apply(tmp, 1, function(x) (x - mean(x)) / sd(x)))
24 | tmp[which(tmp > 2)] <- 2
25 | tmp[which(tmp < (-2))] <- (-2)
26 | tmp[which(matr == 0)] <- NA
27 | heatmap.2(tmp[rowOrd,], Rowv = NA, Colv = NA, density.info = "none", trace = "none", col = yellow2blue, na.color = "grey", scale = "none", labRow = "", labCol = "")
28 | }
29 |
30 | genHeatmap(withoutDropout) # Panel A
31 | genHeatmap(withDropout) # Panel B
32 | genHeatmap(dca) # Panel C
33 |
34 | # Generate boxplot ####
35 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
36 | pvals <- unlist(lapply(cors, function(x) x$p.value))
37 | coefs <- unlist(lapply(cors, function(x) x$estimate))
38 |
39 | genes <- names(head(sort(unlist(lapply(cors, function(x) x$p.value))), 500))
40 |
41 | calc.cor <- function(x){
42 | ok <- match(genes, rownames(withoutDropout))
43 | C <- apply(x[ok,], 1, function(y) cor(y, 1:ncol(x)))
44 | return(abs(C))
45 | }
46 |
47 | imputed <- list(withoutDropout, withDropout, dca, saver, scimpute, magic)
48 | imputed <- lapply(imputed, data.matrix)
49 |
50 | imp <- lapply(imputed, calc.cor)
51 |
boxplot(imp, main = "Correlation with Time", ylab = "Pearson Correlation", names = c("Without noise", "With noise", "AE", "SAVER", "scImpute", "MAGIC"),
52 | cex.main = 2, cex.lab = 1.5, cex.axis = 1.5, outline = F)
53 |
54 |
55 | # Generate correlation plots ####
56 | genes <- c("tbx-36", "his-8")
57 |
58 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
59 | pvals <- unlist(lapply(cors, function(x) x$p.value))
60 | coefs <- unlist(lapply(cors, function(x) x$estimate))
61 |
62 | genes.up <- names(head(sort(pvals[coefs > 0]), 100))
63 | genes.down <- names(head(sort(pvals[coefs < 0]), 100))
64 |
65 | scale01 <- function(x) (x - min(x)) / (max(x) - min(x))
66 | genCorPlot <- function(gene1, gene2, matr){
67 | par(mfrow = c(1, 3))
68 | library(plotrix)
69 | farben <- color.scale(1:206, extremes = c("blue", "red"), alpha = 0.8)
70 | plot(scale01(exp(unlist(withoutDropout[gene1,]))), scale01(exp(unlist(withoutDropout[gene2,]))), col = farben, pch = 16, main = "Original", ylab = gene2, xlab = gene1)
71 | correl <- signif(cor(method = "spearman", scale01(exp(unlist(withoutDropout[gene1,]))), scale01(exp(unlist(withoutDropout[gene2,])))), 2)
72 | legend("topright", paste("Spearman Rho", correl))
73 |
74 | plot(scale01(unlist(withDropout[gene1,])), scale01(unlist(withDropout[gene2,])), col = farben, pch = 16, main = "Dropout", ylab = gene2, xlab = gene1)
75 | correl <- signif(cor(method = "spearman", scale01(unlist(withDropout[gene1,])), scale01(unlist(withDropout[gene2,]))), 2)
76 | legend("topright", paste("Spearman Rho", correl))
77 |
78 | plot(scale01(unlist(matr[gene1,])), scale01(unlist(matr[gene2,])), col = farben, pch = 16, main = "Denoised", ylab = gene2, xlab = gene1)
79 | correl <- signif(cor(method = "spearman", scale01(unlist(matr[gene1,])), scale01(unlist(matr[gene2,]))), 2)
80 | legend("topright", paste("Spearman Rho", correl))
81 |
82 | }
83 |
84 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = dca) # Panels E, F, G
85 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = saver)
86 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = magic)
87 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = scimpute)
88 |
-------------------------------------------------------------------------------- /reproducibility/code/Figure5.R: --------------------------------------------------------------------------------
1 | # Load libraries ####
2 | library(DESeq2)
3 | library(plotrix)
4 | library(ggplot2)
5 | library(beeswarm)
6 |
7 | # Load DESeq2 results ####
8 | load("../data/chu/chu_deseq2_results.RData")
9 |
10 | # Generate fold change plots ####
11 | # Panels A and B
12 | pdf(useDingbats = F, "../figs/Fig5_A_B.pdf", width = 8, height = 4.5)
13 | par(mfrow = c(1, 2))
14 | diffs <- list(abs(res_original$log2FoldChange - res_bulk$log2FoldChange),
15 | abs(res_dca$log2FoldChange - res_bulk$log2FoldChange))
16 | farben <- color.scale(unlist(diffs), alpha = 0.8, extremes = c("darkblue", "darkred"))
17 | plot(res_original$log2FoldChange, res_bulk$log2FoldChange, main = "Original", ylab = "Bulk", xlab = "Estimated fold change", ylim = c(-5, 15), xlim = c(-30, 30), col = farben[1:1000], pch = 16)
18 | abline(0, 1)
19 | abline(v = 0, h = 0, col = "grey", lty = 2)
20 | legend("bottomright", paste("Rho:", signif(cor(res_original$log2FoldChange, res_bulk$log2FoldChange), 2)), bty = "n")
21 | plot(res_dca$log2FoldChange, res_bulk$log2FoldChange, main = "DCA denoised", ylab = "Bulk", xlab = "Estimated fold change", ylim = c(-5, 15), xlim
= c(-30, 30), col = farben[1001:2000], pch = 16) 22 | abline(0, 1) 23 | abline(v = 0, h = 0, col = "grey", lty = 2) 24 | legend("bottomright", paste("Rho:", signif(cor(res_dca$log2FoldChange, res_bulk$log2FoldChange), 2)), bty = "n") 25 | dev.off() 26 | 27 | # Load expression tables #### 28 | bulk <- data.matrix(read.csv("../data/chu/chu_bulk.csv", row.names = 1)) 29 | treat_bulk <- colnames(bulk) 30 | treat_bulk <- unlist(lapply(treat_bulk, function(x) strsplit(x, "_", fixed = T)[[1]][1])) 31 | ok <- which(treat_bulk %in% c("H1", "DEC")) 32 | treat_bulk <- treat_bulk[ok] 33 | bulk <- bulk[, ok] 34 | 35 | counts <- read.csv("../data/chu/chu_original.csv", row.names = 1) 36 | counts <- round(counts) 37 | treat <- unlist(lapply(colnames(counts), function(x) strsplit(x, "_", fixed = T)[[1]][1])) 38 | farben <- c("black", "yellow", "blue", "purple", "green", "red", "grey") 39 | names(farben) <- c("H1", "H9", "EC", "NPC", "DEC", "HFF", "TB") 40 | ok <- which(treat %in% c("H1", "DEC")) 41 | counts <- counts[, ok] 42 | treat <- treat[ok] 43 | dca <- data.matrix(read.csv("../data/chu/chu_dca.csv", row.names = 1)) 44 | original <- data.matrix(counts[rownames(dca),]) 45 | bulk <- data.matrix(bulk[rownames(dca),]) 46 | 47 | # Generate single gene plots #### 48 | # Panels C, D and E 49 | pdf(useDingbats = F, "../figs/Fig5_C_D_E.pdf", width = 9, height = 3.5) 50 | par(mfrow = c(1, 3)) 51 | gene <- "LEFTY1" 52 | boxplot(split(original[gene,], treat)[c("H1", "DEC")], outline = FALSE, main = "Original", ylim = c(0, 5000), ylab = gene) 53 | #beeswarm(split(original[gene,], treat)[c("H1", "DEC")], pch = 16, add = TRUE, cex = 0.8) 54 | boxplot(split(dca[gene,], treat)[c("H1", "DEC")], outline = FALSE, main = "DCA denoised", ylim = c(0, 5000), ylab = gene) 55 | #beeswarm(split(dca[gene,], treat)[c("H1", "DEC")], pch = 16, add = TRUE, cex = 0.8) 56 | boxplot(split(bulk[gene,], treat_bulk)[c("H1", "DEC")], outline = FALSE, main = "Bulk", ylab = gene) 57 | dev.off() 58 | 59 | # Generate boxplot #### 60 | # Panel F 61 | load("../data/chu/HundredTimes_20cells.RData") 62 | load("../data/chu/chu_deseq2_results.RData") 63 | 64 | res_bulk <- res_bulk[rownames(res_original), ] 65 | res_bulk$log2FoldChange <- res_bulk$log2FoldChange*(-1) 66 | tmp <- lapply(1:5, function(y) unlist(lapply(1:100, function(x) cor(res_bulk$log2FoldChange, hundredTimes[[x]][[y]]$log2FoldChange, use = "complete")))) 67 | 68 | colors <- list(c(192, 81, 158), c(73, 93, 115), c(152, 201, 125), c(117, 90, 36)) 69 | colors <- c("white", unlist(lapply(colors, function(x) rgb((x/sum(x))[1], (x/sum(x))[2], (x/sum(x))[3])))) 70 | 71 | pdf(useDingbats = F, "../figs/Fig5_F.pdf", height = 4, width = 3.5) 72 | boxplot(tmp, names = c("original", "DCA", "SAVER", "MAGIC", "scImpute"), ylab = "Pearson correlation", las = 2, col = colors, outline = F) 73 | dev.off() 74 | 75 | 76 | -------------------------------------------------------------------------------- /reproducibility/code/Figure6.R: -------------------------------------------------------------------------------- 1 | # Load pre-calculated Seurat object #### 2 | library(Seurat) 3 | load("../data/stoeckius/CBMC.seurat.RData") 4 | 5 | # Generate tSNE visualization showing celltype clustering (Fig Panel A) #### 6 | panelA <- TSNEPlot(cbmc, do.label = TRUE, pt.size = 0.5) 7 | panelA 8 | 9 | # Load imputed data #### 10 | dca <- read.csv("../data/stoeckius/stoeckius_dca.csv", row.names = 1) 11 | magic <- read.csv("../data/stoeckius/stoeckius_magic.csv", row.names = 1) 12 | saver <- 
read.csv("../data/stoeckius/stoeckius_saver.csv", row.names = 1) 13 | scimpute <- read.csv("../data/stoeckius/stoeckius_scimpute.csv", row.names = 1) 14 | 15 | # Define protein-mRNA pairs #### 16 | protein <- c("CD3", "CD19", "CD4", "CD8", "CD56", "CD16", "CD11c", "CD14") 17 | rna <- c("CD3E", "CD19", "CD4", "CD8A", "NCAM1", "FCGR3A", "ITGAX", "CD14") 18 | 19 | # Add imputed RNA levels to Seurat object #### 20 | tmp <- dca 21 | rownames(tmp) <- gsub("HUMAN", "IMPUTED", rownames(tmp)) 22 | cbmc <- SetAssayData(cbmc, assay.type = "IMPUTED", slot = "raw.data", new.data = data.matrix(tmp)) 23 | cbmc <- NormalizeData(cbmc, assay.type = "IMPUTED") 24 | cbmc <- ScaleData(cbmc, assay.type = "IMPUTED", display.progress = FALSE) 25 | 26 | tmp <- magic 27 | rownames(tmp) <- gsub("HUMAN", "MAGIC", rownames(tmp)) 28 | cbmc <- SetAssayData(cbmc, assay.type = "MAGIC", slot = "raw.data", new.data = data.matrix(tmp)) 29 | cbmc <- NormalizeData(cbmc, assay.type = "MAGIC") 30 | cbmc <- ScaleData(cbmc, assay.type = "MAGIC", display.progress = FALSE) 31 | 32 | tmp <- saver 33 | rownames(tmp) <- gsub("HUMAN", "SAVER", rownames(tmp)) 34 | cbmc <- SetAssayData(cbmc, assay.type = "SAVER", slot = "raw.data", new.data = data.matrix(tmp)) 35 | cbmc <- NormalizeData(cbmc, assay.type = "SAVER") 36 | cbmc <- ScaleData(cbmc, assay.type = "SAVER", display.progress = FALSE) 37 | 38 | tmp <- scimpute 39 | rownames(tmp) <- gsub("HUMAN", "SCIMPUTE", rownames(tmp)) 40 | cbmc <- SetAssayData(cbmc, assay.type = "SCIMPUTE", slot = "raw.data", new.data = data.matrix(tmp)) 41 | cbmc <- NormalizeData(cbmc, assay.type = "SCIMPUTE") 42 | cbmc <- ScaleData(cbmc, assay.type = "SCIMPUTE", display.progress = FALSE) 43 | 44 | # tSNE colored by imputed and original RNA expression (Fig Panel B) #### 45 | panelB1 <- FeaturePlot(cbmc, features.plot = c(paste0("CITE_", protein[1:4]), paste0("HUMAN_", rna[1:4]), paste0("IMPUTED_", rna[1:4])), 46 | min.cutoff = "q05", max.cutoff = "q95", nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5, do.return = T) 47 | panelB2 <- FeaturePlot(cbmc, features.plot = c(paste0("CITE_", protein[5:8]), paste0("HUMAN_", rna[5:8]), paste0("IMPUTED_", rna[5:8])), 48 | min.cutoff = "q05", max.cutoff = "q95", nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5, do.return = T) 49 | 50 | # Example plot of CD3 expression in T cells (Fig Panel C) #### 51 | tmp <- SubsetData(cbmc, ident.use = c(0, 5)) 52 | rna.raw <- tmp@data["HUMAN_CD3E",] 53 | protein <- tmp@assay$CITE@scale.data["CITE_CD3",] 54 | rna.imputed <- tmp@assay$IMPUTED@scale.data["IMPUTED_CD3E",] 55 | table(rna.raw == 0)[["TRUE"]]/length(rna.raw) 56 | scale01 <- function(x){ 57 | x <- (x-min(x)) / (max(x) - min(x)) 58 | x - median(x) 59 | } 60 | aframe <- data.frame(Relative.expresion = c(scale01(protein), scale01(rna.raw), scale01(rna.imputed)), type = c(rep("Protein", length(protein)), rep("Original", length(protein)), rep("Denoised", length(protein)))) 61 | panelC <- ggplot(aframe, aes(Relative.expresion, colour = type)) + geom_density() + ggtitle("CD3 in T cells") 62 | panelC 63 | 64 | # Calculate likelihoods of co-occurrence (Fig Panel D) #### 65 | protein <- c("CD3", "CD19", "CD4", "CD8", "CD56", "CD16", "CD11c", "CD14") 66 | rna <- c("CD3E", "CD19", "CD4", "CD8A", "NCAM1", "FCGR3A", "ITGAX", "CD14")= 67 | l <- list(cor(t(cbmc@scale.data[paste0("HUMAN_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"), 68 | cor(t(cbmc@assay$IMPUTED@scale.data[paste0("IMPUTED_", rna),]), 
t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
69 | cor(t(cbmc@assay$MAGIC@scale.data[paste0("MAGIC_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
70 | cor(t(cbmc@assay$SAVER@scale.data[paste0("SAVER_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
71 | cor(t(cbmc@assay$SCIMPUTE@scale.data[paste0("SCIMPUTE_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"))
72 | l <- lapply(l, diag)
73 | boxplot(l, ylab = "Spearman Correlation", names = c("Original", "DCA", "MAGIC", "SAVER", "scImpute"), las = 2)
74 |
75 |
76 |
77 |
-------------------------------------------------------------------------------- /reproducibility/code/Figure8.R: --------------------------------------------------------------------------------
1 | # Load pre-calculated Seurat object ####
2 | library(Seurat)
3 | load("../data/stoeckius/CBMC.seurat.RData")
4 |
5 | # Load DCA denoised data ####
6 | dca <- read.csv("../data/stoeckius/stoeckius_dca.csv", row.names = 1)
7 |
8 | # Add imputed data to Seurat object ####
9 | tmp <- dca
10 | rownames(tmp) <- gsub("HUMAN", "IMPUTED", rownames(tmp))
11 | cbmc <- SetAssayData(cbmc, assay.type = "IMPUTED", slot = "raw.data", new.data = data.matrix(tmp))
12 | cbmc <- NormalizeData(cbmc, assay.type = "IMPUTED")
13 | cbmc <- ScaleData(cbmc, assay.type = "IMPUTED", display.progress = FALSE)
14 |
15 | # Subset to NK cells ####
16 | sub <- SubsetData(cbmc, ident.use = 3)
17 | sub <- ScaleData(sub, assay.type = "CITE", display.progress = FALSE)
18 | sub <- ScaleData(sub, display.progress = FALSE, vars.to.regress = "nUMI")
19 |
20 | # Generate tSNEs colored by protein levels ####
21 | FeaturePlot(sub, c("CITE_CD56", "CITE_CD16"), min.cutoff = "q01", max.cutoff = "q99", cols.use = c("grey", "blue")) # Panel A & B
22 |
23 | # Generate scatterplot of expression levels ####
24 | par(mfrow = c(1,3))
25 | library(mclust)
26 | tmp <- sub@assay$CITE@scale.data[c('CITE_CD56', 'CITE_CD16'),]
27 | m_prot <- Mclust(t(tmp), G = 2)
28 | plot(t(tmp), col = m_prot$classification, main = 'Protein', pch = 16) # Panel C
29 |
30 | tmp <- data.matrix(sub@data[c('HUMAN_NCAM1', 'HUMAN_FCGR3A'),])
31 | m_orig <- Mclust(t(tmp), G = 2)
32 | plot(t(tmp), col = m_prot$classification, main = 'Original RNA', pch = 16) # Panel D
33 |
34 | tmp <- data.matrix(sub@assay$IMPUTED@data[c('IMPUTED_NCAM1', 'IMPUTED_FCGR3A'),])
35 | m_imp <- Mclust(t(tmp), G = 2)
36 | plot(t(tmp), col = m_prot$classification, main = 'Denoised RNA', pch = 16) # Panel E
37 |
38 | fisher.test(table(m_prot$classification == 1, m_imp$classification == 2))
39 | fisher.test(table(m_prot$classification == 1, m_orig$classification == 2))
-------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingDCA.sh: --------------------------------------------------------------------------------
1 | dca ../data/chu/chu_original.csv ../data/chu/res_dca
2 | dca ../data/francesconi/francesconi_original.csv ../data/francesconi/res_dca
3 | dca --type nb-conddisp ../data/stoeckius/stoeckius_original.csv ../data/stoeckius/res_dca
-------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingMAGIC.py: --------------------------------------------------------------------------------
1 | import magic
2 | import os
3 |
4 | scdata = magic.mg.SCData.from_csv("../data/chu/chu_original.csv", cell_axis="columns",
data_type='sc-seq') 5 | scdata.run_magic() 6 | mdata = scdata.magic.data 7 | mdata=mdata.transpose() 8 | mdata.to_csv("../data/chu/chu_magic.csv") 9 | 10 | scdata = magic.mg.SCData.from_csv("../data/francesconi/francesconi_original.csv", cell_axis="columns", data_type='sc-seq') 11 | scdata.run_magic() 12 | mdata = scdata.magic.data 13 | mdata=mdata.transpose() 14 | mdata.to_csv("../data/francesconi/francesconi_magic.csv") 15 | 16 | scdata = magic.mg.SCData.from_csv("../data/stoeckius/stoeckius_original.csv", cell_axis="columns", data_type='sc-seq') 17 | scdata.run_magic() 18 | mdata = scdata.magic.data 19 | mdata=mdata.transpose() 20 | mdata.to_csv("../data/stoeckius/stoeckius_magic.csv") 21 | -------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingSAVER.R: -------------------------------------------------------------------------------- 1 | # Imputation using SAVER #### 2 | library(SAVER) 3 | library(doParallel) 4 | registerDoParallel(cores = 5) 5 | sc <- read.csv("../data/francesconi/francesconi_withDropout.csv", row.names = 1) 6 | sav <- saver(as.matrix(sc), parallel = TRUE) 7 | sav <- sav$estimate 8 | write.csv(sav, file = "../data/francesconi/francesconi_saver.csv", quote = F) 9 | 10 | sc <- read.csv("../data/chu/chu_original.csv", row.names = 1) 11 | sav <- saver(as.matrix(sc), parallel = TRUE) 12 | sav <- sav$estimate 13 | write.csv(sav, file = "../data/chu/chu_saver.csv", quote = F) 14 | 15 | sc <- read.csv("../data/stoeckius/stoeckius_original.csv", row.names = 1) 16 | sav <- saver(as.matrix(sc), parallel = TRUE) 17 | sav <- sav$estimate 18 | write.csv(sav, file = "../data/stoeckius/stoeckius_saver.csv", quote = F) 19 | -------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingSCIMPUTE.R: -------------------------------------------------------------------------------- 1 | library(scImpute) 2 | 3 | scimpute(count_path = "../data/francesconi/francesconi_withDropout.csv", infile = "csv", outfile = "csv", out_dir = "../data/francesconi/scimpute", Kcluster = 1) 4 | scimpute(count_path = "../data/chu/chu_original.csv", infile = "csv", outfile = "csv", out_dir = "../data/chu/scimpute", Kcluster = 2) 5 | scimpute(count_path = "../data/stoeckius/stoeckius_original.csv", infile = "csv", outfile = "csv", out_dir = "../data/stoeckius/scimpute", Kcluster = 13) 6 | -------------------------------------------------------------------------------- /reproducibility/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # downloads 722M dataset file 4 | wget https://hmgubox.helmholtz-muenchen.de/f/1a014dc377f64b2b964c/?dl=1 -O datasets.zip 5 | mkdir data; cd data 6 | unzip ../datasets.zip 7 | -------------------------------------------------------------------------------- /scripts/seurat.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(Seurat, quietly = T)) 2 | suppressMessages(library(ggplot2, quietly = T)) 3 | suppressMessages(library(Rtsne, quietly = T)) 4 | 5 | normalize <- function(x) { 6 | sf <- rowSums(x) 7 | sf <- sf / median(sf) 8 | x <- x / sf 9 | x <- log(x+1) 10 | scale(x, center = T, scale = T) 11 | } 12 | 13 | 14 | `%+%` <- paste0 15 | args <- commandArgs(trailingOnly = T) 16 | stopifnot(length(args) == 1) 17 | arg <- args[[1]] 18 | 19 | if (!dir.exists(arg)) { 20 | files <- arg 21 | } else { 22 | files <- list.files(arg, recursive = T, pattern = 
'^counts\\..sv', full.names = T) 23 | } 24 | 25 | for (cnt.file in files) { 26 | print('Visualizing ' %+% cnt.file) 27 | 28 | output.dir <- dirname(cnt.file) 29 | tbl <- read.table(cnt.file, header = T) 30 | 31 | # Load labels if available ------------------------------------------------ 32 | 33 | if (file.exists(output.dir %+% '/info_cellinfo.tsv')) { 34 | labels <- read.table(output.dir %+% '/info_cellinfo.tsv', header=T)$Group 35 | } else if (file.exists(output.dir %+% '/../info_cellinfo.tsv')) { 36 | labels <- read.table(output.dir %+% '/../info_cellinfo.tsv', header=T)$Group 37 | } 38 | else labels <- NULL 39 | 40 | # Seurat PCA and tSNE ----------------------------------------------------- 41 | 42 | s <- CreateSeuratObject(tbl, min.cells = 1, min.genes = 1) 43 | print(s) 44 | s <- NormalizeData(s, display.progress = F) 45 | s <- ScaleData(s, display.progress = F) 46 | 47 | s <- RunPCA(s, pc.genes = rownames(s@data), do.print = F) 48 | s <- RunTSNE(s) 49 | s <- FindClusters(s, reduction.type = "pca", dims.use=1:5, save.SNN = T, print.output = 0) 50 | print('Number of clusters: ' %+% length(levels(s@ident))) 51 | 52 | DimPlot(s) 53 | ggsave(output.dir %+% '/seurat_PCA_all_CL.png') 54 | DimPlot(s, reduction.use = 'tsne') 55 | ggsave(output.dir %+% '/seurat_tSNE_all_CL.png') 56 | if (!is.null(labels)) { 57 | s@meta.data$ground.truth <- labels 58 | DimPlot(s, group.by='ground.truth') 59 | ggsave(output.dir %+% '/seurat_PCA_all_GT.png') 60 | DimPlot(s, reduction.use = 'tsne', group.by='ground.truth') 61 | ggsave(output.dir %+% '/seurat_tSNE_all_GT.png') 62 | } 63 | 64 | s <- FindVariableGenes(s, do.plot = F, display.progress = F) 65 | print('Number of variable genes: ' %+% length(s@var.genes)) 66 | s <- RunPCA(s, do.print = F) # use variable genes by default 67 | s <- RunTSNE(s) 68 | s <- FindClusters(s, reduction.type = "pca", dims.use = 1:5, save.SNN = T, print.output = 0, force.recalc = T) 69 | print('Number of clusters: ' %+% length(levels(s@ident))) 70 | 71 | DimPlot(s) 72 | ggsave(output.dir %+% '/seurat_PCA_var_CL.png') 73 | DimPlot(s, reduction.use = 'tsne') 74 | ggsave(output.dir %+% '/seurat_tSNE_var_CL.png') 75 | if (!is.null(labels)) { 76 | DimPlot(s, group.by='ground.truth') 77 | ggsave(output.dir %+% '/seurat_PCA_var_GT.png') 78 | DimPlot(s, reduction.use = 'tsne', group.by='ground.truth') 79 | ggsave(output.dir %+% '/seurat_tSNE_var_GT.png') 80 | } 81 | 82 | write.table(data.frame(label=unname(s@ident), cell=names(s@ident)), 83 | output.dir %+% '/seurat_cluster_labels.tsv', 84 | row.names = F, quote = F) 85 | 86 | saveRDS(s, output.dir %+% '/seurat.Rds') 87 | 88 | # PCA and tSNE with sf and lognorm ---------------------------------------- 89 | 90 | if (!is.null(labels)) { 91 | counts <- t(tbl) 92 | counts <- counts[, colSums(counts)>0] 93 | norm.counts <- normalize(counts) 94 | 95 | pca.counts <- prcomp(norm.counts, rank. = 2)$x 96 | qplot(pca.counts[,1], pca.counts[,2], color=labels, xlab='PC1', ylab='PC2') 97 | ggsave(output.dir %+% '/seurat_PCA_all_simplepre_GT.png') 98 | 99 | tsne.counts <- Rtsne(norm.counts)$Y 100 | qplot(tsne.counts[,1], tsne.counts[,2], color=labels, xlab='tsne1', ylab='tsne2') 101 | ggsave(output.dir %+% '/seurat_tSNE_all_simplepre_GT.png') 102 | 103 | if (file.exists(output.dir %+% '/info_truecounts.tsv')) { 104 | 105 | tr <- t(read.table(output.dir %+% '/info_truecounts.tsv')) 106 | tr<- tr[, colSums(tr)>0] 107 | tr.norm <- normalize(tr) 108 | pca.tr <- prcomp(tr.norm, rank. 
= 2)$x
109 | qplot(pca.tr[,1], pca.tr[,2], color=labels, xlab='pca1', ylab='pca2')
110 | ggsave(output.dir %+% '/seurat_TRUECOUNT_PCA_all_simplepre_GT.png')
111 |
112 | tsne.tr <- Rtsne(tr.norm)$Y
113 | qplot(tsne.tr[,1], tsne.tr[,2], color=labels, xlab='tsne1', ylab='tsne2')
114 | ggsave(output.dir %+% '/seurat_TRUECOUNT_tSNE_all_simplepre_GT.png')
115 |
116 | }
117 | }
118 |
119 | }
-------------------------------------------------------------------------------- /scripts/simulate.R: --------------------------------------------------------------------------------
1 | # Warning! R 3.4 and Bioconductor 3.5 are required for splatter!
2 | # library(BiocInstaller)
3 | # biocLite('splatter')
4 | library(splatter) # requires splatter >= 1.2.0
5 |
6 | save.sim <- function(sim, dir) {
7 | counts <- counts(sim)
8 | truecounts <- assays(sim)$TrueCounts
9 | drp <- 'Dropout' %in% names(assays(sim))
10 | if (drp) {
11 | dropout <- assays(sim)$Dropout
12 | mode(dropout) <- 'integer'
13 | }
14 | cellinfo <- colData(sim)
15 | geneinfo <- rowData(sim)
16 |
17 | # save count matrices
18 | write.table(counts, paste0(dir, '/counts.tsv'),
19 | sep='\t', row.names=T, col.names=T, quote=F)
20 | write.table(truecounts, paste0(dir, '/info_truecounts.tsv'),
21 | sep='\t', row.names=T, col.names=T, quote=F)
22 |
23 | if (drp) {
24 | # save ground truth dropout labels
25 | write.table(dropout, paste0(dir, '/info_dropout.tsv'),
26 | sep='\t', row.names=T, col.names=T, quote=F)
27 | }
28 |
29 | # save metadata
30 | write.table(cellinfo, paste0(dir, '/info_cellinfo.tsv'), sep='\t',
31 | row.names=F, quote=F)
32 | write.table(geneinfo, paste0(dir, '/info_geneinfo.tsv'), sep='\t',
33 | row.names=F, quote=F)
34 |
35 | saveRDS(sim, paste0(dir, '/sce.rds'))
36 | }
37 |
38 |
39 | for (dropout in c(0, 1, 3, 5)) {
40 | for (ngroup in c(1, 2, 3, 6)) {
41 | for(swap in c(F, T)) {
42 |
43 | nGenes <- 200
44 | batchCells <- 2000
45 |
46 | if (swap) {
47 | tmp <- nGenes
48 | nGenes <- batchCells
49 | batchCells <- tmp
50 | }
51 |
52 | # split nCells into roughly ngroup groups
53 | if(ngroup==1) {
54 | group.prob <- 1
55 | } else {
56 | group.prob <- rep(1, ngroup)/ngroup
57 | }
58 | method <- ifelse(ngroup == 1, 'single', 'groups')
59 |
60 | dirname <- paste0('real/group', ngroup, '/dropout', dropout, ifelse(swap, '/swap', ''))
61 | if (!dir.exists(dirname))
62 | dir.create(dirname, showWarnings=F, recursive=T)
63 |
64 | #### Estimate parameters from the real dataset
65 | data(sc_example_counts)
66 | params <- splatEstimate(sc_example_counts)
67 |
68 | # simulate scRNA data
69 | sim <- splatSimulate(params, group.prob=group.prob, nGenes=nGenes,
70 | dropout.present=(dropout!=0), dropout.shape=-1,
71 | dropout.mid=dropout, seed=42, method=method,
72 | bcv.common=1) # limit disp to get fewer true zeros
73 | save.sim(sim, dirname)
74 |
75 |
76 | dirname <- paste0('sim/group', ngroup, '/dropout', dropout, ifelse(swap, '/swap', ''))
77 | if (!dir.exists(dirname))
78 | dir.create(dirname, showWarnings=F, recursive=T)
79 |
80 | #### Simulate data without using real data
81 | sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
82 | dropout.present=(dropout!=0), method=method,
83 | seed=42, dropout.shape=-1, dropout.mid=dropout)
84 | save.sim(sim, dirname)
85 | }
86 | }
87 | }
88 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
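# NOTE: the 'dca' console script declared in entry_points below is the command invoked by reproducibility/code/ImputeUsingDCA.sh.
4 |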
name='DCA', 5 | version='0.3.3', 6 | description='Count autoencoder for scRNA-seq denoising', 7 | author='Gokcen Eraslan', 8 | author_email="gokcen.eraslan@gmail.com", 9 | packages=['dca'], 10 | install_requires=['numpy>=1.7', 11 | 'keras>=2.4,<2.6', 12 | 'tensorflow>=2.0,<2.5', 13 | 'h5py', 14 | 'six>=1.10.0', 15 | 'scikit-learn', 16 | 'scanpy', 17 | 'kopt', 18 | 'pandas' #for preprocessing 19 | ], 20 | url='https://github.com/theislab/dca', 21 | entry_points={ 22 | 'console_scripts': [ 23 | 'dca = dca.__main__:main' 24 | ]}, 25 | license='Apache License 2.0', 26 | classifiers=['License :: OSI Approved :: Apache Software License', 27 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 28 | 'Programming Language :: Python :: 3.5'], 29 | ) 30 | --------------------------------------------------------------------------------
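Quick start. With the package installed (e.g. pip install . from the repository root), the console script defined in setup.py makes denoising a one-liner, exactly as used in reproducibility/code/ImputeUsingDCA.sh: dca input_counts.csv output_dir. For use inside Python, a minimal sketch follows; it assumes that dca/api.py exposes a dca() function taking an AnnData object of raw counts (cells as rows) and replacing adata.X with the denoised means in place — treat the exact signature and the file paths as illustrative, not authoritative.

import scanpy as sc
from dca.api import dca  # assumed entry point, based on the dca/api.py module above

# Read a raw (unnormalized) count matrix with cells as rows and genes as columns;
# the repository's example CSVs store genes in rows, so transpose those first.
adata = sc.read_csv("input_counts.csv")

# Drop all-zero genes: the model fits a per-gene noise distribution.
sc.pp.filter_genes(adata, min_counts=1)

# Assumed call: trains the count autoencoder and writes denoised means back to adata.X.
dca(adata)

adata.write("denoised_counts.h5ad")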