├── .gitattributes
├── .gitignore
├── LICENSE.txt
├── README.md
├── data
│   ├── biochemists-nb-coef.tsv
│   ├── biochemists-nb-predictions.tsv
│   ├── biochemists-zinb-coef.tsv
│   ├── biochemists-zinb-predictions.tsv
│   ├── biochemists.R
│   ├── biochemists.tsv
│   ├── test-biochemists-nb.py
│   ├── test-biochemists-zinb-ae.py
│   └── test-biochemists-zinb.py
├── dca
│   ├── __init__.py
│   ├── __main__.py
│   ├── api.py
│   ├── hyper.py
│   ├── io.py
│   ├── layers.py
│   ├── loss.py
│   ├── network.py
│   ├── test.py
│   ├── train.py
│   └── utils.py
├── docs
│   ├── Makefile
│   └── source
│       ├── conf.py
│       └── index.rst
├── pytest.ini
├── reproducibility
│   ├── code
│   │   ├── Figure2.ipynb
│   │   ├── Figure4.R
│   │   ├── Figure5.R
│   │   ├── Figure6.R
│   │   ├── Figure8.R
│   │   ├── Figure9.ipynb
│   │   ├── ImputeUsingDCA.sh
│   │   ├── ImputeUsingMAGIC.py
│   │   ├── ImputeUsingSAVER.R
│   │   └── ImputeUsingSCIMPUTE.R
│   └── download.sh
├── scripts
│   ├── seurat.R
│   └── simulate.R
├── setup.py
└── tutorial.ipynb
/.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dist 3 | logs 4 | build 5 | *.egg-info 6 | .Rproj.user 7 | docs/build 8 | data/simulation/ 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 Gokcen Eraslan 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Deep count autoencoder for denoising scRNA-seq data 2 | 3 | A deep count autoencoder network that denoises scRNA-seq data and removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account, using a zero-inflated negative binomial (ZINB) loss function. 
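For reference, the ZINB likelihood behind this loss can be written down compactly. Below is a minimal NumPy sketch of the element-wise ZINB negative log-likelihood (illustrative only; the loss actually used for training is implemented in `dca/loss.py` and may differ in details):

```
import numpy as np
from scipy.special import gammaln

def zinb_nll(x, mu, theta, pi, eps=1e-10):
    # Element-wise negative log-likelihood of ZINB(mu, theta, pi).
    # x: observed counts, mu: NB mean, theta: NB dispersion,
    # pi: dropout (zero-inflation) probability.
    # log NB(x | mu, theta) in the mean/dispersion parameterization:
    log_nb = (gammaln(x + theta) - gammaln(theta) - gammaln(x + 1.0)
              + theta * np.log(theta / (theta + mu) + eps)
              + x * np.log(mu / (theta + mu) + eps))
    # A zero can come from the dropout component (probability pi) or
    # from the NB itself; positive counts can only come from the NB.
    log_nb_zero = theta * np.log(theta / (theta + mu) + eps)  # log NB(0)
    zero_case = np.log(pi + (1.0 - pi) * np.exp(log_nb_zero) + eps)
    nonzero_case = np.log(1.0 - pi + eps) + log_nb
    return -np.where(x < 0.5, zero_case, nonzero_case)
```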
4 | 5 | See our [manuscript](https://www.nature.com/articles/s41467-018-07931-2) and [tutorial](https://nbviewer.ipython.org/github/theislab/dca/blob/master/tutorial.ipynb) for more details. 6 | 7 | ### Installation 8 | 9 | #### pip 10 | 11 | For a traditional Python installation of the count autoencoder and the required packages, use 12 | 13 | ``` 14 | $ pip install dca 15 | ``` 16 | 17 | #### conda 18 | 19 | Another way to install the count autoencoder and the required packages is to use [Conda](https://conda.io/docs/) (most easily obtained via the [Miniconda Python distribution](https://conda.io/miniconda.html)). Afterwards, run the following command. 20 | 21 | ``` 22 | $ conda install -c bioconda dca 23 | ``` 24 | 25 | ### Usage 26 | 27 | You can run the autoencoder from the command line: 28 | 29 | `dca matrix.csv results` 30 | 31 | where `matrix.csv` is a CSV/TSV-formatted raw count matrix with genes in rows and cells in columns. Cell and gene labels are mandatory. 32 | 33 | ### Results 34 | 35 | The output folder contains the main output file (representing the mean parameter of the ZINB distribution) as well as some additional matrices in TSV format; a short Python sketch for loading these files is given at the end of this README: 36 | 37 | - `mean.tsv` is the main output of the method and represents the mean parameter of the ZINB distribution. This file has the same dimensions as the input file (except that zero-expression genes or cells are excluded). It is formatted as a `gene x cell` matrix. Additionally, the `mean_norm.tsv` file contains the library size-normalized expression of each cell and gene. See the `normalize_total` function from [Scanpy](https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_total.html) for details about the default library size normalization method used in DCA. 38 | 39 | - The `pi.tsv` and `dispersion.tsv` files contain the dropout probability and dispersion estimate for each cell and gene. Matrix dimensions are the same as for `mean.tsv` and the input file. 40 | 41 | - The `reduced.tsv` file contains the hidden representation of each cell (in a 32-dimensional space by default), i.e. the activations of the bottleneck neurons. 42 | 43 | Use the `-h` option to see all available parameters and defaults. 44 | 45 | ### Hyperparameter optimization 46 | 47 | You can run the autoencoder with the `--hyper` option to perform a hyperparameter search. 
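### Loading the results in Python

The outputs described in the Results section are plain TSV matrices, so they are easy to inspect. A minimal sketch, assuming pandas is installed, `results` is the output folder from the usage example above, and the files carry gene/cell labels as the input does:

```
import pandas as pd

# Labeled gene x cell matrices, as described in the Results section
# (index_col=0 assumes the first column holds the gene labels).
mean = pd.read_csv('results/mean.tsv', sep='\t', index_col=0)
pi = pd.read_csv('results/pi.tsv', sep='\t', index_col=0)
dispersion = pd.read_csv('results/dispersion.tsv', sep='\t', index_col=0)

# Hidden representation: one 32-dimensional vector per cell by default.
reduced = pd.read_csv('results/reduced.tsv', sep='\t', index_col=0)

print(mean.shape, reduced.shape)
```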
48 | -------------------------------------------------------------------------------- /data/biochemists-nb-coef.tsv: -------------------------------------------------------------------------------- 1 | val coef 2 | 0.2561440246579784 intercept 3 | -0.2164184233058461 fem 4 | 0.15048945147514323 mar 5 | -0.17641524234590464 kid5 6 | 0.015271155545275089 phd 7 | 0.029082341647915382 ment 8 | 2.2643876948599235 theta 9 | -------------------------------------------------------------------------------- /data/biochemists-nb-predictions.tsv: -------------------------------------------------------------------------------- 1 | count 2 | 1.9130391956542903 3 | 1.2782929044159645 4 | 1.3119131406197488 5 | 1.3986188605079646 6 | 2.3469892052889474 7 | 0.9515840123801975 8 | 1.1920656151073028 9 | 1.2402772942718756 10 | 1.6506857562464894 11 | 1.232819715385078 12 | 2.0309413986296003 13 | 1.53622140026988 14 | 1.1704606628624212 15 | 1.4311328733091957 16 | 1.2430281022384175 17 | 1.12299116940048 18 | 1.5285716777907834 19 | 1.6444182751891445 20 | 1.3426067685941796 21 | 1.4433689545810553 22 | 1.4906643813523415 23 | 1.0098859412651533 24 | 1.5544832864428726 25 | 1.4773103504298528 26 | 0.9499161892728708 27 | 1.3507692350418918 28 | 1.350558801323357 29 | 1.4666465194713034 30 | 1.090802289250184 31 | 1.69775973817029 32 | 1.0792039269957219 33 | 2.023453016270353 34 | 1.553917453483086 35 | 1.3340420248011764 36 | 1.2884376881192057 37 | 1.072795558035042 38 | 1.0687076594085227 39 | 1.7921183505325313 40 | 1.3593596282612026 41 | 1.4812607664459911 42 | 1.7921183505325313 43 | 1.1557186273567475 44 | 1.5863228482269782 45 | 1.1645769831130584 46 | 1.5883686762018843 47 | 1.281765207859098 48 | 1.606236245712563 49 | 3.270696955250543 50 | 1.675663874964121 51 | 1.62050319775802 52 | 1.601413662937508 53 | 0.8758755581848123 54 | 1.5651688541739872 55 | 2.023453016270353 56 | 1.3811213317624642 57 | 1.4799468981257047 58 | 1.1582505203048081 59 | 1.359891227621199 60 | 1.375976287357663 61 | 1.3516778135768421 62 | 1.294203120133693 63 | 1.299900416675326 64 | 1.2377829072331636 65 | 1.3531559787470975 66 | 1.2944503035043777 67 | 1.432775751678407 68 | 2.1528400807185166 69 | 1.8345203662229255 70 | 1.3018870341740043 71 | 1.0566178639331978 72 | 1.4143690858831897 73 | 0.8840728133241387 74 | 1.2797090690833863 75 | 1.4620242742742222 76 | 1.3855421375184638 77 | 1.269488774699007 78 | 1.090000156097383 79 | 1.3405095860554894 80 | 1.5070329038492702 81 | 1.2638438092708362 82 | 1.1387825006673689 83 | 1.1017965140431174 84 | 2.2380888590256687 85 | 1.8494286995758793 86 | 2.699177836883008 87 | 1.1017885307505892 88 | 3.07443398911672 89 | 1.3183766679275646 90 | 2.7862907309045783 91 | 1.6507134150824274 92 | 1.3837730079424306 93 | 1.2615781559887052 94 | 1.6745852811807076 95 | 2.699177836883008 96 | 1.591661302045164 97 | 1.8840468135577844 98 | 1.7495445461388062 99 | 1.3479481861993374 100 | 1.011041277934036 101 | 2.023453016270353 102 | 1.2743090917174307 103 | 1.299900416675326 104 | 1.3214888271794254 105 | 1.4265506627771969 106 | 1.116865525389416 107 | 1.4000207507939337 108 | 1.569972773762755 109 | 1.2782929044159645 110 | 0.9983853333558727 111 | 0.9482475638299139 112 | 1.756128589424267 113 | 1.0747632979277084 114 | 1.1311961071483507 115 | 1.805493422827841 116 | 1.2250300127635236 117 | 1.4722565684306739 118 | 1.7308599799596198 119 | 1.0541568534833894 120 | 1.4799468981257047 121 | 1.4068790723226388 122 | 1.3129654808087163 123 | 1.4230148535020344 124 | 
1.7391848010061288 125 | 1.1061409853731698 126 | 1.1437601360707996 127 | 1.919306990559933 128 | 1.4064293857277659 129 | 1.6716623814650637 130 | 1.3340420248011764 131 | 1.438020552662828 132 | 1.1461099961103376 133 | 2.586804092654758 134 | 1.4562657664381922 135 | 1.2160919941663049 136 | 1.3963218428068862 137 | 2.709667619531886 138 | 1.609262566196966 139 | 1.2347622174202633 140 | 0.9513679278806935 141 | 1.1617827924083581 142 | 1.0533522518058107 143 | 1.2979262385950383 144 | 1.3422159115789083 145 | 1.1645769831130584 146 | 1.1370447770446959 147 | 1.035486037662375 148 | 1.5506945096538096 149 | 1.1370447770446959 150 | 1.6754959993528413 151 | 1.1454385625445833 152 | 1.4148011376036695 153 | 1.8112669987448937 154 | 1.4434658776015743 155 | 1.305550189160803 156 | 1.432629159082089 157 | 1.5058826374552357 158 | 1.291191279242996 159 | 1.2795136578284365 160 | 1.496766122948153 161 | 1.9797443348852555 162 | 1.3728126410218597 163 | 1.1343669888721053 164 | 1.4223869040705455 165 | 1.3958954393725276 166 | 1.8552717812142536 167 | 1.2702290005882346 168 | 1.73103339959018 169 | 1.4815864691092595 170 | 1.571194404199422 171 | 1.5685349160612598 172 | 1.356259171878278 173 | 1.3295102630895161 174 | 1.1132584490019601 175 | 1.4296451282982567 176 | 1.1588136364558406 177 | 1.166771869342331 178 | 1.655569763764054 179 | 1.2848610321322673 180 | 1.08459827352548 181 | 1.299900416675326 182 | 1.0539958839953787 183 | 2.262639335473195 184 | 1.747408447917363 185 | 1.84278084358781 186 | 1.546614802720863 187 | 1.6830164208804672 188 | 1.2192103036891195 189 | 1.5749554053823576 190 | 1.0525916105782345 191 | 1.3182774884328046 192 | 1.0693606742472472 193 | 1.28892316901959 194 | 1.0810183247897833 195 | 1.597430007039541 196 | 1.6222196458051712 197 | 1.6816726796459187 198 | 1.2654371983630648 199 | 0.8923467892223494 200 | 2.1209970081347924 201 | 0.8663640031857405 202 | 1.5936287891007983 203 | 1.5100277156326696 204 | 1.1874766643260461 205 | 1.222939747008075 206 | 1.3241286018856473 207 | 0.903790003843517 208 | 1.373441717573077 209 | 1.379411944758564 210 | 1.1327401896608185 211 | 1.2544699291902102 212 | 1.1432681866726222 213 | 1.3877252778336102 214 | 1.3515904238174947 215 | 1.1644121631447761 216 | 1.3153737444661586 217 | 1.7921183505325313 218 | 1.7266359862465284 219 | 1.9091169068903897 220 | 1.501071967601107 221 | 1.9616813397384274 222 | 1.7026929762159284 223 | 1.472654497379225 224 | 1.5969194138588296 225 | 1.396281515401538 226 | 1.3517102551089963 227 | 1.7429017086027023 228 | 1.379411944758564 229 | 1.1373157422679463 230 | 1.4847574325159862 231 | 1.7478595456922623 232 | 1.3580952904912151 233 | 2.832322364981433 234 | 1.3622461725677337 235 | 1.646076705554899 236 | 1.7391848010061288 237 | 1.5972094324766268 238 | 1.1951643500219205 239 | 2.264360655299461 240 | 1.299900416675326 241 | 1.595311459526821 242 | 1.1109759442233038 243 | 1.1185411728322991 244 | 1.5591048832897865 245 | 1.5556532996381962 246 | 1.4620242742742222 247 | 1.3516778135768421 248 | 1.2892209829560282 249 | 1.6693908333357095 250 | 1.1240960809377827 251 | 1.1173461136970986 252 | 2.308564373192447 253 | 1.6908539089563155 254 | 1.236649285226663 255 | 1.84404402118172 256 | 1.2122593901239413 257 | 1.1748175589553567 258 | 1.3295995404361716 259 | 1.5179856911754155 260 | 1.5511460818656453 261 | 1.7222407505898245 262 | 1.4967120456950531 263 | 1.4246072672273935 264 | 1.299900416675326 265 | 1.0924693453474235 266 | 1.1942052859147665 267 | 1.6994614659425926 268 | 
1.111516080553065 269 | 3.8172842445514825 270 | 1.2644582625768408 271 | 1.532976504690712 272 | 1.2581150804243435 273 | 1.73365355983202 274 | 1.3626809208662145 275 | 1.5559130690546394 276 | 1.6807288385485204 277 | 1.7537416255001923 278 | 1.1700031144007204 279 | 1.9616813397384274 280 | 0.9134352145463237 281 | 1.5067812897959885 282 | 1.30412293226417 283 | 1.5506945096538096 284 | 1.3160608037741297 285 | 1.2800510061707886 286 | 2.106547548193457 287 | 1.1370447770446959 288 | 1.7393389962099362 289 | 2.066958103460676 290 | 1.5598193262470104 291 | 2.27168986162654 292 | 2.012417495033639 293 | 1.7782577052264712 294 | 1.6435648710396282 295 | 1.461487825996827 296 | 1.3915983629984086 297 | 1.5886720430514658 298 | 2.4746307247566177 299 | 0.9445005646625634 300 | 1.5051676785420898 301 | 1.7764344400480543 302 | 1.6676072355518912 303 | 1.54287746806488 304 | 4.141626697136798 305 | 1.385660698934494 306 | 1.5379546574141412 307 | 1.5601958605206607 308 | 1.620216860604285 309 | 1.2099363795360183 310 | 1.9291009295014914 311 | 1.3365359631500826 312 | 1.1040724960797867 313 | 2.133774348268156 314 | 2.2079235218393802 315 | 2.1435776783456215 316 | 2.056043487566545 317 | 1.2944007748084345 318 | 2.351541954789073 319 | 1.320040070879489 320 | 1.355884162147352 321 | 1.537599385248513 322 | 1.6830456235042799 323 | 1.9827699472230462 324 | 1.209751622553321 325 | 2.0144621475335347 326 | 2.080708925096761 327 | 1.5043280897866023 328 | 1.1874766643260461 329 | 12.142858231772479 330 | 1.91418083020366 331 | 1.452257745280585 332 | 1.6423880836585882 333 | 1.2773172273979465 334 | 2.132991420382744 335 | 1.9091169068903897 336 | 1.3459427853993509 337 | 1.3203955959985 338 | 1.470855591043725 339 | 1.3837730079424306 340 | 1.4740563105808742 341 | 1.2773172273979465 342 | 1.1993695855691393 343 | 1.3007416974883115 344 | 1.4053759504632581 345 | 1.3072514650333393 346 | 1.2173838473392469 347 | 1.3415335324702822 348 | 1.477645892398769 349 | 2.151334693113531 350 | 1.1266739688653569 351 | 1.1854286407108858 352 | 1.42470293039027 353 | 1.6003600403781872 354 | 1.3512835063656823 355 | 1.389036183937616 356 | 1.6286785010399056 357 | 1.5972834586212834 358 | 1.159921399412275 359 | 1.2493081284469363 360 | 1.2562757306689412 361 | 1.589157333851994 362 | 2.3784701434714712 363 | 1.8581071640928828 364 | 2.0669730782564657 365 | 1.7856820212966644 366 | 1.0896672960104299 367 | 1.4421990577488608 368 | 1.0606595180532017 369 | 1.148770450520872 370 | 1.1696262250974012 371 | 1.5486379494446831 372 | 1.1971737091523267 373 | 1.686642552973629 374 | 1.1964760562306826 375 | 1.506958405689278 376 | 1.7872288700758432 377 | 1.4348945022623711 378 | 1.4620242742742222 379 | 1.818984761630097 380 | 1.659088830952156 381 | 2.0847102097800647 382 | 2.7200992338272054 383 | 2.121833004147098 384 | 1.3415293887662447 385 | 1.387089658993257 386 | 1.3227764620691815 387 | 1.7204665226383646 388 | 1.3153737444661586 389 | 2.1989548188461296 390 | 1.1471606218042851 391 | 1.1019533965775652 392 | 1.8919660752560719 393 | 1.804660879715179 394 | 1.2315952088734266 395 | 1.9443889240337113 396 | 1.7478595456922623 397 | 0.971829780970315 398 | 1.1406970683293147 399 | 1.425775256592143 400 | 1.591661302045164 401 | 1.458045960826828 402 | 3.271024661345266 403 | 1.6488206406128503 404 | 1.8994474716611474 405 | 1.3596683800451452 406 | 1.7305294645857356 407 | 1.3739788239596982 408 | 0.9370195163360168 409 | 2.981411684593304 410 | 1.7737860828664083 411 | 1.3634983275261823 412 | 
1.1767927102294096 413 | 1.5833387188815247 414 | 1.3605863320145757 415 | 1.9259367806445027 416 | 1.081348542880117 417 | 1.2099363795360183 418 | 1.448889964854583 419 | 1.0883213496173914 420 | 1.7059620520769656 421 | 2.248908580070489 422 | 2.190763119142914 423 | 0.9901860770929424 424 | 1.1158426454235635 425 | 1.2243730141049218 426 | 1.2781294586157481 427 | 1.7276856800466787 428 | 2.2143715401730417 429 | 1.760206756626474 430 | 1.6034326426911396 431 | 1.126726187020662 432 | 1.5377026358943673 433 | 1.4722565684306739 434 | 1.7478595456922623 435 | 2.2985934657886697 436 | 1.540305089489859 437 | 1.7921183505325313 438 | 1.4537866553228214 439 | 1.811569458188886 440 | 1.6044587362194564 441 | 1.2528412091594672 442 | 2.019569305468053 443 | 1.9135503510024814 444 | 1.3662079039710568 445 | 2.0726128639789727 446 | 1.7335052230702734 447 | 1.640410297845879 448 | 1.3659117819233757 449 | 1.5070329038492702 450 | 1.786841496070003 451 | 1.241000487246354 452 | 2.1462748468066413 453 | 1.230432888052957 454 | 1.1130884547150879 455 | 2.0293623070665054 456 | 4.994087204385433 457 | 1.3461872238242758 458 | 1.4934692282860007 459 | 1.6290716546828803 460 | 2.1500276019478806 461 | 1.2678434131942082 462 | 1.1575590401550242 463 | 1.291352406656757 464 | 1.2901086947508988 465 | 1.4545504993541192 466 | 1.65074891776069 467 | 1.4422878510632526 468 | 1.845002620550053 469 | 1.4094595938898957 470 | 1.5728146955520448 471 | 1.5669043337167583 472 | 3.2403243494780356 473 | 1.8626527818067853 474 | 1.3599597526741263 475 | 1.2422690358024981 476 | 1.3915983629984086 477 | 1.4716810439816361 478 | 1.5137980843115815 479 | 1.4638315141882428 480 | 1.159921399412275 481 | 2.190763119142914 482 | 2.156083852809556 483 | 2.156083852809556 484 | 4.876951361586901 485 | 1.6327451303031313 486 | 1.5679217551403173 487 | 1.5601958605206607 488 | 1.8214605259851877 489 | 2.1394646802958226 490 | 1.608034271416974 491 | 1.3173840096099623 492 | 1.5610894572784533 493 | 1.067158344969774 494 | 1.511079683907806 495 | 2.182779630157373 496 | 1.507028248953826 497 | 1.5438422347197596 498 | 1.34342714951139 499 | 1.4967120456950531 500 | 1.8439734704585764 501 | 1.2148742804601573 502 | 1.9379314440497082 503 | 1.4246072672273935 504 | 1.3938342744069776 505 | 1.1225916630518447 506 | 1.2825484100644697 507 | 1.5506945096538096 508 | 1.830999282952867 509 | 2.9441975417989306 510 | 1.9547814408248436 511 | 1.3665529655117288 512 | 1.651805290654875 513 | 2.447353651263764 514 | 1.7158345236830304 515 | 2.3108286246795116 516 | 1.1920656151073028 517 | 1.1503182851846718 518 | 1.324411574742509 519 | 1.2813698317415987 520 | 1.571194404199422 521 | 1.6095312933704564 522 | 1.712170050958161 523 | 1.7758406629804506 524 | 2.03497732647866 525 | 1.3904829986528187 526 | 1.7338935990268316 527 | 1.270120318177618 528 | 1.3438889409262853 529 | 1.236649285226663 530 | 1.1391303639496575 531 | 1.175255343592379 532 | 1.4246072672273935 533 | 1.1696458233157454 534 | 1.3932851077699513 535 | 1.251742886957726 536 | 1.804660879715179 537 | 1.5829123946131227 538 | 1.9616813397384274 539 | 1.6276334495136737 540 | 1.2776705024753328 541 | 1.2848610321322673 542 | 2.9114040968994663 543 | 1.0928030613748911 544 | 1.472109606964229 545 | 4.061375507652374 546 | 1.7380114185637785 547 | 4.207183107911281 548 | 6.3022920627776 549 | 2.055042401923337 550 | 1.7419971944819568 551 | 1.4218729499053318 552 | 1.2743090917174307 553 | 1.137913307143268 554 | 1.3861925359492449 555 | 1.78381147899951 556 | 
1.662839276071543 557 | 1.090802289250184 558 | 1.845002620550053 559 | 1.2914751866854917 560 | 1.477645892398769 561 | 1.709205923154766 562 | 2.547655499823954 563 | 1.489163416288754 564 | 1.580979745136837 565 | 1.2266225720690511 566 | 1.7185380917819992 567 | 1.4667450055931295 568 | 1.5290604149324027 569 | 1.1987732237461235 570 | 3.536185390992804 571 | 2.0334518893397915 572 | 1.6067039861851677 573 | 1.468647263384833 574 | 1.6592828650771068 575 | 1.9429048388285797 576 | 1.5872170596463864 577 | 1.4550297413552988 578 | 1.7775879134401187 579 | 1.5161115927539688 580 | 0.9727236505503528 581 | 1.2151210511990873 582 | 1.230432888052957 583 | 1.9724531839170805 584 | 1.6191458656013205 585 | 1.677712269904247 586 | 1.5851386168588182 587 | 1.4058996453825523 588 | 1.595311459526821 589 | 1.507028248953826 590 | 1.5139529824511828 591 | 3.21833124408263 592 | 1.4391189853160273 593 | 1.1437601360707996 594 | 1.5740538714969317 595 | 1.4173961899580547 596 | 2.019569305468053 597 | 1.4347258587984166 598 | 1.0875061817174327 599 | 1.7678367736010594 600 | 1.28977573598255 601 | 2.853983742257853 602 | 1.614351271995543 603 | 1.1586331113461001 604 | 1.1951643500219205 605 | 1.6469960059434845 606 | 1.1860733314346792 607 | 3.222388750130358 608 | 1.6570275357335575 609 | 1.28079616198825 610 | 1.9054526468880446 611 | 1.5473446697110143 612 | 2.919529697438214 613 | 1.665292494582132 614 | 1.3241286018856473 615 | 1.5052687513809706 616 | 1.2634578614726677 617 | 1.3531950655274996 618 | 1.684194104700233 619 | 3.0821621473342047 620 | 1.5426638730292128 621 | 1.848694319834159 622 | 2.001738055405492 623 | 1.6046229111354398 624 | 1.8520599141065015 625 | 1.2356469390673412 626 | 2.4344636159031587 627 | 1.8046664539356445 628 | 0.9646703648968143 629 | 1.7738997079148424 630 | 2.2163690190274346 631 | 2.2615650856479466 632 | 1.855811379489538 633 | 1.200958619242105 634 | 1.2688718768561633 635 | 1.2679387983238632 636 | 1.1509512718690886 637 | 1.8346071511861994 638 | 2.5336517654188704 639 | 1.2987568277187695 640 | 1.2110455141327818 641 | 2.0461664798401777 642 | 1.3426067685941796 643 | 1.8374812522737474 644 | 2.6288501384067686 645 | 1.5135453407444897 646 | 2.340155018184363 647 | 1.3052526347687385 648 | 1.3592011615001736 649 | 1.5601958605206607 650 | 1.459222647908934 651 | 1.4388026184376503 652 | 1.2480323918900023 653 | 2.1711440081336457 654 | 1.2813555142604682 655 | 1.3053683530236413 656 | 1.1351762677685773 657 | 1.7796160314231309 658 | 1.412406363975991 659 | 1.6435648710396282 660 | 2.3142458164432926 661 | 1.4184073872475786 662 | 1.3317358369857621 663 | 1.5604341384235236 664 | 1.7498410617420639 665 | 1.199675504589035 666 | 1.1597850714857532 667 | 2.340155018184363 668 | 1.5969194138588296 669 | 1.2362885337769147 670 | 6.85517513290001 671 | 1.7593497104069002 672 | 1.4472896666035948 673 | 1.2160919941663049 674 | 1.2272427114465063 675 | 1.490659777015688 676 | 1.758450877017632 677 | 1.2926590693109838 678 | 2.0336759049974873 679 | 3.679099469385176 680 | 1.6163016694314787 681 | 1.5426638730292128 682 | 3.1388054124308353 683 | 3.1378916382098354 684 | 1.3599825401147634 685 | 1.6073027092090386 686 | 1.3780637673334344 687 | 1.537599385248513 688 | 1.4612087510880485 689 | 1.0630107569677125 690 | 0.9460779264066411 691 | 1.8011724903302118 692 | 1.5157867644494982 693 | 1.1985767788380866 694 | 2.1770059270968964 695 | 2.6960466534310816 696 | 1.3493898910242819 697 | 1.1471606218042851 698 | 1.3320342001938383 699 | 1.6064765929738287 
700 | 1.7921183505325313 701 | 1.5749554053823576 702 | 1.1018520376897647 703 | 1.7495445461388062 704 | 2.7862222000555597 705 | 1.8207218688668838 706 | 1.9486752269414889 707 | 1.1686745957357658 708 | 1.5151094133672105 709 | 1.9135503510024814 710 | 3.300676775076133 711 | 2.214118123529967 712 | 1.2148742804601573 713 | 1.3632027927722234 714 | 4.720955250489674 715 | 1.9827699472230462 716 | 1.9135503510024814 717 | 1.7478595456922623 718 | 1.2914751866854917 719 | 1.4860616354149334 720 | 1.6347752455810902 721 | 1.485961852257869 722 | 1.86148811554261 723 | 2.173382996416875 724 | 1.126726187020662 725 | 1.5232292121615698 726 | 2.3311804946679517 727 | 2.2687151094464313 728 | 1.5581528016877275 729 | 1.3716397961165223 730 | 1.6891608002195535 731 | 2.117190247090065 732 | 1.5249718273342485 733 | 1.1862876041095358 734 | 1.4535890130529665 735 | 1.9767233466846406 736 | 1.672875267920636 737 | 2.866760710836066 738 | 1.3506645789244776 739 | 1.2147499821707957 740 | 1.231502557817369 741 | 1.7531810697883528 742 | 2.3784701434714712 743 | 4.523958062021655 744 | 1.7023678363569763 745 | 3.192997850429486 746 | 2.2036925546385135 747 | 1.2416525298356502 748 | 2.335663445980759 749 | 1.6745852811807076 750 | 1.3931460267422964 751 | 3.384224982806275 752 | 2.399425155391763 753 | 1.0738888527395116 754 | 1.4906643813523415 755 | 2.480305796358033 756 | 1.4256795266176017 757 | 1.562818919848765 758 | 1.216896386455207 759 | 1.741576791010962 760 | 1.5431351097386157 761 | 1.5386594085090133 762 | 1.594398273455865 763 | 1.7360131761854907 764 | 1.8074244909024912 765 | 1.1645769831130584 766 | 1.8140351298019544 767 | 2.3792306268278907 768 | 1.7195313575456406 769 | 2.7200992338272054 770 | 1.9980954741705295 771 | 1.160906732445729 772 | 4.8417475299086785 773 | 1.9332296815983918 774 | 1.16342385271845 775 | 1.442825300718281 776 | 1.9982726242575635 777 | 1.9501778484880097 778 | 2.8214824996343046 779 | 1.2240949850223426 780 | 1.3600696602445814 781 | 1.5262562040787233 782 | 1.0083449035023344 783 | 1.2848610321322673 784 | 1.127133456613148 785 | 1.8135070248958534 786 | 2.0644494171550782 787 | 3.0403062457988406 788 | 1.6908539089563155 789 | 2.0594406998539454 790 | 1.5985890336712016 791 | 1.159921399412275 792 | 1.5387738840395555 793 | 1.1166075260836446 794 | 1.6994614659425926 795 | 1.317084859707623 796 | 5.375265829827838 797 | 1.601413662937508 798 | 1.473364692028392 799 | 2.5066516058832273 800 | 6.9109998866989875 801 | 1.4765180561000602 802 | 1.1482442799932788 803 | 1.7052532261055533 804 | 7.710388580079479 805 | 1.780690526884969 806 | 1.6794098730963125 807 | 1.7788262788501346 808 | 1.6682831736678057 809 | 2.064932799514329 810 | 1.58911824737042 811 | 2.340065478380446 812 | 1.3661203881651482 813 | 5.080921489809112 814 | 1.2160919941663049 815 | 1.1806056444153608 816 | 3.8367167286317168 817 | 2.149699293270523 818 | 1.8836057749952264 819 | 1.4458926861394328 820 | 1.5171715823082605 821 | 1.6025382413322895 822 | 2.083112587863047 823 | 3.379110228650751 824 | 1.8903602436977847 825 | 1.3855421375184638 826 | 1.9086348452083035 827 | 1.2944007748084345 828 | 1.5077779604889854 829 | 1.3778006224660475 830 | 1.3604471253852821 831 | 2.3970091499005277 832 | 1.6754318970575488 833 | 2.4106317601862775 834 | 1.2183839666542644 835 | 1.450149546384091 836 | 3.122108430662958 837 | 1.5544832864428726 838 | 1.5566038539639102 839 | 1.1920656151073028 840 | 2.1121652720127613 841 | 1.8909646605776496 842 | 2.7092622222829355 843 | 
1.4313514403163254 844 | 1.4280217905363068 845 | 1.3961620452355947 846 | 1.2023036917783498 847 | 1.7779677928034017 848 | 1.8355180646103677 849 | 1.7262405160193222 850 | 1.3069505791191476 851 | 2.088001449121803 852 | 1.9938059115742035 853 | 1.608115306277354 854 | 1.731229870179861 855 | 1.5276553109391047 856 | 2.89662306954205 857 | 1.9105751901861643 858 | 2.27168986162654 859 | 1.8726618423113022 860 | 1.6515295054192958 861 | 2.7092622222829355 862 | 2.449223059004678 863 | 1.4795681007586423 864 | 2.3491063122079994 865 | 3.8367167286317168 866 | 1.541170073306789 867 | 2.005275029434266 868 | 1.1070248477747302 869 | 1.752670687198158 870 | 1.3273827468809165 871 | 2.5392449057547775 872 | 1.2979262385950383 873 | 6.3022920627776 874 | 1.622368476175161 875 | 2.2073660099745944 876 | 1.780690526884969 877 | 1.2886810215439344 878 | 1.199919183715295 879 | 1.7047691468766337 880 | 3.5581785737312677 881 | 2.640366940963742 882 | 2.0847550542241797 883 | 2.595306208514257 884 | 1.5098938492432687 885 | 1.9222402374485936 886 | 2.3283024295382457 887 | 2.0446338725519695 888 | 1.580979745136837 889 | 2.126185751689322 890 | 1.4184586404631023 891 | 1.725111647649377 892 | 1.6219497658666162 893 | 1.3295102630895161 894 | 1.4956450242050274 895 | 1.3033265577136575 896 | 1.6331967081491723 897 | 1.599077353809558 898 | 2.4348021467611254 899 | 1.2973223514436398 900 | 1.4184586404631023 901 | 2.144554602396006 902 | 4.697044210338071 903 | 2.0641636160101284 904 | 2.3988401107969 905 | 2.3970091499005277 906 | 2.916151901986888 907 | 2.3751464073919 908 | 2.1487146680037488 909 | 2.571050866200927 910 | 5.080921489809112 911 | 2.1565735648195674 912 | 1.3512835063656823 913 | 3.719590003770812 914 | 1.4978553098718994 915 | 2.8403703867346572 916 | 5.24091538504311 917 | -------------------------------------------------------------------------------- /data/biochemists-zinb-coef.tsv: -------------------------------------------------------------------------------- 1 | count zero coef 2 | 0.4167465258789788 -0.19168829397210915 intercept 3 | -0.19550683126146232 0.6359332030001064 fem 4 | 0.09758262897011574 -1.4994684859382412 mar 5 | -0.15173245821414386 0.6284272015007232 kid5 6 | -7.001340806705136e-4 -0.03771473930388164 phd 7 | 0.024786201372630465 -0.8822932239400029 ment 8 | 2.6547660033812437 2.6547660033812437 theta 9 | -------------------------------------------------------------------------------- /data/biochemists.R: -------------------------------------------------------------------------------- 1 | library(pscl) 2 | library(readr) 3 | 4 | 5 | # Load and save biochemists data ------------------------------------------ 6 | data("bioChemists", package = "pscl") 7 | head(bioChemists) 8 | 9 | #encode design matrix 10 | design <- cbind.data.frame(art=bioChemists$art, model.matrix(art~., bioChemists)[,-1]) 11 | colnames(design) <- colnames(bioChemists) 12 | head(design) 13 | write_tsv(design, 'biochemists.tsv') 14 | 15 | 16 | # NB fit ------------------------------------------------------------------ 17 | nb <- MASS::glm.nb(art ~ ., data = bioChemists) 18 | coef.df <- rbind.data.frame(data.frame(coef(nb)), theta=nb$theta) 19 | colnames(coef.df) <- 'val' 20 | coef.df$coef <- rownames(coef.df) 21 | coef.df 22 | coef.df$coef <- c('intercept', colnames(bioChemists)[-1], 'theta') 23 | coef.df 24 | write_tsv(coef.df, 'biochemists-nb-coef.tsv') 25 | pred.nb <- predict(nb, type='response') 26 | write_tsv(data.frame(count=pred.nb), 'biochemists-nb-predictions.tsv') 27 | 28 | 29 | # 
ZINB fit ---------------------------------------------------------------- 30 | zinb <- zeroinfl(art ~ . | ., data = bioChemists, dist = "negbin") 31 | coef(zinb) 32 | coef.df <- data.frame(count=zinb$coefficients$count, 33 | zero=zinb$coefficients$zero) 34 | coef.df <- rbind(coef.df, theta=zinb$theta) 35 | coef.df$coef <- c('intercept', colnames(bioChemists)[-1], 'theta') 36 | coef.df 37 | write_tsv(coef.df, 'biochemists-zinb-coef.tsv') 38 | 39 | pred.df <- cbind.data.frame(zero=predict(zinb, type='zero'), 40 | count=predict(zinb, type='count')) 41 | 42 | write_tsv(pred.df, 'biochemists-zinb-predictions.tsv') -------------------------------------------------------------------------------- /data/biochemists.tsv: -------------------------------------------------------------------------------- 1 | art fem mar kid5 phd ment 2 | 0 0 1 0 2.5199999809265137 7 3 | 0 1 0 0 2.049999952316284 6 4 | 0 1 0 0 3.75 6 5 | 0 0 1 1 1.1799999475479126 3 6 | 0 1 0 0 3.75 26 7 | 0 1 1 2 3.5899999141693115 2 8 | 0 1 0 0 3.190000057220459 3 9 | 0 0 1 2 2.9600000381469727 4 10 | 0 0 0 0 4.619999885559082 6 11 | 0 1 1 0 1.25 0 12 | 0 0 0 0 2.9600000381469727 14 13 | 0 1 0 0 0.7549999952316284 13 14 | 0 1 1 1 3.690000057220459 3 15 | 0 1 1 0 3.4000000953674316 4 16 | 0 1 1 0 1.7899999618530273 0 17 | 0 1 0 0 3.0899999141693115 1 18 | 0 1 1 0 2 7 19 | 0 0 1 2 4.289999961853027 13 20 | 0 1 0 0 3.359999895095825 7 21 | 0 1 0 0 4.289999961853027 9 22 | 0 1 1 0 2.259999990463257 6 23 | 0 0 1 3 2.9600000381469727 3 24 | 0 0 1 1 4.289999961853027 5 25 | 0 0 1 1 2.859999895095825 4 26 | 0 0 1 3 2.759999990463257 1 27 | 0 1 1 0 1.5199999809265137 3 28 | 0 1 1 1 3.5399999618530273 8 29 | 0 0 1 1 4.289999961853027 3 30 | 0 1 0 0 3.0899999141693115 0 31 | 0 0 1 0 2.319999933242798 3 32 | 0 1 0 0 2.390000104904175 0 33 | 0 0 1 0 4.289999961853027 8 34 | 0 1 0 0 1.5049999952316284 13 35 | 0 0 0 0 2.0999999046325684 0 36 | 0 0 1 1 1.5199999809265137 0 37 | 0 1 0 0 2 0 38 | 0 1 0 0 1.75 0 39 | 0 0 0 0 4.289999961853027 9 40 | 0 0 1 1 1.2200000286102295 2 41 | 0 1 1 0 3.75 5 42 | 0 0 0 0 4.289999961853027 9 43 | 0 1 1 1 2.859999895095825 3 44 | 0 0 0 0 3.9200000762939453 5 45 | 0 1 1 1 3.359999895095825 3 46 | 0 0 0 0 2.0999999046325684 6 47 | 0 0 1 1 1.1799999475479126 0 48 | 0 0 1 0 2.5 1 49 | 0 0 1 0 3.359999895095825 25 50 | 0 1 0 0 4.539999961853027 14 51 | 0 1 1 0 3.9200000762939453 8 52 | 0 0 0 0 4.539999961853027 5 53 | 0 1 1 2 1.9700000286102295 0 54 | 0 0 1 2 2.9600000381469727 12 55 | 0 0 1 0 4.289999961853027 8 56 | 0 0 1 1 2.259999990463257 2 57 | 0 1 0 0 2.119999885559082 11 58 | 0 1 0 0 3.2100000381469727 2 59 | 0 0 1 1 3.1500000953674316 1 60 | 0 0 1 1 3.9200000762939453 1 61 | 0 0 0 0 2.9600000381469727 0 62 | 0 1 0 0 2.859999895095825 6 63 | 0 0 1 1 2.0999999046325684 0 64 | 0 1 0 0 3.75 4 65 | 0 1 1 0 3.5399999618530273 2 66 | 0 1 1 0 2.5399999618530273 1 67 | 0 0 1 1 2.759999990463257 3 68 | 0 0 1 0 4.539999961853027 10 69 | 0 0 1 0 1.6799999475479126 6 70 | 0 0 1 1 2.200000047683716 0 71 | 0 1 0 0 1.0049999952316284 0 72 | 0 0 0 0 2.119999885559082 2 73 | 0 1 1 2 2.5799999237060547 0 74 | 0 1 1 0 1.7899999618530273 1 75 | 0 0 0 0 4.289999961853027 2 76 | 0 1 1 0 1.2799999713897705 4 77 | 0 0 1 2 2.5799999237060547 5 78 | 0 0 1 2 2.119999885559082 0 79 | 0 0 1 1 2.2100000381469727 1 80 | 0 0 1 1 2.259999990463257 5 81 | 0 1 0 0 3.2100000381469727 5 82 | 0 1 0 0 2.0999999046325684 2 83 | 0 1 1 1 3.5399999618530273 1 84 | 0 0 1 1 3.4000000953674316 18 85 | 0 0 1 0 2.2100000381469727 6 86 | 0 0 1 0 
2.2100000381469727 19 87 | 0 0 1 2 0.9200000166893005 1 88 | 0 1 0 0 4.289999961853027 35 89 | 0 0 1 2 3.1500000953674316 6 90 | 0 0 1 0 4.289999961853027 19 91 | 0 0 1 1 2.509999990463257 8 92 | 0 0 1 1 4.289999961853027 1 93 | 0 1 1 0 2.759999990463257 0 94 | 0 0 1 0 1.4199999570846558 3 95 | 0 0 1 0 2.2100000381469727 19 96 | 0 0 0 0 4.139999866485596 5 97 | 0 0 1 0 1.5199999809265137 7 98 | 0 0 0 0 4.619999885559082 8 99 | 0 1 0 0 3.619999885559082 7 100 | 0 1 1 2 3.75 4 101 | 0 0 1 0 4.289999961853027 8 102 | 0 1 0 0 3.75 5 103 | 0 0 1 1 2.0999999046325684 0 104 | 0 0 1 2 1.399999976158142 7 105 | 0 1 1 0 3.190000057220459 4 106 | 0 0 1 2 1.809999942779541 1 107 | 0 0 1 1 3.1500000953674316 2 108 | 0 1 1 0 3.75 7 109 | 0 1 0 0 2.049999952316284 6 110 | 0 0 1 3 2.2100000381469727 3 111 | 0 1 1 2 3.359999895095825 2 112 | 0 1 1 0 3.4700000286102295 11 113 | 0 1 0 0 2.119999885559082 0 114 | 0 1 1 1 3.359999895095825 2 115 | 0 0 1 0 2.5399999618530273 5 116 | 0 0 1 2 2.1500000953674316 4 117 | 0 0 1 1 4.539999961853027 3 118 | 0 0 1 0 1.6799999475479126 4 119 | 0 1 1 1 2.549999952316284 0 120 | 0 1 0 0 2.119999885559082 11 121 | 0 0 1 1 3.4700000286102295 2 122 | 0 1 1 0 3.4700000286102295 1 123 | 0 1 0 0 3.359999895095825 9 124 | 0 0 1 1 2.119999885559082 10 125 | 0 1 0 0 2.0999999046325684 1 126 | 0 1 0 0 4.289999961853027 1 127 | 0 0 1 1 2.859999895095825 13 128 | 0 1 1 1 4.289999961853027 9 129 | 0 0 1 0 3.2100000381469727 2 130 | 0 0 0 0 2.0999999046325684 0 131 | 0 1 1 0 1.809999942779541 5 132 | 0 1 0 0 2.5199999809265137 2 133 | 0 0 1 1 3.359999895095825 23 134 | 0 1 1 0 4.539999961853027 4 135 | 0 1 1 1 4.289999961853027 4 136 | 0 1 0 0 2.119999885559082 9 137 | 0 0 1 2 4.619999885559082 30 138 | 0 1 1 1 3.5899999141693115 14 139 | 0 1 0 0 3.5899999141693115 4 140 | 0 0 1 3 2.859999895095825 1 141 | 0 1 0 0 1.5049999952316284 3 142 | 0 1 1 1 2.5 0 143 | 0 1 1 0 4.619999885559082 0 144 | 0 0 0 0 2.5 0 145 | 0 1 1 1 3.359999895095825 3 146 | 0 1 0 0 2 2 147 | 0 1 1 2 3.4100000858306885 5 148 | 0 0 1 0 2.0999999046325684 0 149 | 0 1 0 0 2 2 150 | 0 0 1 0 3.359999895095825 2 151 | 0 0 1 3 3.5899999141693115 7 152 | 0 0 0 0 2.140000104904175 2 153 | 0 1 1 0 3.5899999141693115 12 154 | 0 1 0 0 2.390000104904175 10 155 | 0 0 1 3 4.539999961853027 11 156 | 0 0 0 0 2.9600000381469727 2 157 | 0 0 1 1 2.2100000381469727 5 158 | 0 0 1 2 3.690000057220459 5 159 | 0 1 1 0 1.7799999713897705 1 160 | 0 1 0 0 2.859999895095825 11 161 | 0 0 1 0 2.859999895095825 8 162 | 0 1 1 0 2.5799999237060547 3 163 | 0 1 0 0 3.75 1 164 | 0 1 1 1 1.2200000286102295 11 165 | 0 1 0 0 2.0999999046325684 9 166 | 0 1 0 0 3.5899999141693115 18 167 | 0 1 0 0 3.5399999618530273 5 168 | 0 1 0 0 2.859999895095825 16 169 | 0 1 1 0 1.8600000143051147 6 170 | 0 0 1 0 2.9600000381469727 0 171 | 0 1 1 0 3.690000057220459 7 172 | 0 1 1 0 3.690000057220459 2 173 | 0 1 1 0 4.289999961853027 1 174 | 0 1 0 0 2.5199999809265137 1 175 | 0 1 0 0 1.7599999904632568 10 176 | 0 0 1 2 2.319999933242798 2 177 | 0 1 0 0 3.690000057220459 2 178 | 0 1 0 0 3.75 14 179 | 0 1 0 0 4.289999961853027 5 180 | 0 1 1 2 4.539999961853027 6 181 | 0 0 1 1 2.0999999046325684 0 182 | 0 1 1 1 2.5399999618530273 0 183 | 0 0 1 1 2.2100000381469727 19 184 | 0 0 0 0 4.539999961853027 8 185 | 0 0 1 1 2.0999999046325684 12 186 | 0 0 0 0 2.259999990463257 5 187 | 0 1 1 1 4.619999885559082 15 188 | 0 1 0 0 2.759999990463257 4 189 | 0 1 0 0 4.289999961853027 12 190 | 0 1 0 0 0.7549999952316284 0 191 | 0 1 1 0 1.8300000429153442 2 192 | 0 1 0 0 1.7899999618530273 0 
193 | 0 1 1 1 4.289999961853027 6 194 | 0 1 0 0 2.5 0 195 | 0 0 1 0 2.140000104904175 1 196 | 0 0 1 2 3.4000000953674316 13 197 | 0 1 0 0 2.869999885559082 15 198 | 0 1 1 0 2.9600000381469727 0 199 | 0 1 1 2 3.190000057220459 0 200 | 0 0 1 1 3.690000057220459 16 201 | 0 1 1 2 1.2549999952316284 0 202 | 0 0 1 2 4.139999866485596 12 203 | 0 0 1 1 2.390000104904175 5 204 | 0 0 1 2 3.9200000762939453 2 205 | 0 1 0 0 2.9600000381469727 4 206 | 0 1 1 0 2.119999885559082 2 207 | 0 1 1 2 2.119999885559082 1 208 | 0 1 1 0 2.609999895095825 3 209 | 0 0 0 0 4.289999961853027 0 210 | 0 0 1 3 2.859999895095825 7 211 | 0 1 1 0 2.390000104904175 0 212 | 0 0 1 2 3.3399999141693115 1 213 | 0 1 0 0 3.619999885559082 8 214 | 0 1 1 1 3.5899999141693115 8 215 | 0 0 1 2 4.539999961853027 1 216 | 0 1 1 0 3.5899999141693115 1 217 | 0 0 0 0 4.289999961853027 9 218 | 0 0 1 0 1.5199999809265137 4 219 | 0 0 1 0 4.289999961853027 6 220 | 0 1 1 0 4.619999885559082 5 221 | 0 0 1 1 4.289999961853027 13 222 | 0 0 1 0 2.509999990463257 3 223 | 0 0 0 0 2.859999895095825 3 224 | 0 1 1 0 2.9600000381469727 8 225 | 0 1 1 0 3.690000057220459 3 226 | 0 1 1 0 3.4700000286102295 2 227 | 0 0 1 2 4.289999961853027 15 228 | 0 0 0 0 4.289999961853027 0 229 | 0 1 0 0 3.9200000762939453 1 230 | 0 1 1 0 2 6 231 | 0 0 1 0 2.319999933242798 4 232 | 0 1 1 1 2 9 233 | 0 0 1 1 1.6799999475479126 27 234 | 0 0 0 0 3.4700000286102295 0 235 | 0 0 1 0 2.200000047683716 2 236 | 0 0 1 1 2.119999885559082 10 237 | 0 1 0 0 1.399999976158142 14 238 | 0 1 0 0 3.359999895095825 3 239 | 0 0 1 2 4.289999961853027 24 240 | 0 0 1 1 2.0999999046325684 0 241 | 0 0 0 0 4.289999961853027 5 242 | 0 1 0 0 4.289999961853027 0 243 | 0 1 0 0 2.8299999237060547 1 244 | 0 0 1 1 2.5799999237060547 6 245 | 0 1 1 0 3.1500000953674316 7 246 | 0 0 0 0 4.289999961853027 2 247 | 0 0 0 0 2.9600000381469727 0 248 | 0 0 1 2 3.5899999141693115 5 249 | 0 1 0 0 2.390000104904175 15 250 | 0 1 0 0 1.25 2 251 | 0 1 0 0 2.759999990463257 1 252 | 0 0 1 0 3.4000000953674316 13 253 | 0 0 0 0 4.289999961853027 7 254 | 0 1 0 0 3.690000057220459 4 255 | 0 1 1 0 2.859999895095825 13 256 | 0 1 0 0 4.289999961853027 3 257 | 0 1 0 0 4.139999866485596 2 258 | 0 1 1 0 2.390000104904175 2 259 | 0 0 1 2 2.859999895095825 11 260 | 0 1 1 0 2.9600000381469727 7 261 | 0 0 0 0 3.5899999141693115 8 262 | 0 0 1 1 1.809999942779541 5 263 | 0 0 1 1 4.289999961853027 2 264 | 0 0 1 1 2.0999999046325684 0 265 | 0 1 0 0 3.190000057220459 0 266 | 0 0 1 2 4.289999961853027 2 267 | 0 0 1 0 4.289999961853027 2 268 | 0 0 1 2 3.4000000953674316 0 269 | 0 0 0 0 4.289999961853027 35 270 | 0 0 1 2 2.319999933242798 5 271 | 0 0 0 0 1.6799999475479126 5 272 | 0 1 1 0 2.5799999237060547 0 273 | 0 0 1 0 3.690000057220459 3 274 | 0 0 1 2 3.4100000858306885 7 275 | 0 0 1 0 2.319999933242798 0 276 | 0 0 1 1 3.690000057220459 8 277 | 1 0 1 0 2.5399999618530273 4 278 | 1 1 1 1 1.7599999904632568 4 279 | 1 0 1 1 4.289999961853027 13 280 | 1 0 1 3 2.0999999046325684 0 281 | 1 1 1 1 3.0899999141693115 12 282 | 1 1 0 0 3.359999895095825 6 283 | 1 0 1 0 2.0999999046325684 0 284 | 1 1 1 1 3.75 7 285 | 1 1 0 0 2.140000104904175 6 286 | 1 1 0 0 4.289999961853027 22 287 | 1 1 0 0 2 2 288 | 1 0 1 0 2 4 289 | 1 0 1 1 2 16 290 | 1 0 1 1 2.609999895095825 6 291 | 1 0 1 0 4.25 12 292 | 1 0 0 0 2.359999895095825 14 293 | 1 1 1 0 4.289999961853027 11 294 | 1 0 1 0 2.0999999046325684 2 295 | 1 1 1 0 2.869999885559082 5 296 | 1 1 1 0 3.4700000286102295 3 297 | 1 0 1 0 1.7799999713897705 1 298 | 1 0 1 0 4.139999866485596 15 299 | 1 0 1 3 
4.289999961853027 0 300 | 1 0 0 0 4.289999961853027 3 301 | 1 0 0 0 1.809999942779541 10 302 | 1 1 0 0 2.319999933242798 15 303 | 1 1 1 0 2.609999895095825 7 304 | 1 0 1 0 1.6799999475479126 34 305 | 1 1 1 0 3.190000057220459 3 306 | 1 0 1 1 3.5899999141693115 5 307 | 1 0 1 0 2.5 0 308 | 1 0 0 0 3.4000000953674316 6 309 | 1 1 0 0 2.259999990463257 4 310 | 1 0 0 0 3.4000000953674316 12 311 | 1 0 1 1 3.9200000762939453 0 312 | 1 0 1 2 2.9600000381469727 0 313 | 1 0 0 0 4.289999961853027 15 314 | 1 0 1 0 4.289999961853027 11 315 | 1 1 1 1 3.319999933242798 24 316 | 1 0 0 0 1.8600000143051147 15 317 | 1 1 0 0 2.869999885559082 6 318 | 1 1 1 0 1.6399999856948853 22 319 | 1 1 0 0 2.25 7 320 | 1 1 0 0 2.0999999046325684 8 321 | 1 1 1 0 4.289999961853027 6 322 | 1 0 1 0 1.75 3 323 | 1 0 1 0 2.9600000381469727 8 324 | 1 1 0 0 2.25 4 325 | 1 0 1 2 4.25 20 326 | 1 0 1 2 2.559999942779541 22 327 | 1 1 0 0 3.190000057220459 11 328 | 1 0 1 2 3.9200000762939453 2 329 | 1 0 1 1 1.7799999713897705 77 330 | 1 1 1 0 3.4000000953674316 14 331 | 1 0 1 1 1.7400000095367432 4 332 | 1 0 0 0 4.289999961853027 6 333 | 1 1 0 0 2 6 334 | 1 1 1 0 2.869999885559082 18 335 | 1 0 1 0 4.289999961853027 6 336 | 1 1 1 0 3.190000057220459 2 337 | 1 0 1 1 1.2200000286102295 1 338 | 1 1 1 2 3.5399999618530273 17 339 | 1 0 1 1 4.289999961853027 1 340 | 1 0 1 1 4.619999885559082 3 341 | 1 1 0 0 2 6 342 | 1 1 0 0 3.5899999141693115 3 343 | 1 1 0 0 3.190000057220459 6 344 | 1 0 1 1 3.4000000953674316 2 345 | 1 1 1 0 1.2799999713897705 2 346 | 1 0 1 2 1.7400000095367432 4 347 | 1 0 1 1 2.259999990463257 1 348 | 1 1 1 0 3.5899999141693115 5 349 | 1 0 1 1 4.619999885559082 16 350 | 1 1 0 0 1.399999976158142 2 351 | 1 1 0 0 0.9200000166893005 4 352 | 1 0 1 1 2.390000104904175 3 353 | 1 0 1 0 2.259999990463257 1 354 | 1 0 1 2 2.859999895095825 7 355 | 1 0 1 2 2.759999990463257 8 356 | 1 0 1 1 1.6299999952316284 8 357 | 1 0 1 2 4.289999961853027 12 358 | 1 1 0 0 1.399999976158142 3 359 | 1 1 1 0 2.119999885559082 0 360 | 1 1 1 1 2.609999895095825 6 361 | 1 0 1 0 1.7999999523162842 1 362 | 1 1 1 0 4.289999961853027 21 363 | 1 1 0 0 3.690000057220459 18 364 | 1 1 1 0 4.619999885559082 16 365 | 1 0 0 0 2.1500000953674316 10 366 | 1 0 1 2 2.0999999046325684 0 367 | 1 1 1 0 2 5 368 | 1 1 0 0 1.2549999952316284 0 369 | 1 0 1 2 1.75 2 370 | 1 1 0 0 3.8499999046325684 2 371 | 1 0 0 0 4.25 4 372 | 1 1 0 0 3.4700000286102295 3 373 | 1 0 1 1 3.9200000762939453 8 374 | 1 0 1 2 2.509999990463257 3 375 | 1 1 0 0 1.399999976158142 12 376 | 1 0 1 1 2 11 377 | 1 1 0 0 2 10 378 | 1 0 0 0 4.289999961853027 2 379 | 1 0 0 0 3.359999895095825 10 380 | 1 0 1 0 4.619999885559082 1 381 | 1 0 1 1 2.559999942779541 16 382 | 1 0 1 0 4.619999885559082 18 383 | 1 0 1 0 3.5899999141693115 10 384 | 1 0 1 2 4.289999961853027 6 385 | 1 1 0 0 3.5899999141693115 8 386 | 1 1 0 0 4.289999961853027 6 387 | 1 0 1 0 3.190000057220459 3 388 | 1 1 1 0 3.5899999141693115 1 389 | 1 1 1 0 2.9600000381469727 19 390 | 1 1 0 0 2.5799999237060547 2 391 | 1 0 1 3 2.9600000381469727 6 392 | 1 1 1 0 4.539999961853027 13 393 | 1 0 1 1 4.539999961853027 10 394 | 1 0 1 2 2.5 4 395 | 1 0 1 0 1.6799999475479126 8 396 | 1 0 1 0 2.319999933242798 4 397 | 1 1 1 3 3.190000057220459 9 398 | 1 1 0 0 2.2100000381469727 2 399 | 1 1 1 0 1.25 5 400 | 1 0 0 0 4.139999866485596 5 401 | 1 1 1 0 4.619999885559082 4 402 | 1 1 0 0 4.539999961853027 37 403 | 1 1 1 0 3.1500000953674316 9 404 | 1 0 0 0 4.289999961853027 11 405 | 1 1 1 0 1.9500000476837158 3 406 | 1 0 0 0 2 9 407 | 1 1 1 0 4.539999961853027 2 408 
| 1 1 1 2 2.5799999237060547 2 409 | 1 1 1 0 3.8499999046325684 29 410 | 1 0 1 1 3.4100000858306885 10 411 | 1 1 1 1 2.259999990463257 9 412 | 1 1 0 0 4.25 2 413 | 1 0 1 1 3.5899999141693115 6 414 | 1 1 1 1 2.119999885559082 9 415 | 1 0 1 0 2.9600000381469727 7 416 | 1 1 0 0 2.5199999809265137 0 417 | 1 1 0 0 2.259999990463257 4 418 | 1 1 0 0 4.539999961853027 9 419 | 1 1 1 2 2.859999895095825 7 420 | 1 0 1 0 4.539999961853027 2 421 | 1 0 1 0 3.5899999141693115 12 422 | 1 1 1 0 4.619999885559082 18 423 | 1 0 1 3 1.6699999570846558 3 424 | 1 0 1 2 1.75 1 425 | 1 1 1 1 2.8299999237060547 5 426 | 1 0 1 3 3.1500000953674316 11 427 | 1 0 1 1 3.5899999141693115 9 428 | 1 1 0 0 3.75 24 429 | 1 1 0 0 2.049999952316284 17 430 | 1 0 1 0 4.289999961853027 0 431 | 1 0 1 2 4.289999961853027 0 432 | 1 1 1 0 2.390000104904175 7 433 | 1 0 1 1 4.539999961853027 3 434 | 1 0 1 0 2.319999933242798 4 435 | 1 1 0 0 4.289999961853027 25 436 | 1 0 1 1 3.690000057220459 5 437 | 1 0 0 0 4.289999961853027 9 438 | 1 0 0 0 3.9200000762939453 2 439 | 1 0 1 0 2.759999990463257 5 440 | 1 0 0 0 2.759999990463257 6 441 | 1 0 1 2 3.619999885559082 4 442 | 1 0 1 1 4.289999961853027 14 443 | 1 0 0 0 2.869999885559082 12 444 | 1 1 1 1 2.390000104904175 9 445 | 1 0 0 0 4.289999961853027 14 446 | 1 0 1 0 1.7799999713897705 4 447 | 1 0 1 1 2.0999999046325684 8 448 | 1 1 1 0 2.25 3 449 | 1 0 1 1 2.259999990463257 5 450 | 1 0 1 0 1.8600000143051147 5 451 | 1 1 0 0 3.9200000762939453 4 452 | 1 0 1 0 4.340000152587891 10 453 | 1 1 0 0 3.359999895095825 4 454 | 1 1 0 0 2.509999990463257 1 455 | 1 1 0 0 3.75 21 456 | 1 0 1 1 4.539999961853027 45 457 | 1 1 0 0 1.6299999952316284 8 458 | 1 1 0 0 4.619999885559082 10 459 | 1 0 1 0 1.5199999809265137 2 460 | 1 0 1 0 2.549999952316284 11 461 | 1 1 1 0 1.1799999475479126 1 462 | 1 1 1 2 3.0899999141693115 9 463 | 1 1 0 0 4.619999885559082 5 464 | 1 1 1 0 2.319999933242798 1 465 | 1 0 0 0 2.049999952316284 3 466 | 1 0 1 0 4.289999961853027 1 467 | 1 0 0 0 3.4000000953674316 2 468 | 1 0 0 0 4.289999961853027 10 469 | 1 0 1 1 3.5899999141693115 2 470 | 1 0 0 0 3.359999895095825 5 471 | 1 1 0 0 2.049999952316284 13 472 | 1 1 1 0 3.5899999141693115 32 473 | 1 1 0 0 3.8499999046325684 18 474 | 1 0 0 0 3.359999895095825 0 475 | 1 1 1 0 1.75 0 476 | 1 1 1 0 3.4700000286102295 3 477 | 1 0 1 1 2.609999895095825 4 478 | 1 0 0 0 2.759999990463257 4 479 | 1 0 1 2 4.289999961853027 9 480 | 1 1 0 0 1.399999976158142 3 481 | 1 1 1 0 4.619999885559082 18 482 | 1 0 1 1 2.859999895095825 17 483 | 1 0 1 1 2.859999895095825 17 484 | 1 0 1 0 2.859999895095825 39 485 | 1 0 0 0 2 7 486 | 1 1 1 0 1.7599999904632568 8 487 | 1 0 1 0 2.5 0 488 | 1 1 0 0 4.289999961853027 17 489 | 1 0 0 0 2.559999942779541 16 490 | 1 1 1 1 3.5399999618530273 14 491 | 1 1 1 0 3.690000057220459 1 492 | 1 0 0 0 2.869999885559082 5 493 | 1 1 0 0 1.6549999713897705 0 494 | 1 0 1 1 4.340000152587891 4 495 | 1 0 1 0 3.5399999618530273 11 496 | 1 0 1 2 4.289999961853027 10 497 | 1 0 1 0 1.809999942779541 0 498 | 1 1 0 0 3.4000000953674316 7 499 | 1 0 1 1 1.809999942779541 5 500 | 1 1 0 0 3.190000057220459 18 501 | 1 1 1 1 2.319999933242798 5 502 | 1 1 0 0 4.539999961853027 19 503 | 1 0 1 1 4.289999961853027 2 504 | 1 0 1 1 2.859999895095825 2 505 | 1 1 1 1 2.859999895095825 2 506 | 1 0 1 1 1.2200000286102295 0 507 | 1 0 1 0 2.0999999046325684 0 508 | 1 0 1 1 1.6799999475479126 12 509 | 1 1 0 0 3.359999895095825 34 510 | 1 1 1 0 2.869999885559082 15 511 | 1 0 1 1 3.4700000286102295 1 512 | 1 0 0 0 2.759999990463257 7 513 | 1 0 1 1 
3.5399999618530273 21 514 | 1 1 1 0 1.9500000476837158 11 515 | 1 0 1 1 3.5899999141693115 19 516 | 1 1 0 0 3.190000057220459 3 517 | 1 1 0 0 2.759999990463257 2 518 | 1 1 1 2 4.289999961853027 13 519 | 1 0 1 2 3.190000057220459 5 520 | 1 0 1 0 2.9600000381469727 0 521 | 1 0 1 1 2.759999990463257 7 522 | 1 1 1 0 1.809999942779541 11 523 | 2 0 1 0 3.359999895095825 4 524 | 2 0 0 0 3.0899999141693115 14 525 | 2 1 0 0 3.75 8 526 | 2 1 1 0 4.539999961853027 10 527 | 2 1 0 0 1.6299999952316284 6 528 | 2 1 1 0 3.0899999141693115 2 529 | 2 1 0 0 3.690000057220459 4 530 | 2 1 0 0 2.119999885559082 2 531 | 2 1 0 0 2.259999990463257 3 532 | 2 0 1 1 4.289999961853027 2 533 | 2 1 1 1 1.7400000095367432 4 534 | 2 0 1 2 2.9600000381469727 8 535 | 2 1 0 0 2.5799999237060547 5 536 | 2 0 1 1 4.539999961853027 10 537 | 2 1 0 0 4.619999885559082 12 538 | 2 0 1 1 4.289999961853027 13 539 | 2 1 0 0 4.539999961853027 13 540 | 2 1 1 0 3.5899999141693115 0 541 | 2 1 0 0 4.289999961853027 5 542 | 2 0 0 0 3.690000057220459 26 543 | 2 1 0 0 3.2100000381469727 0 544 | 2 1 1 1 3.4700000286102295 11 545 | 2 0 0 0 4.539999961853027 37 546 | 2 0 1 0 1.9500000476837158 4 547 | 2 0 1 2 2.9600000381469727 46 548 | 2 0 1 1 4.539999961853027 53 549 | 2 0 1 0 3.4000000953674316 9 550 | 2 0 1 0 2.0999999046325684 4 551 | 2 0 1 2 4.289999961853027 8 552 | 2 1 0 0 3.75 5 553 | 2 1 0 0 2.049999952316284 2 554 | 2 0 1 1 2.5 2 555 | 2 1 1 1 4.619999885559082 17 556 | 2 1 1 0 1.7999999523162842 10 557 | 2 1 0 0 3.0899999141693115 0 558 | 2 0 0 0 4.289999961853027 10 559 | 2 0 1 2 1.7999999523162842 6 560 | 2 1 1 0 3.5899999141693115 5 561 | 2 0 1 0 2.759999990463257 3 562 | 2 0 1 0 4.139999866485596 16 563 | 2 0 0 0 3.5899999141693115 3 564 | 2 1 0 0 4.539999961853027 12 565 | 2 1 1 0 0.9200000166893005 0 566 | 2 1 0 0 4.289999961853027 15 567 | 2 0 1 1 2.390000104904175 4 568 | 2 0 1 0 1.1799999475479126 0 569 | 2 0 1 2 4.539999961853027 2 570 | 2 1 1 2 3.8499999046325684 47 571 | 2 0 1 2 2.9600000381469727 21 572 | 2 1 1 0 3.359999895095825 8 573 | 2 1 1 0 3.190000057220459 5 574 | 2 1 1 1 3.690000057220459 15 575 | 2 0 1 0 1.6299999952316284 8 576 | 2 0 1 0 1.7200000286102295 1 577 | 2 1 1 0 2.5799999237060547 5 578 | 2 0 1 0 1.5199999809265137 5 579 | 2 0 0 0 2.859999895095825 4 580 | 2 1 1 2 1.2200000286102295 4 581 | 2 1 0 0 2.5399999618530273 4 582 | 2 1 0 0 3.359999895095825 4 583 | 2 0 1 2 2.869999885559082 20 584 | 2 0 1 1 3.1500000953674316 7 585 | 2 1 0 0 4.619999885559082 14 586 | 2 0 1 1 1.7599999904632568 7 587 | 2 0 1 1 1.5199999809265137 3 588 | 2 0 0 0 4.289999961853027 5 589 | 2 0 1 2 4.289999961853027 10 590 | 2 0 1 1 2.559999942779541 5 591 | 2 0 0 0 4.539999961853027 29 592 | 2 1 1 0 1.8600000143051147 5 593 | 2 1 0 0 4.289999961853027 1 594 | 2 1 1 0 3.9200000762939453 7 595 | 2 0 0 0 2.259999990463257 2 596 | 2 0 1 1 4.289999961853027 14 597 | 2 1 1 1 3.690000057220459 10 598 | 2 0 1 2 1.9700000286102295 0 599 | 2 0 1 1 3.190000057220459 10 600 | 2 1 0 0 4.539999961853027 5 601 | 2 0 0 0 4.289999961853027 25 602 | 2 0 1 0 2.8299999237060547 1 603 | 2 0 1 3 4.340000152587891 7 604 | 2 1 0 0 3.359999895095825 3 605 | 2 1 0 0 3.4100000858306885 14 606 | 2 1 0 0 2.859999895095825 3 607 | 2 0 1 0 4.289999961853027 24 608 | 2 0 1 1 2.759999990463257 8 609 | 2 1 1 0 3.75 0 610 | 2 0 1 1 4.289999961853027 12 611 | 2 0 1 2 2.2100000381469727 12 612 | 2 0 1 0 3.5399999618530273 21 613 | 2 0 1 0 2.9600000381469727 2 614 | 2 1 1 0 2.119999885559082 2 615 | 2 0 0 0 2.390000104904175 4 616 | 2 1 0 0 3.190000057220459 5 617 
| 2 1 0 0 1.9700000286102295 8 618 | 2 1 1 0 4.539999961853027 9 619 | 2 1 0 0 2.549999952316284 36 620 | 2 0 1 0 1.7599999904632568 0 621 | 2 0 1 2 4.340000152587891 17 622 | 2 1 1 0 2.5199999809265137 16 623 | 2 0 1 1 2.559999942779541 7 624 | 2 0 0 0 4.539999961853027 10 625 | 2 1 1 0 1.399999976158142 0 626 | 2 0 1 2 3.319999933242798 27 627 | 2 0 1 0 2.509999990463257 5 628 | 2 1 1 2 2.5799999237060547 3 629 | 2 0 1 2 3.5399999618530273 16 630 | 2 0 1 0 4.539999961853027 11 631 | 2 0 0 0 4.289999961853027 17 632 | 2 0 1 0 4.340000152587891 5 633 | 2 1 1 1 3.4700000286102295 4 634 | 2 1 0 0 3.4700000286102295 5 635 | 2 0 1 2 2.5 5 636 | 2 0 1 3 2 8 637 | 2 0 0 0 3.9200000762939453 10 638 | 2 1 1 0 4.619999885559082 23 639 | 2 1 0 0 3.0899999141693115 6 640 | 2 1 0 0 2.319999933242798 4 641 | 2 1 0 0 4.289999961853027 21 642 | 2 1 0 0 3.359999895095825 7 643 | 2 0 1 0 3.690000057220459 5 644 | 2 0 1 0 4.289999961853027 17 645 | 2 1 0 0 3.5899999141693115 11 646 | 2 0 1 0 4.289999961853027 13 647 | 2 1 1 1 3.2100000381469727 7 648 | 2 1 0 0 2.259999990463257 8 649 | 2 0 1 0 2.5 0 650 | 2 0 0 0 2.259999990463257 3 651 | 2 1 1 0 3.75 4 652 | 2 1 0 0 4.289999961853027 4 653 | 2 0 1 0 3.190000057220459 11 654 | 2 1 1 1 2 7 655 | 2 1 1 0 3.0899999141693115 1 656 | 2 1 1 1 3.5899999141693115 2 657 | 2 1 1 0 4.340000152587891 11 658 | 2 1 0 0 2.869999885559082 9 659 | 2 0 1 0 2.0999999046325684 2 660 | 2 1 0 0 2.8299999237060547 26 661 | 2 0 1 1 2.0999999046325684 3 662 | 2 0 1 1 1.7799999713897705 1 663 | 2 0 1 0 2.509999990463257 0 664 | 2 0 1 1 2.5199999809265137 10 665 | 2 1 1 1 3.4000000953674316 4 666 | 2 1 1 1 3.0899999141693115 3 667 | 2 0 1 0 4.289999961853027 13 668 | 2 1 1 0 2.9600000381469727 8 669 | 2 1 1 2 3.5899999141693115 11 670 | 2 0 0 0 4.539999961853027 55 671 | 2 1 1 0 3.5899999141693115 11 672 | 2 0 1 1 3.4200000762939453 3 673 | 2 1 1 1 4.289999961853027 4 674 | 2 1 0 0 3.190000057220459 4 675 | 2 1 1 1 4.289999961853027 11 676 | 2 0 1 0 4.619999885559082 3 677 | 2 0 1 2 1.8600000143051147 6 678 | 2 0 1 0 4.619999885559082 8 679 | 2 1 1 2 4.539999961853027 48 680 | 2 1 1 0 3.75 8 681 | 2 0 1 0 1.7599999904632568 0 682 | 2 1 1 0 3.4100000858306885 31 683 | 2 0 1 0 2.549999952316284 24 684 | 2 0 1 1 1.25 2 685 | 2 1 1 0 1.4800000190734863 9 686 | 2 1 1 0 2.8299999237060547 3 687 | 2 1 1 0 4.289999961853027 6 688 | 2 1 0 0 3.190000057220459 10 689 | 2 1 0 0 1.399999976158142 0 690 | 2 1 1 2 3.2100000381469727 2 691 | 2 0 0 0 4.619999885559082 9 692 | 2 1 1 0 1.4500000476837158 7 693 | 2 1 1 1 3.3399999141693115 4 694 | 2 1 0 0 4.539999961853027 23 695 | 2 0 1 2 4.289999961853027 30 696 | 2 1 0 0 3.690000057220459 7 697 | 2 1 0 0 2.5799999237060547 2 698 | 2 1 1 1 4.539999961853027 7 699 | 2 0 1 1 4.539999961853027 6 700 | 2 0 0 0 4.289999961853027 9 701 | 3 1 0 0 4.289999961853027 12 702 | 3 1 0 0 3.75 0 703 | 3 0 0 0 4.619999885559082 8 704 | 3 0 1 2 4.539999961853027 31 705 | 3 0 1 0 3.0899999141693115 5 706 | 3 0 1 1 1.9500000476837158 14 707 | 3 1 1 1 3.5899999141693115 3 708 | 3 0 1 1 2.609999895095825 5 709 | 3 0 0 0 2.869999885559082 12 710 | 3 0 0 0 4.289999961853027 30 711 | 3 1 1 0 3.4100000858306885 19 712 | 3 1 1 1 2.319999933242798 5 713 | 3 1 1 0 2.119999885559082 3 714 | 3 0 1 0 4.539999961853027 37 715 | 3 0 1 0 2.9600000381469727 8 716 | 3 0 0 0 2.869999885559082 12 717 | 3 0 1 0 2.319999933242798 4 718 | 3 0 1 2 1.7999999523162842 6 719 | 3 1 0 0 2.390000104904175 11 720 | 3 1 1 1 4.619999885559082 14 721 | 3 1 0 0 4.289999961853027 10 722 | 3 0 1 0 
4.539999961853027 5 723 | 3 0 0 0 3.5899999141693115 16 724 | 3 0 1 2 4.289999961853027 0 725 | 3 0 1 1 2.9600000381469727 5 726 | 3 0 1 2 4.289999961853027 25 727 | 3 0 1 1 4.289999961853027 18 728 | 3 0 1 1 2.5399999618530273 6 729 | 3 0 0 0 3.9200000762939453 0 730 | 3 0 0 0 2.319999933242798 8 731 | 3 1 0 0 4.619999885559082 22 732 | 3 1 1 0 3.75 6 733 | 3 0 1 2 1.9500000476837158 3 734 | 3 0 1 1 1.7999999523162842 4 735 | 3 0 1 0 2.759999990463257 8 736 | 3 0 0 0 3.5899999141693115 7 737 | 3 0 1 0 4.25 20 738 | 3 0 1 2 2.8299999237060547 7 739 | 3 1 0 0 2.5199999809265137 4 740 | 3 1 1 0 1.1799999475479126 0 741 | 3 1 1 0 3.359999895095825 11 742 | 3 1 1 0 4.289999961853027 21 743 | 3 1 1 1 4.619999885559082 49 744 | 3 0 0 0 2.8299999237060547 8 745 | 3 0 1 0 3.690000057220459 24 746 | 3 0 1 0 2.259999990463257 12 747 | 3 1 0 0 2.049999952316284 5 748 | 3 0 1 1 4.289999961853027 19 749 | 3 0 1 0 1.4199999570846558 3 750 | 3 1 1 2 1.8899999856948853 16 751 | 3 0 1 0 3.690000057220459 26 752 | 3 1 1 0 2.9600000381469727 22 753 | 3 1 1 1 1.8600000143051147 1 754 | 3 1 1 0 2.259999990463257 6 755 | 3 0 1 0 4.289999961853027 15 756 | 3 1 1 0 3.1500000953674316 4 757 | 3 0 1 0 2.609999895095825 0 758 | 3 1 0 0 4.539999961853027 3 759 | 3 0 1 1 2.2100000381469727 10 760 | 3 0 1 0 1.7799999713897705 0 761 | 3 0 1 1 3.619999885559082 5 762 | 3 0 1 0 3.9200000762939453 0 763 | 3 1 1 0 4.619999885559082 10 764 | 3 0 1 0 2.609999895095825 5 765 | 3 1 1 1 3.359999895095825 3 766 | 3 1 1 0 3.690000057220459 12 767 | 3 0 1 0 3.4700000286102295 14 768 | 3 0 1 0 1.25 4 769 | 3 0 1 0 4.619999885559082 18 770 | 3 0 1 1 3.5899999141693115 14 771 | 3 1 0 0 3.359999895095825 2 772 | 3 0 1 0 4.289999961853027 38 773 | 3 0 0 0 3.5399999618530273 12 774 | 3 0 1 2 2.5799999237060547 2 775 | 3 0 0 0 1.5199999809265137 3 776 | 3 0 1 0 3.4700000286102295 8 777 | 3 1 1 0 4.619999885559082 14 778 | 3 0 0 0 3.5399999618530273 25 779 | 3 0 1 2 2.0999999046325684 4 780 | 3 0 1 2 1.3799999952316284 8 781 | 3 0 1 1 3.0899999141693115 5 782 | 3 0 1 3 2.859999895095825 3 783 | 3 1 0 0 4.289999961853027 5 784 | 3 1 1 1 1.2200000286102295 3 785 | 4 0 1 0 2.8299999237060547 5 786 | 4 1 1 0 4.539999961853027 16 787 | 4 0 1 0 4.289999961853027 22 788 | 4 0 0 0 4.289999961853027 7 789 | 4 0 1 0 3.5399999618530273 9 790 | 4 0 0 0 2.5199999809265137 6 791 | 4 1 0 0 1.399999976158142 3 792 | 4 1 1 0 4.340000152587891 6 793 | 4 1 1 1 2.509999990463257 2 794 | 4 0 1 0 4.289999961853027 2 795 | 4 0 1 1 2.9600000381469727 0 796 | 4 0 0 0 3.8499999046325684 47 797 | 4 0 0 0 4.539999961853027 5 798 | 4 1 1 0 3.4000000953674316 5 799 | 4 0 0 0 3.4100000858306885 21 800 | 4 0 1 1 2.9600000381469727 57 801 | 4 1 1 0 3.5399999618530273 5 802 | 4 0 1 2 1.7200000286102295 2 803 | 4 0 1 2 2.859999895095825 15 804 | 4 0 1 2 4.539999961853027 66 805 | 4 0 1 1 1.7599999904632568 11 806 | 4 0 1 2 1.8600000143051147 15 807 | 4 0 1 0 3.4700000286102295 4 808 | 4 0 0 0 3.4100000858306885 7 809 | 4 0 1 0 1.809999942779541 10 810 | 4 0 1 2 2.049999952316284 13 811 | 4 0 0 0 4.619999885559082 18 812 | 4 1 1 0 2.259999990463257 3 813 | 4 0 1 1 1.8600000143051147 47 814 | 4 1 1 1 4.289999961853027 4 815 | 4 0 1 2 3.5399999618530273 2 816 | 4 0 1 0 4.289999961853027 30 817 | 4 0 1 0 2.5399999618530273 11 818 | 4 1 1 0 4.25 13 819 | 4 1 0 0 2.5 10 820 | 4 1 1 1 3.5399999618530273 12 821 | 4 1 1 0 3.190000057220459 8 822 | 4 0 1 2 4.539999961853027 21 823 | 4 1 0 0 2.859999895095825 39 824 | 4 1 1 0 2.5799999237060547 14 825 | 4 1 1 0 1.2799999713897705 4 
826 | 4 1 1 0 3.2100000381469727 14 827 | 4 1 0 0 2.869999885559082 6 828 | 4 1 0 0 3.3399999141693115 11 829 | 4 1 0 0 3.1500000953674316 8 830 | 4 1 0 0 2.319999933242798 8 831 | 4 0 0 0 4.289999961853027 19 832 | 4 0 0 0 3.690000057220459 7 833 | 4 0 1 1 2.549999952316284 21 834 | 4 1 0 0 4.619999885559082 3 835 | 4 1 1 0 2.359999895095825 5 836 | 4 0 1 1 4.25 29 837 | 4 0 1 1 4.289999961853027 5 838 | 4 1 1 0 3.190000057220459 7 839 | 4 1 0 0 3.190000057220459 3 840 | 4 1 0 0 2.559999942779541 23 841 | 4 0 1 0 1.7599999904632568 7 842 | 4 0 1 1 2.5799999237060547 25 843 | 4 1 1 0 3.4100000858306885 4 844 | 4 1 0 0 3.5899999141693115 9 845 | 4 1 1 0 1.7799999713897705 4 846 | 4 1 0 0 3.75 3 847 | 4 0 1 2 3.690000057220459 16 848 | 4 0 1 0 3.619999885559082 5 849 | 4 0 1 0 1.5049999952316284 4 850 | 4 0 1 2 2.5799999237060547 6 851 | 4 0 0 0 2.869999885559082 15 852 | 5 1 1 0 2.259999990463257 16 853 | 5 1 0 0 3.75 13 854 | 5 0 1 2 3.8499999046325684 15 855 | 5 0 1 1 3.1500000953674316 5 856 | 5 0 1 1 3.1500000953674316 27 857 | 5 0 1 0 4.340000152587891 6 858 | 5 0 1 0 4.25 12 859 | 5 0 0 0 3.359999895095825 11 860 | 5 1 0 0 3.5899999141693115 14 861 | 5 0 1 1 2.5799999237060547 25 862 | 5 0 1 1 3.5899999141693115 21 863 | 5 0 1 1 2.9600000381469727 4 864 | 5 0 1 0 4.539999961853027 13 865 | 5 0 1 0 4.289999961853027 30 866 | 5 1 0 0 2.869999885559082 12 867 | 5 1 1 0 4.539999961853027 15 868 | 5 1 1 1 3.8499999046325684 1 869 | 5 0 1 0 2.5 4 870 | 5 0 1 1 3.4700000286102295 0 871 | 5 1 1 0 2.859999895095825 24 872 | 5 1 1 0 4.619999885559082 0 873 | 5 0 1 1 4.539999961853027 53 874 | 5 0 1 0 1.25 2 875 | 5 1 1 0 3.2100000381469727 19 876 | 5 0 1 1 1.7599999904632568 11 877 | 5 1 0 0 2.5799999237060547 6 878 | 5 1 0 0 3.619999885559082 3 879 | 6 0 1 1 4.619999885559082 8 880 | 6 1 1 0 2.0999999046325684 36 881 | 6 1 1 2 1.8600000143051147 38 882 | 6 0 1 0 4.340000152587891 9 883 | 6 1 1 0 4.289999961853027 24 884 | 6 0 1 2 2.509999990463257 11 885 | 6 0 1 1 2.9600000381469727 13 886 | 6 0 0 0 4.289999961853027 18 887 | 6 0 0 0 3.4000000953674316 14 888 | 6 1 0 0 4.539999961853027 12 889 | 6 0 1 1 3.8499999046325684 16 890 | 6 1 0 0 3.1500000953674316 9 891 | 6 1 0 0 4.539999961853027 15 892 | 6 0 0 0 3.4700000286102295 6 893 | 6 1 1 0 4.289999961853027 1 894 | 6 0 0 0 1.9700000286102295 4 895 | 6 1 0 0 3.319999933242798 6 896 | 7 0 1 0 3.5899999141693115 1 897 | 7 0 0 0 2.5399999618530273 6 898 | 7 0 0 0 3.4100000858306885 20 899 | 7 0 1 1 1.9700000286102295 0 900 | 7 1 0 0 3.1500000953674316 9 901 | 7 0 0 0 4.619999885559082 15 902 | 7 0 0 0 4.539999961853027 42 903 | 7 0 1 0 3.690000057220459 9 904 | 7 0 0 0 4.340000152587891 19 905 | 7 0 0 0 4.289999961853027 19 906 | 7 0 1 1 3.5899999141693115 27 907 | 7 0 0 0 3.690000057220459 19 908 | 8 0 1 0 2.509999990463257 11 909 | 9 0 1 1 2.9600000381469727 23 910 | 9 0 1 1 1.8600000143051147 47 911 | 10 1 1 0 3.5899999141693115 18 912 | 11 0 1 2 2.859999895095825 7 913 | 12 0 1 1 4.289999961853027 35 914 | 12 0 1 1 1.8600000143051147 5 915 | 16 0 1 0 1.7400000095367432 21 916 | 19 0 1 0 1.8600000143051147 42 917 | -------------------------------------------------------------------------------- /data/test-biochemists-nb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from autoencoder.io import read_text 3 | from autoencoder.network import MLP 4 | from keras.callbacks import TensorBoard 5 | 6 | count = read_text('biochemists.tsv', header='infer') 7 | y = count[:, 0].astype(int) 8 
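# note: the first column of biochemists.tsv holds the article counts (the regression target below); the remaining columns are the covariates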
| x = count[:, 1:] 9 | 10 | net = MLP(x.shape[1], output_size=1, hidden_size=(), masking=False, loss_type='nb') 11 | net.build() 12 | model = net.model 13 | tb = TensorBoard(log_dir='./logs', histogram_freq=1) 14 | 15 | model.summary() 16 | model.compile(loss=net.loss, optimizer='Adam') 17 | model.fit(x, y, epochs=700, batch_size=32, callbacks=[tb]) 18 | 19 | 20 | print('Theta: %f' % net.extra_models['dispersion']()) 21 | -------------------------------------------------------------------------------- /data/test-biochemists-zinb-ae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from autoencoder.io import read_text, preprocess 4 | from autoencoder.api import autoencode 5 | import keras.backend as K 6 | 7 | # for full reproducibility 8 | np.random.seed(1) 9 | tf.set_random_seed(1) 10 | sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, 11 | inter_op_parallelism_threads=1)) 12 | K.set_session(sess) 13 | 14 | x = read_text('biochemists.tsv', header='infer') 15 | print(x.shape) 16 | 17 | # test API 18 | result = autoencode(x, 'test-ae', type='zinb-conddisp', hidden_size=(1,), epochs=3) 19 | -------------------------------------------------------------------------------- /data/test-biochemists-zinb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from autoencoder.io import read_text 3 | from autoencoder.network import MLP 4 | from keras.callbacks import TensorBoard 5 | 6 | count = read_text('biochemists.tsv', header='infer') 7 | y = count[:, 0].astype(int) 8 | x = count[:, 1:] 9 | 10 | net = MLP(x.shape[1], output_size=1, hidden_size=(), masking=False, loss_type='zinb') 11 | net.build() 12 | model = net.model 13 | tb = TensorBoard(log_dir='./logs', histogram_freq=1) 14 | 15 | model.summary() 16 | model.compile(loss=net.loss, optimizer='Adam') 17 | model.fit(x, y, epochs=700, batch_size=32, callbacks=[tb]) 18 | 19 | print('Theta: %f' % net.extra_models['dispersion']()) 20 | -------------------------------------------------------------------------------- /dca/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ['KERAS_BACKEND'] = 'tensorflow' 3 | -------------------------------------------------------------------------------- /dca/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os, sys, argparse 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Autoencoder') 20 | 21 | parser.add_argument('input', type=str, help='Input is raw count data in TSV/CSV ' 22 | 'or H5AD (anndata) format. ' 23 | 'Row/col names are mandatory. 
Note that TSV/CSV files must be in ' 24 | 'gene x cell layout where rows are genes and cols are cells (scRNA-seq ' 25 | 'convention). ' 26 | 'Use the -t/--transpose option if your count matrix is in cell x gene layout. ' 27 | 'H5AD files must be in cell x gene format (anndata and scanpy convention).') 28 | parser.add_argument('outputdir', type=str, help='The path of the output directory') 29 | 30 | # IO and norm options 31 | parser.add_argument('--normtype', type=str, default='zheng', 32 | help='Type of size factor estimation. Possible values: deseq, zheng.' 33 | ' (default: zheng)') 34 | parser.add_argument('-t', '--transpose', dest='transpose', 35 | action='store_true', help='Transpose input matrix (default: False)') 36 | parser.add_argument('--testsplit', dest='testsplit', 37 | action='store_true', help="Use one fold as a test set (default: False)") 38 | 39 | # training options 40 | parser.add_argument('--type', type=str, default='nb-conddisp', 41 | help="Type of autoencoder. Possible values: normal, poisson, nb, " 42 | "nb-shared, nb-conddisp (default), nb-fork, zinb, " 43 | "zinb-shared, zinb-conddisp, zinb-fork") 44 | parser.add_argument('--threads', type=int, default=None, 45 | help='Number of threads for training (default is all cores)') 46 | parser.add_argument('-b', '--batchsize', type=int, default=32, 47 | help="Batch size (default: 32)") 48 | parser.add_argument('--sizefactors', dest='sizefactors', 49 | action='store_true', help="Normalize means by library size (default: True)") 50 | parser.add_argument('--nosizefactors', dest='sizefactors', 51 | action='store_false', help="Do not normalize means by library size") 52 | parser.add_argument('--norminput', dest='norminput', 53 | action='store_true', help="Zero-mean normalize input (default: True)") 54 | parser.add_argument('--nonorminput', dest='norminput', 55 | action='store_false', help="Do not zero-mean normalize inputs") 56 | parser.add_argument('--loginput', dest='loginput', 57 | action='store_true', help="Log-transform input (default: True)") 58 | parser.add_argument('--nologinput', dest='loginput', 59 | action='store_false', help="Do not log-transform inputs") 60 | parser.add_argument('-d', '--dropoutrate', type=str, default='0.0', 61 | help="Dropout rate (default: 0.0)") 62 | parser.add_argument('--batchnorm', dest='batchnorm', action='store_true', 63 | help="Batchnorm (default: True)") 64 | parser.add_argument('--nobatchnorm', dest='batchnorm', action='store_false', 65 | help="Do not use batchnorm") 66 | parser.add_argument('--l2', type=float, default=0.0, 67 | help="L2 regularization coefficient (default: 0.0)") 68 | parser.add_argument('--l1', type=float, default=0.0, 69 | help="L1 regularization coefficient (default: 0.0)") 70 | parser.add_argument('--l2enc', type=float, default=0.0, 71 | help="Encoder-specific L2 regularization coefficient (default: 0.0)") 72 | parser.add_argument('--l1enc', type=float, default=0.0, 73 | help="Encoder-specific L1 regularization coefficient (default: 0.0)") 74 | parser.add_argument('--ridge', type=float, default=0.0, 75 | help="L2 regularization coefficient for dropout probabilities (default: 0.0)") 76 | parser.add_argument('--gradclip', type=float, default=5.0, 77 | help="Clip grad values (default: 5.0)") 78 | parser.add_argument('--activation', type=str, default='relu', 79 | help="Activation function of hidden units (default: relu)") 80 | parser.add_argument('--optimizer', type=str, default='RMSprop', 81 | help="Optimization method (default: RMSprop)") 82 | parser.add_argument('--init', 
type=str, default='glorot_uniform', 83 | help="Initialization method for weights (default: glorot_uniform)") 84 | parser.add_argument('-e', '--epochs', type=int, default=300, 85 | help="Maximum number of epochs to train " 86 | "(default: 300)") 87 | parser.add_argument('--earlystop', type=int, default=15, 88 | help="Stop training if validation loss does not improve " 89 | "for this many epochs (default: 15)") 90 | parser.add_argument('--reducelr', type=int, default=10, 91 | help="Reduce the learning rate if validation loss does not improve " 92 | "for this many epochs (default: 10)") 93 | parser.add_argument('-s', '--hiddensize', type=str, default='64,32,64', 94 | help="Size of hidden layers (default: 64,32,64)") 95 | parser.add_argument('--inputdropout', type=float, default=0.0, 96 | help="Input layer dropout probability (default: 0.0)") 97 | parser.add_argument('-r', '--learningrate', type=float, default=None, 98 | help="Learning rate (default: 0.001)") 99 | parser.add_argument('--saveweights', dest='saveweights', 100 | action='store_true', help="Save weights (default: False)") 101 | parser.add_argument('--no-saveweights', dest='saveweights', 102 | action='store_false', help="Do not save weights") 103 | parser.add_argument('--hyper', dest='hyper', 104 | action='store_true', help="Optimize hyperparameters (default: False)") 105 | parser.add_argument('--hypern', dest='hypern', type=int, default=1000, 106 | help="Number of samples drawn from hyperparameter distributions during optimization. " 107 | "(default: 1000)") 108 | parser.add_argument('--hyperepoch', dest='hyperepoch', type=int, default=100, 109 | help="Number of epochs used in each hyperpar optimization iteration. " 110 | "(default: 100)") 111 | parser.add_argument('--debug', dest='debug', 112 | action='store_true', help="Enable debugging. Checks whether every term in " 113 | "loss functions is finite. (default: False)") 114 | parser.add_argument('--tensorboard', dest='tensorboard', 115 | action='store_true', help="Use tensorboard for saving weight distributions and " 116 | "visualization. (default: False)") 117 | parser.add_argument('--checkcounts', dest='checkcounts', action='store_true', 118 | help="Check if the expression matrix has raw (unnormalized) counts (default: True)") 119 | parser.add_argument('--nocheckcounts', dest='checkcounts', action='store_false', 120 | help="Do not check if the expression matrix has raw (unnormalized) counts") 121 | parser.add_argument('--denoisesubset', dest='denoisesubset', type=str, 122 | help='Perform denoising only for the subset of genes ' 123 | 'in the given file. Gene names should be listed ' 124 | 'one per line.') 125 | 126 | parser.set_defaults(transpose=False, 127 | testsplit=False, 128 | saveweights=False, 129 | sizefactors=True, 130 | batchnorm=True, 131 | checkcounts=True, 132 | norminput=True, 133 | hyper=False, 134 | debug=False, 135 | tensorboard=False, 136 | loginput=True) 137 | 138 | return parser.parse_args() 139 | 140 | 141 | def main(): 142 | args = parse_args() 143 | 144 | try: 145 | import tensorflow as tf 146 | except ImportError: 147 | raise ImportError('DCA requires TensorFlow v2+. Please follow instructions' 148 | ' at https://www.tensorflow.org/install/ to install' 149 | ' it.') 150 | 151 | # import tf and the rest after parse_args() to make argparse help faster 152 | from . import train 153 | 154 | train.train_with_args(args) 155 |
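For orientation, two hypothetical invocations of this entry point; the file and directory names are placeholders, and the bare `dca` command assumes that setup.py registers `main()` as a console script (otherwise `python -m dca` behaves analogously):

    # genes x cells TSV, default nb-conddisp autoencoder
    dca counts.tsv results/

    # cell x gene input, ZINB loss, four training threads
    dca -t --type zinb-conddisp --threads 4 counts.tsv results/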
-------------------------------------------------------------------------------- /dca/api.py: -------------------------------------------------------------------------------- 1 | import os, tempfile, shutil, random 2 | import anndata 3 | import numpy as np 4 | import scanpy as sc 5 | 6 | try: 7 | import tensorflow as tf 8 | except ImportError: 9 | raise ImportError('DCA requires TensorFlow v2+. Please follow instructions' 10 | ' at https://www.tensorflow.org/install/ to install' 11 | ' it.') 12 | 13 | 14 | from .io import read_dataset, normalize 15 | from .train import train 16 | from .network import AE_types 17 | 18 | 19 | def dca(adata, 20 | mode='denoise', 21 | ae_type='nb-conddisp', 22 | normalize_per_cell=True, 23 | scale=True, 24 | log1p=True, 25 | hidden_size=(64, 32, 64), # network args 26 | hidden_dropout=0., 27 | batchnorm=True, 28 | activation='relu', 29 | init='glorot_uniform', 30 | network_kwds={}, 31 | epochs=300, # training args 32 | reduce_lr=10, 33 | early_stop=15, 34 | batch_size=32, 35 | optimizer='RMSprop', 36 | learning_rate=None, 37 | random_state=0, 38 | threads=None, 39 | verbose=False, 40 | training_kwds={}, 41 | return_model=False, 42 | return_info=False, 43 | copy=False, 44 | check_counts=True, 45 | ): 46 | """Deep count autoencoder (DCA) API. 47 | 48 | Fits a count autoencoder to the count data given in the anndata object 49 | in order to denoise the data and to capture a hidden representation of 50 | cells in low dimensions. The type of the autoencoder and the return values 51 | are determined by the parameters. 52 | 53 | Parameters 54 | ---------- 55 | adata : :class:`~scanpy.api.AnnData` 56 | An anndata object with raw counts in `adata.X` (preserved in `.raw`). 57 | mode : `str`, optional. `denoise` (default) or `latent`. 58 | `denoise` overwrites `adata.X` with denoised expression values. 59 | In `latent` mode DCA adds `adata.obsm['X_dca']` to the given adata 60 | object. This matrix represents the latent representation of cells via DCA. 61 | ae_type : `str`, optional. `nb-conddisp` (default), `zinb`, `zinb-conddisp` or `nb`. 62 | Type of the autoencoder. Return values and the architecture are 63 | determined by the type, e.g. `nb` does not provide dropout 64 | probabilities. 65 | normalize_per_cell : `bool`, optional. Default: `True`. 66 | If true, library size normalization is performed using 67 | the `sc.pp.normalize_per_cell` function in Scanpy and saved into the adata 68 | object. The mean layer then re-introduces library size differences by 69 | scaling the mean value of each cell in the output layer. See the 70 | manuscript for more details. 71 | scale : `bool`, optional. Default: `True`. 72 | If true, the input of the autoencoder is centered using the 73 | `sc.pp.scale` function of Scanpy. Note that the output is kept as raw 74 | counts as the loss functions are designed for count data. 75 | log1p : `bool`, optional. Default: `True`. 76 | If true, the input of the autoencoder is log transformed with a 77 | pseudocount of one using the `sc.pp.log1p` function of Scanpy. 78 | hidden_size : `tuple` or `list`, optional. Default: (64, 32, 64). 79 | Width of hidden layers. 80 | hidden_dropout : `float`, `tuple` or `list`, optional. Default: 0.0. 81 | Probability of weight dropout in the autoencoder (per layer if list 82 | or tuple). 83 | batchnorm : `bool`, optional. Default: `True`. 84 | If true, batch normalization is performed. 85 | activation : `str`, optional. Default: `relu`. 86 | Activation function of hidden layers. 
87 | init : `str`, optional. Default: `glorot_uniform`. 88 | Initialization method used to initialize weights. 89 | network_kwds : `dict`, optional. 90 | Additional keyword arguments for the autoencoder. 91 | epochs : `int`, optional. Default: 300. 92 | Number of total epochs in training. 93 | reduce_lr : `int`, optional. Default: 10. 94 | Reduces learning rate if validation loss does not improve in the given number of epochs. 95 | early_stop : `int`, optional. Default: 15. 96 | Stops training if validation loss does not improve in the given number of epochs. 97 | batch_size : `int`, optional. Default: 32. 98 | Number of samples in the batch used for SGD. 99 | learning_rate : `float`, optional. Default: None. 100 | Learning rate to use in the training. 101 | optimizer : `str`, optional. Default: "RMSprop". 102 | Type of optimization method used for training. 103 | random_state : `int`, optional. Default: 0. 104 | Seed for Python, NumPy and TensorFlow. 105 | threads : `int` or None, optional. Default: None. 106 | Number of threads to use in training. All cores are used by default. 107 | verbose : `bool`, optional. Default: `False`. 108 | If true, prints additional information about training and architecture. 109 | training_kwds : `dict`, optional. 110 | Additional keyword arguments for the training process. 111 | return_model : `bool`, optional. Default: `False`. 112 | If true, the trained autoencoder object is returned. See "Returns". 113 | return_info : `bool`, optional. Default: `False`. 114 | If true, all additional parameters of DCA are stored in `adata.obsm` such as dropout 115 | probabilities (obsm['X_dca_dropout']) and estimated dispersion values 116 | (obsm['X_dca_dispersion']), in case the autoencoder is of type 117 | zinb or zinb-conddisp. 118 | copy : `bool`, optional. Default: `False`. 119 | If true, a copy of the anndata object is returned. 120 | check_counts : `bool`. Default: `True`. 121 | Check if the counts are unnormalized (raw) counts. 122 | 123 | Returns 124 | ------- 125 | If `copy` is true and `return_model` is false, an AnnData object is returned. 126 | 127 | In "denoise" mode, `adata.X` is overwritten with the denoised values. In "latent" mode, the latent 128 | low-dimensional representation of cells is stored in `adata.obsm['X_dca']` and `adata.X` 129 | is not modified. Note that these values are not corrected for library size effects. 130 | 131 | If `return_info` is true, all estimated distribution parameters are stored in AnnData, such as: 132 | 133 | - `.obsm["X_dca_dropout"]`, which is the mixture coefficient (pi) of the zero component 134 | in ZINB, i.e. the dropout probability. (Only if ae_type is zinb or zinb-conddisp) 135 | 136 | - `.obsm["X_dca_dispersion"]`, which is the dispersion parameter of the NB. 137 | 138 | - `.uns["dca_loss_history"]`, which stores the loss history of the training. 139 | 140 | Finally, the raw counts are stored as `.raw`. 141 | 142 | If `return_model` is true, the trained model is returned. When both `copy` and `return_model` 143 | are true, a tuple of the anndata object and the model is returned, in that order. 144 | """ 145 | 146 | assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance' 147 | assert mode in ('denoise', 'latent'), '%s is not a valid mode.' 
% mode 148 | 149 | # set seed for reproducibility 150 | random.seed(random_state) 151 | np.random.seed(random_state) 152 | tf.random.set_seed(random_state) 153 | os.environ['PYTHONHASHSEED'] = '0' 154 | 155 | # this creates adata.raw with raw counts and copies adata if copy==True 156 | adata = read_dataset(adata, 157 | transpose=False, 158 | test_split=False, 159 | copy=copy, 160 | check_counts=check_counts) 161 | 162 | # check for zero genes 163 | nonzero_genes, _ = sc.pp.filter_genes(adata.X, min_counts=1) 164 | assert nonzero_genes.all(), 'Please remove all-zero genes before using DCA.' 165 | 166 | adata = normalize(adata, 167 | filter_min_counts=False, # no filtering, keep cell and gene idxs same 168 | size_factors=normalize_per_cell, 169 | normalize_input=scale, 170 | logtrans_input=log1p) 171 | 172 | network_kwds = {**network_kwds, 173 | 'hidden_size': hidden_size, 174 | 'hidden_dropout': hidden_dropout, 175 | 'batchnorm': batchnorm, 176 | 'activation': activation, 177 | 'init': init 178 | } 179 | 180 | from tensorflow.python.framework.ops import disable_eager_execution 181 | disable_eager_execution() 182 | 183 | input_size = output_size = adata.n_vars 184 | net = AE_types[ae_type](input_size=input_size, 185 | output_size=output_size, 186 | **network_kwds) 187 | net.save() 188 | net.build() 189 | 190 | training_kwds = {**training_kwds, 191 | 'epochs': epochs, 192 | 'reduce_lr': reduce_lr, 193 | 'early_stop': early_stop, 194 | 'batch_size': batch_size, 195 | 'optimizer': optimizer, 196 | 'verbose': verbose, 197 | 'threads': threads, 198 | 'learning_rate': learning_rate 199 | } 200 | 201 | hist = train(adata[adata.obs.dca_split == 'train'], net, **training_kwds) 202 | res = net.predict(adata, mode, return_info, copy) 203 | adata = res if copy else adata 204 | 205 | if return_info: 206 | adata.uns['dca_loss_history'] = hist.history 207 | 208 | if return_model: 209 | return (adata, net) if copy else net 210 | else: 211 | return adata if copy else None 212 |
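A minimal usage sketch of the `dca()` function defined above; the toy count matrix and all variable names are illustrative assumptions, not part of the package:

    import anndata
    import numpy as np
    import scanpy as sc
    from dca.api import dca

    # toy raw-count matrix: 200 cells x 100 genes of overdispersed counts
    counts = np.random.negative_binomial(2, 0.3, size=(200, 100)).astype('float32')
    adata = anndata.AnnData(counts)
    sc.pp.filter_genes(adata, min_counts=1)   # dca() asserts all-zero genes were removed

    dca(adata, threads=1)                     # default 'denoise' mode overwrites adata.X in place
    denoised = adata.X                        # denoised expression, same shape as the input

    # 'latent' mode instead stores a low-dimensional representation of each cell
    adata2 = anndata.AnnData(counts)
    sc.pp.filter_genes(adata2, min_counts=1)
    dca(adata2, mode='latent')
    z = adata2.obsm['X_dca']                  # 32-dimensional under the default (64, 32, 64) sizes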
-------------------------------------------------------------------------------- /dca/hyper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import json 4 | 5 | import numpy as np 6 | from kopt import CompileFN, test_fn 7 | from hyperopt import fmin, tpe, hp, Trials 8 | import keras.optimizers as opt 9 | 10 | from . import io 11 | from .network import AE_types 12 | 13 | 14 | def hyper(args): 15 | adata = io.read_dataset(args.input, 16 | transpose=args.transpose, 17 | test_split=False) 18 | 19 | hyper_params = { 20 | "data": { 21 | "norm_input_log": hp.choice('d_norm_log', (True, False)), 22 | "norm_input_zeromean": hp.choice('d_norm_zeromean', (True, False)), 23 | "norm_input_sf": hp.choice('d_norm_sf', (True, False)), 24 | }, 25 | "model": { 26 | "lr": hp.loguniform("m_lr", np.log(1e-3), np.log(1e-2)), 27 | "ridge": hp.loguniform("m_ridge", np.log(1e-7), np.log(1e-1)), 28 | "l1_enc_coef": hp.loguniform("m_l1_enc_coef", np.log(1e-7), np.log(1e-1)), 29 | "hidden_size": hp.choice("m_hiddensize", ((64,32,64), (32,16,32), 30 | (64,64), (32,32), (16,16), 31 | (16,), (32,), (64,), (128,))), 32 | "activation": hp.choice("m_activation", ('relu', 'selu', 'elu', 33 | 'PReLU', 'linear', 'LeakyReLU')), 34 | "aetype": hp.choice("m_aetype", ('zinb', 'zinb-conddisp')), 35 | "batchnorm": hp.choice("m_batchnorm", (True, False)), 36 | "dropout": hp.uniform("m_do", 0, 0.7), 37 | "input_dropout": hp.uniform("m_input_do", 0, 0.8), 38 | }, 39 | "fit": { 40 | "epochs": args.hyperepoch 41 | } 42 | } 43 | 44 | def data_fn(norm_input_log, norm_input_zeromean, norm_input_sf): 45 | 46 | ad = adata.copy() 47 | ad = io.normalize(ad, 48 | size_factors=norm_input_sf, 49 | logtrans_input=norm_input_log, 50 | normalize_input=norm_input_zeromean) 51 | 52 | x_train = {'count': ad.X, 'size_factors': ad.obs.size_factors} 53 | y_train = ad.raw.X 54 | 55 | return (x_train, y_train), 56 | 57 | def model_fn(train_data, lr, hidden_size, activation, aetype, batchnorm, 58 | dropout, input_dropout, ridge, l1_enc_coef): 59 | 60 | net = AE_types[aetype](train_data[1].shape[1], 61 | hidden_size=hidden_size, 62 | l2_coef=0.0, 63 | l1_coef=0.0, 64 | l2_enc_coef=0.0, 65 | l1_enc_coef=l1_enc_coef, 66 | ridge=ridge, 67 | hidden_dropout=dropout, 68 | input_dropout=input_dropout, 69 | batchnorm=batchnorm, 70 | activation=activation, 71 | init='glorot_uniform', 72 | debug=args.debug) 73 | net.build() 74 | net.model.summary() 75 | 76 | optimizer = opt.__dict__['RMSprop'](lr=lr, clipvalue=5.0) 77 | net.model.compile(loss=net.loss, optimizer=optimizer) 78 | 79 | return net.model 80 | 81 | output_dir = os.path.join(args.outputdir, 'hyperopt_results') 82 | objective = CompileFN('autoencoder_hyperpar_db', 'myexp1', 83 | data_fn=data_fn, 84 | model_fn=model_fn, 85 | loss_metric='loss', 86 | loss_metric_mode='min', 87 | valid_split=.2, 88 | save_model=None, 89 | save_results=True, 90 | use_tensorboard=False, 91 | save_dir=output_dir) 92 | 93 | test_fn(objective, hyper_params, save_model=None) 94 | 95 | trials = Trials() 96 | best = fmin(objective, 97 | hyper_params, 98 | trials=trials, 99 | algo=tpe.suggest, 100 | max_evals=args.hypern, 101 | catch_eval_exceptions=True) 102 | 103 | with open(os.path.join(output_dir, 'trials.pickle'), 'wb') as f: 104 | pickle.dump(trials, f) 105 | 106 | #TODO: map indices in "best" back to choice-based hyperpars before saving 107 | with open(os.path.join(output_dir, 'best.json'), 'wt') as f: 108 | json.dump(best, f, sort_keys=True, indent=4) 109 | 110 | print(best) 111 | 112 | #TODO: not just save the best conf but also train the model with these params 113 | -------------------------------------------------------------------------------- /dca/io.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import pickle, os, numbers 22 | 23 | import numpy as np 24 | import scipy as sp 25 | import pandas as pd 26 | import scanpy as sc 27 | from sklearn.model_selection import train_test_split 28 | from sklearn.preprocessing import scale 29 | 30 | 31 | #TODO: Fix this 32 | class AnnSequence: 33 | def __init__(self, matrix, batch_size, sf=None): 34 | self.matrix = matrix 35 | if sf is None: 36 | self.size_factors = np.ones((self.matrix.shape[0], 1), 37 | dtype=np.float32) 38 | else: 39 | self.size_factors = sf 40 | self.batch_size = batch_size 41 | 42 | def __len__(self): 43 | return len(self.matrix) // self.batch_size 44 | 45 | def __getitem__(self, idx): 46 | batch = self.matrix[idx*self.batch_size:(idx+1)*self.batch_size] 47 | batch_sf = self.size_factors[idx*self.batch_size:(idx+1)*self.batch_size] 48 | 49 | # return an (X, Y) pair 50 | return {'count': batch, 'size_factors': batch_sf}, batch 51 | 52 | 53 | def read_dataset(adata, transpose=False, test_split=False, copy=False, check_counts=True): 54 | 55 | if isinstance(adata, sc.AnnData): 56 | if copy: 57 | adata = adata.copy() 58 | elif isinstance(adata, str): 59 | adata = sc.read(adata, first_column_names=True) 60 | else: 61 | raise NotImplementedError 62 | 63 | if check_counts: 64 | # check if observations are unnormalized using first 10 65 | X_subset = adata.X[:10] 66 | norm_error = 'Make sure that the dataset (adata.X) contains unnormalized count data.' 
67 | if sp.sparse.issparse(X_subset): 68 | assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error 69 | else: 70 | assert np.all(X_subset.astype(int) == X_subset), norm_error 71 | 72 | if transpose: adata = adata.transpose() 73 | 74 | if test_split: 75 | train_idx, test_idx = train_test_split(np.arange(adata.n_obs), test_size=0.1, random_state=42) 76 | spl = pd.Series(['train'] * adata.n_obs) 77 | spl.iloc[test_idx] = 'test' 78 | adata.obs['dca_split'] = spl.values 79 | else: 80 | adata.obs['dca_split'] = 'train' 81 | 82 | adata.obs['dca_split'] = adata.obs['dca_split'].astype('category') 83 | print('dca: Successfully preprocessed {} genes and {} cells.'.format(adata.n_vars, adata.n_obs)) 84 | 85 | return adata 86 | 87 | 88 | def normalize(adata, filter_min_counts=True, size_factors=True, normalize_input=True, logtrans_input=True): 89 | 90 | if filter_min_counts: 91 | sc.pp.filter_genes(adata, min_counts=1) 92 | sc.pp.filter_cells(adata, min_counts=1) 93 | 94 | if size_factors or normalize_input or logtrans_input: 95 | adata.raw = adata.copy() 96 | else: 97 | adata.raw = adata 98 | 99 | if size_factors: 100 | sc.pp.normalize_per_cell(adata) 101 | adata.obs['size_factors'] = adata.obs.n_counts / np.median(adata.obs.n_counts) 102 | else: 103 | adata.obs['size_factors'] = 1.0 104 | 105 | if logtrans_input: 106 | sc.pp.log1p(adata) 107 | 108 | if normalize_input: 109 | sc.pp.scale(adata) 110 | 111 | return adata 112 | 113 | def read_genelist(filename): 114 | genelist = list(set(open(filename, 'rt').read().strip().split('\n'))) 115 | assert len(genelist) > 0, 'No genes detected in genelist file' 116 | print('dca: Subset of {} genes will be denoised.'.format(len(genelist))) 117 | 118 | return genelist 119 | 120 | def write_text_matrix(matrix, filename, rownames=None, colnames=None, transpose=False): 121 | if transpose: 122 | matrix = matrix.T 123 | rownames, colnames = colnames, rownames 124 | 125 | pd.DataFrame(matrix, index=rownames, columns=colnames).to_csv(filename, 126 | sep='\t', 127 | index=(rownames is not None), 128 | header=(colnames is not None), 129 | float_format='%.6f') 130 | def read_pickle(inputfile): 131 | return pickle.load(open(inputfile, "rb")) 132 | -------------------------------------------------------------------------------- /dca/layers.py: -------------------------------------------------------------------------------- 1 | from keras.engine.topology import Layer 2 | from keras.layers import Lambda, Dense 3 | from keras.engine.base_layer import InputSpec 4 | from keras import backend as K 5 | import tensorflow as tf 6 | 7 | 8 | class ConstantDispersionLayer(Layer): 9 | ''' 10 | An identity layer which allows us to inject extra parameters 11 | such as dispersion to Keras models 12 | ''' 13 | def __init__(self, **kwargs): 14 | super().__init__(**kwargs) 15 | 16 | def build(self, input_shape): 17 | self.theta = self.add_weight(shape=(1, input_shape[1]), 18 | initializer='zeros', 19 | trainable=True, 20 | name='theta') 21 | self.theta_exp = tf.clip_by_value(K.exp(self.theta), 1e-3, 1e4) 22 | super().build(input_shape) 23 | 24 | def call(self, x): 25 | return tf.identity(x) 26 | 27 | def compute_output_shape(self, input_shape): 28 | return input_shape 29 | 30 | 31 | class SliceLayer(Layer): 32 | def __init__(self, index, **kwargs): 33 | self.index = index 34 | super().__init__(**kwargs) 35 | 36 | def build(self, input_shape): 37 | if not isinstance(input_shape, list): 38 | raise ValueError('Input should be a list') 39 | 40 | super().build(input_shape) 41 | 42 | def 
call(self, x): 43 | assert isinstance(x, list), 'SliceLayer input is not a list' 44 | return x[self.index] 45 | 46 | def compute_output_shape(self, input_shape): 47 | return input_shape[self.index] 48 | 49 | 50 | class ElementwiseDense(Dense): 51 | def build(self, input_shape): 52 | assert len(input_shape) >= 2 53 | input_dim = input_shape[-1] 54 | assert (input_dim == self.units) or (self.units == 1), \ 55 | "Input and output dims are not compatible" 56 | 57 | # shape=(self.units,) makes this elementwise because of broadcasting 58 | self.kernel = self.add_weight(shape=(self.units,), 59 | initializer=self.kernel_initializer, 60 | name='kernel', 61 | regularizer=self.kernel_regularizer, 62 | constraint=self.kernel_constraint) 63 | if self.use_bias: 64 | self.bias = self.add_weight(shape=(self.units,), 65 | initializer=self.bias_initializer, 66 | name='bias', 67 | regularizer=self.bias_regularizer, 68 | constraint=self.bias_constraint) 69 | else: 70 | self.bias = None 71 | self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim}) 72 | self.built = True 73 | 74 | def call(self, inputs): 75 | # use * instead of tf.matmul, we need broadcasting here 76 | output = inputs * self.kernel 77 | if self.use_bias: 78 | output = output + self.bias 79 | if self.activation is not None: 80 | output = self.activation(output) 81 | return output 82 | 83 | 84 | nan2zeroLayer = Lambda(lambda x: tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)) 85 | ColwiseMultLayer = Lambda(lambda l: l[0]*tf.reshape(l[1], (-1,1))) 86 | -------------------------------------------------------------------------------- /dca/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from keras import backend as K 4 | 5 | 6 | def _nan2zero(x): 7 | return tf.where(tf.math.is_nan(x), tf.zeros_like(x), x) 8 | 9 | def _nan2inf(x): 10 | return tf.where(tf.math.is_nan(x), tf.zeros_like(x)+np.inf, x) 11 | 12 | def _nelem(x): 13 | nelem = tf.reduce_sum(tf.cast(~tf.math.is_nan(x), tf.float32)) 14 | return tf.cast(tf.where(tf.equal(nelem, 0.), 1., nelem), x.dtype) 15 | 16 | 17 | def _reduce_mean(x): 18 | nelem = _nelem(x) 19 | x = _nan2zero(x) 20 | return tf.divide(tf.reduce_sum(x), nelem) 21 | 22 | 23 | def mse_loss(y_true, y_pred): 24 | ret = tf.square(y_pred - y_true) 25 | 26 | return _reduce_mean(ret) 27 | 28 | 29 | # In the implementations, I try to keep the function signature 30 | # similar to those of Keras objective functions so that 31 | # later on we can use them in Keras smoothly: 32 | # https://github.com/fchollet/keras/blob/master/keras/objectives.py#L7 33 | def poisson_loss(y_true, y_pred): 34 | y_pred = tf.cast(y_pred, tf.float32) 35 | y_true = tf.cast(y_true, tf.float32) 36 | 37 | # we can use the Poisson PMF from TensorFlow as well 38 | # dist = tf.contrib.distributions 39 | # return -tf.reduce_mean(dist.Poisson(y_pred).log_pmf(y_true)) 40 | 41 | nelem = _nelem(y_true) 42 | y_true = _nan2zero(y_true) 43 | 44 | # last term can be avoided since it doesn't depend on y_pred 45 | # however keeping it gives a nice lower bound to zero 46 | ret = y_pred - y_true*tf.math.log(y_pred+1e-10) + tf.math.lgamma(y_true+1.0) 47 | 48 | return tf.divide(tf.reduce_sum(ret), nelem) 49 | 50 | 51 | # We need a class (or closure) here, 52 | # because it's not possible to 53 | # pass extra arguments to Keras loss functions 54 | # See https://github.com/fchollet/keras/issues/2121 55 | 56 | # dispersion (theta) parameter is a scalar by default. 
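# [illustrative aside, not part of the original module: NB.loss below computes,
#  up to the eps stabilizers, the negative log-likelihood of NB(mean=mu,
#  dispersion=theta),
#      -log NB(y; mu, theta) = lgamma(theta) + lgamma(y+1) - lgamma(y+theta)
#                              + (theta+y)*log(1 + mu/theta)
#                              + y*(log(theta) - log(mu)),
#  split into t1 (the lgamma terms) and t2. A quick NumPy/SciPy sanity check,
#  with p = theta/(theta+mu) in scipy's parameterization:
#      import numpy as np
#      from scipy.special import gammaln
#      from scipy.stats import nbinom
#      y, mu, theta = 3.0, 2.0, 5.0
#      t1 = gammaln(theta) + gammaln(y + 1) - gammaln(y + theta)
#      t2 = (theta + y)*np.log1p(mu/theta) + y*(np.log(theta) - np.log(mu))
#      assert np.isclose(t1 + t2, -nbinom.logpmf(y, theta, theta/(theta + mu)))
# ]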
57 | # scale_factor scales the nbinom mean before the 58 | # calculation of the loss to balance the 59 | # learning rates of theta and network weights 60 | class NB(object): 61 | def __init__(self, theta=None, masking=False, scope='nbinom_loss/', 62 | scale_factor=1.0, debug=False): 63 | 64 | # for numerical stability 65 | self.eps = 1e-10 66 | self.scale_factor = scale_factor 67 | self.debug = debug 68 | self.scope = scope 69 | self.masking = masking 70 | self.theta = theta 71 | 72 | def loss(self, y_true, y_pred, mean=True): 73 | scale_factor = self.scale_factor 74 | eps = self.eps 75 | 76 | with tf.name_scope(self.scope): 77 | y_true = tf.cast(y_true, tf.float32) 78 | y_pred = tf.cast(y_pred, tf.float32) * scale_factor 79 | 80 | if self.masking: 81 | nelem = _nelem(y_true) 82 | y_true = _nan2zero(y_true) 83 | 84 | # Clip theta 85 | theta = tf.minimum(self.theta, 1e6) 86 | 87 | t1 = tf.math.lgamma(theta+eps) + tf.math.lgamma(y_true+1.0) - tf.math.lgamma(y_true+theta+eps) 88 | t2 = (theta+y_true) * tf.math.log(1.0 + (y_pred/(theta+eps))) + (y_true * (tf.math.log(theta+eps) - tf.math.log(y_pred+eps))) 89 | 90 | if self.debug: 91 | assert_ops = [ 92 | tf.verify_tensor_all_finite(y_pred, 'y_pred has inf/nans'), 93 | tf.verify_tensor_all_finite(t1, 't1 has inf/nans'), 94 | tf.verify_tensor_all_finite(t2, 't2 has inf/nans')] 95 | 96 | tf.summary.histogram('t1', t1) 97 | tf.summary.histogram('t2', t2) 98 | 99 | with tf.control_dependencies(assert_ops): 100 | final = t1 + t2 101 | 102 | else: 103 | final = t1 + t2 104 | 105 | final = _nan2inf(final) 106 | 107 | if mean: 108 | if self.masking: 109 | final = tf.divide(tf.reduce_sum(final), nelem) 110 | else: 111 | final = tf.reduce_mean(final) 112 | 113 | 114 | return final 115 | 116 | class ZINB(NB): 117 | def __init__(self, pi, ridge_lambda=0.0, scope='zinb_loss/', **kwargs): 118 | super().__init__(scope=scope, **kwargs) 119 | self.pi = pi 120 | self.ridge_lambda = ridge_lambda 121 | 122 | def loss(self, y_true, y_pred, mean=True): 123 | scale_factor = self.scale_factor 124 | eps = self.eps 125 | 126 | with tf.name_scope(self.scope): 127 | # reuse existing NB neg.log.lik. 128 | # mean is always False here, because everything is calculated 129 | # element-wise. 
we take the mean only in the end 130 | nb_case = super().loss(y_true, y_pred, mean=False) - tf.math.log(1.0-self.pi+eps) 131 | 132 | y_true = tf.cast(y_true, tf.float32) 133 | y_pred = tf.cast(y_pred, tf.float32) * scale_factor 134 | theta = tf.minimum(self.theta, 1e6) 135 | 136 | zero_nb = tf.pow(theta/(theta+y_pred+eps), theta) 137 | zero_case = -tf.math.log(self.pi + ((1.0-self.pi)*zero_nb)+eps) 138 | result = tf.where(tf.less(y_true, 1e-8), zero_case, nb_case) 139 | ridge = self.ridge_lambda*tf.square(self.pi) 140 | result += ridge 141 | 142 | if mean: 143 | if self.masking: 144 | result = _reduce_mean(result) 145 | else: 146 | result = tf.reduce_mean(result) 147 | 148 | result = _nan2inf(result) 149 | 150 | if self.debug: 151 | tf.summary.histogram('nb_case', nb_case) 152 | tf.summary.histogram('zero_nb', zero_nb) 153 | tf.summary.histogram('zero_case', zero_case) 154 | tf.summary.histogram('ridge', ridge) 155 | 156 | return result 157 | -------------------------------------------------------------------------------- /dca/network.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import os 17 | import pickle 18 | from abc import ABCMeta, abstractmethod 19 | 20 | import numpy as np 21 | import scanpy as sc 22 | 23 | import keras 24 | from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda 25 | from keras.models import Model 26 | from keras.regularizers import l1_l2 27 | from keras.objectives import mean_squared_error 28 | from keras.initializers import Constant 29 | from keras import backend as K 30 | 31 | import tensorflow as tf 32 | 33 | from .loss import poisson_loss, NB, ZINB 34 | from .layers import ConstantDispersionLayer, SliceLayer, ColwiseMultLayer, ElementwiseDense 35 | from .io import write_text_matrix 36 | 37 | 38 | MeanAct = lambda x: tf.clip_by_value(K.exp(x), 1e-5, 1e6) 39 | DispAct = lambda x: tf.clip_by_value(tf.nn.softplus(x), 1e-4, 1e4) 40 | 41 | advanced_activations = ('PReLU', 'LeakyReLU') 42 | 43 | class Autoencoder(): 44 | def __init__(self, 45 | input_size, 46 | output_size=None, 47 | hidden_size=(64, 32, 64), 48 | l2_coef=0., 49 | l1_coef=0., 50 | l2_enc_coef=0., 51 | l1_enc_coef=0., 52 | ridge=0., 53 | hidden_dropout=0., 54 | input_dropout=0., 55 | batchnorm=True, 56 | activation='relu', 57 | init='glorot_uniform', 58 | file_path=None, 59 | debug=False): 60 | 61 | self.input_size = input_size 62 | self.output_size = output_size 63 | self.hidden_size = hidden_size 64 | self.l2_coef = l2_coef 65 | self.l1_coef = l1_coef 66 | self.l2_enc_coef = l2_enc_coef 67 | self.l1_enc_coef = l1_enc_coef 68 | self.ridge = ridge 69 | self.hidden_dropout = hidden_dropout 70 | self.input_dropout = input_dropout 71 | self.batchnorm = batchnorm 72 | self.activation = activation 73 | self.init = init 74 | 
self.loss = None 75 | self.file_path = file_path 76 | self.extra_models = {} 77 | self.model = None 78 | self.encoder = None 79 | self.decoder = None 80 | self.input_layer = None 81 | self.sf_layer = None 82 | self.debug = debug 83 | 84 | if self.output_size is None: 85 | self.output_size = input_size 86 | 87 | if isinstance(self.hidden_dropout, list): 88 | assert len(self.hidden_dropout) == len(self.hidden_size) 89 | else: 90 | self.hidden_dropout = [self.hidden_dropout]*len(self.hidden_size) 91 | 92 | def build(self): 93 | 94 | self.input_layer = Input(shape=(self.input_size,), name='count') 95 | self.sf_layer = Input(shape=(1,), name='size_factors') 96 | last_hidden = self.input_layer 97 | 98 | if self.input_dropout > 0.0: 99 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 100 | 101 | for i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 102 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 103 | if i == center_idx: 104 | layer_name = 'center' 105 | stage = 'center' # let downstream know where we are 106 | elif i < center_idx: 107 | layer_name = 'enc%s' % i 108 | stage = 'encoder' 109 | else: 110 | layer_name = 'dec%s' % (i-center_idx) 111 | stage = 'decoder' 112 | 113 | # use encoder-specific l1/l2 reg coefs if given 114 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 115 | l1 = self.l1_enc_coef 116 | else: 117 | l1 = self.l1_coef 118 | 119 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 120 | l2 = self.l2_enc_coef 121 | else: 122 | l2 = self.l2_coef 123 | 124 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 125 | kernel_regularizer=l1_l2(l1, l2), 126 | name=layer_name)(last_hidden) 127 | if self.batchnorm: 128 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 129 | 130 | # Use separate act. 
layers to give the user the option to get pre-activations 131 | # of layers when requested 132 | if self.activation in advanced_activations: 133 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 134 | else: 135 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 136 | 137 | if hid_drop > 0.0: 138 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 139 | 140 | self.decoder_output = last_hidden 141 | self.build_output() 142 | 143 | def build_output(self): 144 | 145 | self.loss = mean_squared_error 146 | mean = Dense(self.output_size, kernel_initializer=self.init, 147 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 148 | name='mean')(self.decoder_output) 149 | output = ColwiseMultLayer([mean, self.sf_layer]) 150 | 151 | # keep unscaled output as an extra model 152 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 153 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 154 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 155 | 156 | self.encoder = self.get_encoder() 157 | 158 | def save(self): 159 | if self.file_path: 160 | os.makedirs(self.file_path, exist_ok=True) 161 | with open(os.path.join(self.file_path, 'model.pickle'), 'wb') as f: 162 | pickle.dump(self, f) 163 | 164 | def load_weights(self, filename): 165 | self.model.load_weights(filename) 166 | self.encoder = self.get_encoder() 167 | self.decoder = None # get_decoder() 168 | 169 | def get_decoder(self): 170 | i = 0 171 | for l in self.model.layers: 172 | if l.name == 'center_drop': 173 | break 174 | i += 1 175 | 176 | return Model(inputs=self.model.get_layer(index=i+1).input, 177 | outputs=self.model.output) 178 | 179 | def get_encoder(self, activation=False): 180 | if activation: 181 | ret = Model(inputs=self.model.input, 182 | outputs=self.model.get_layer('center_act').output) 183 | else: 184 | ret = Model(inputs=self.model.input, 185 | outputs=self.model.get_layer('center').output) 186 | return ret 187 | 188 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 189 | 190 | assert mode in ('denoise', 'latent', 'full'), 'Unknown mode' 191 | 192 | adata = adata.copy() if copy else adata 193 | 194 | if mode in ('latent', 'full'): 195 | print('dca: Calculating low dimensional representations...') 196 | 197 | adata.obsm['X_dca'] = self.encoder.predict({'count': adata.X, 198 | 'size_factors': adata.obs.size_factors}) 199 | if mode in ('denoise', 'full'): 200 | print('dca: Calculating reconstructions...') 201 | 202 | adata.X = self.model.predict({'count': adata.X, 203 | 'size_factors': adata.obs.size_factors}) 204 | 205 | #adata.uns['dca_loss'] = self.model.test_on_batch({'count': adata.X, 206 | # 'size_factors': adata.obs.size_factors}, 207 | # adata.raw.X) 208 | if mode == 'latent': 209 | adata.X = adata.raw.X.copy() # restore the original counts kept in adata.raw 210 | 211 | return adata if copy else None 212 | 213 | def write(self, adata, file_path, mode='denoise', colnames=None): 214 | 215 | colnames = adata.var_names.values if colnames is None else colnames 216 | rownames = adata.obs_names.values 217 | 218 | print('dca: Saving output(s)...') 219 | os.makedirs(file_path, exist_ok=True) 220 | 221 | if mode in ('denoise', 'full'): 222 | print('dca: Saving denoised expression...') 223 | write_text_matrix(adata.X, 224 | os.path.join(file_path, 'mean.tsv'), 225 | rownames=rownames, colnames=colnames, transpose=True) 226 | 227 | 
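# in 'full' mode, both the denoised matrix above and the latent matrix below are written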
if mode in ('latent', 'full'): 228 | print('dca: Saving latent representations...') 229 | write_text_matrix(adata.obsm['X_dca'], 230 | os.path.join(file_path, 'latent.tsv'), 231 | rownames=rownames, transpose=False) 232 | 233 | class PoissonAutoencoder(Autoencoder): 234 | 235 | def build_output(self): 236 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 237 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 238 | name='mean')(self.decoder_output) 239 | output = ColwiseMultLayer([mean, self.sf_layer]) 240 | self.loss = poisson_loss 241 | 242 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 243 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 244 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 245 | 246 | self.encoder = self.get_encoder() 247 | 248 | 249 | class NBConstantDispAutoencoder(Autoencoder): 250 | 251 | def build_output(self): 252 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 253 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 254 | name='mean')(self.decoder_output) 255 | 256 | # Plug in dispersion parameters via fake dispersion layer 257 | disp = ConstantDispersionLayer(name='dispersion') 258 | mean = disp(mean) 259 | 260 | output = ColwiseMultLayer([mean, self.sf_layer]) 261 | 262 | nb = NB(disp.theta_exp) 263 | self.loss = nb.loss 264 | self.extra_models['dispersion'] = lambda :K.function([], [nb.theta])([])[0].squeeze() 265 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 266 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 267 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 268 | 269 | self.encoder = self.get_encoder() 270 | 271 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 272 | colnames = adata.var_names.values 273 | rownames = adata.obs_names.values 274 | res = super().predict(adata, mode, return_info, copy) 275 | adata = res if copy else adata 276 | 277 | if return_info: 278 | adata.var['X_dca_dispersion'] = self.extra_models['dispersion']() 279 | 280 | return adata if copy else None 281 | 282 | def write(self, adata, file_path, mode='denoise', colnames=None): 283 | colnames = adata.var_names.values if colnames is None else colnames 284 | rownames = adata.obs_names.values 285 | 286 | super().write(adata, file_path, mode, colnames=colnames) 287 | if 'X_dca_dispersion' in adata.var_keys(): 288 | write_text_matrix(adata.var['X_dca_dispersion'].reshape(1, -1), 289 | os.path.join(file_path, 'dispersion.tsv'), 290 | colnames=colnames, transpose=True) 291 | 292 | 293 | class NBAutoencoder(Autoencoder): 294 | 295 | def build_output(self): 296 | disp = Dense(self.output_size, activation=DispAct, 297 | kernel_initializer=self.init, 298 | kernel_regularizer=l1_l2(self.l1_coef, 299 | self.l2_coef), 300 | name='dispersion')(self.decoder_output) 301 | 302 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 303 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 304 | name='mean')(self.decoder_output) 305 | output = ColwiseMultLayer([mean, self.sf_layer]) 306 | output = SliceLayer(0, name='slice')([output, disp]) 307 | 308 | nb = NB(theta=disp, debug=self.debug) 309 | self.loss = nb.loss 310 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 311 | self.extra_models['mean_norm'] = 
Model(inputs=self.input_layer, outputs=mean) 312 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 313 | 314 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 315 | 316 | self.encoder = self.get_encoder() 317 | 318 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 319 | colnames = adata.var_names.values 320 | rownames = adata.obs_names.values 321 | 322 | res = super().predict(adata, mode, return_info, copy) 323 | adata = res if copy else adata 324 | 325 | if return_info: 326 | adata.obsm['X_dca_dispersion'] = self.extra_models['dispersion'].predict(adata.X) 327 | 328 | return adata if copy else None 329 | 330 | def write(self, adata, file_path, mode='denoise', colnames=None): 331 | colnames = adata.var_names.values if colnames is None else colnames 332 | rownames = adata.obs_names.values 333 | 334 | super().write(adata, file_path, mode, colnames=colnames) 335 | 336 | if 'X_dca_dispersion' in adata.obsm_keys(): 337 | write_text_matrix(adata.obsm['X_dca_dispersion'], 338 | os.path.join(file_path, 'dispersion.tsv'), 339 | colnames=colnames, transpose=True) 340 | 341 | class NBSharedAutoencoder(NBAutoencoder): 342 | 343 | def build_output(self): 344 | disp = Dense(1, activation=DispAct, 345 | kernel_initializer=self.init, 346 | kernel_regularizer=l1_l2(self.l1_coef, 347 | self.l2_coef), 348 | name='dispersion')(self.decoder_output) 349 | 350 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 351 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 352 | name='mean')(self.decoder_output) 353 | output = ColwiseMultLayer([mean, self.sf_layer]) 354 | output = SliceLayer(0, name='slice')([output, disp]) 355 | 356 | nb = NB(theta=disp, debug=self.debug) 357 | self.loss = nb.loss 358 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 359 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 360 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 361 | 362 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 363 | self.encoder = self.get_encoder() 364 | 365 | 366 | class ZINBAutoencoder(Autoencoder): 367 | 368 | def build_output(self): 369 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 370 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 371 | name='pi')(self.decoder_output) 372 | 373 | disp = Dense(self.output_size, activation=DispAct, 374 | kernel_initializer=self.init, 375 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 376 | name='dispersion')(self.decoder_output) 377 | 378 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 379 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 380 | name='mean')(self.decoder_output) 381 | output = ColwiseMultLayer([mean, self.sf_layer]) 382 | output = SliceLayer(0, name='slice')([output, disp, pi]) 383 | 384 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 385 | self.loss = zinb.loss 386 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 387 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 388 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 389 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 390 | 391 | self.model = Model(inputs=[self.input_layer, self.sf_layer], 
outputs=output) 392 | 393 | self.encoder = self.get_encoder() 394 | 395 | def predict(self, adata, mode='denoise', return_info=False, copy=False, colnames=None): 396 | 397 | adata = adata.copy() if copy else adata 398 | 399 | if return_info: 400 | adata.obsm['X_dca_dispersion'] = self.extra_models['dispersion'].predict(adata.X) 401 | adata.obsm['X_dca_dropout'] = self.extra_models['pi'].predict(adata.X) 402 | 403 | # warning! this may overwrite adata.X 404 | super().predict(adata, mode, return_info, copy=False) 405 | return adata if copy else None 406 | 407 | def write(self, adata, file_path, mode='denoise', colnames=None): 408 | colnames = adata.var_names.values if colnames is None else colnames 409 | rownames = adata.obs_names.values 410 | 411 | super().write(adata, file_path, mode, colnames=colnames) 412 | 413 | if 'X_dca_dispersion' in adata.obsm_keys(): 414 | write_text_matrix(adata.obsm['X_dca_dispersion'], 415 | os.path.join(file_path, 'dispersion.tsv'), 416 | colnames=colnames, transpose=True) 417 | 418 | if 'X_dca_dropout' in adata.obsm_keys(): 419 | write_text_matrix(adata.obsm['X_dca_dropout'], 420 | os.path.join(file_path, 'dropout.tsv'), 421 | colnames=colnames, transpose=True) 422 | 423 | 424 | class ZINBAutoencoderElemPi(ZINBAutoencoder): 425 | def __init__(self, sharedpi=False, **kwds): 426 | super().__init__(**kwds) 427 | self.sharedpi = sharedpi 428 | 429 | def build_output(self): 430 | disp = Dense(self.output_size, activation=DispAct, 431 | kernel_initializer=self.init, 432 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 433 | name='dispersion')(self.decoder_output) 434 | 435 | mean_no_act = Dense(self.output_size, activation=None, kernel_initializer=self.init, 436 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 437 | name='mean_no_act')(self.decoder_output) 438 | 439 | minus = Lambda(lambda x: -x) 440 | mean_no_act = minus(mean_no_act) 441 | pidim = self.output_size if not self.sharedpi else 1 442 | 443 | pi = ElementwiseDense(pidim, activation='sigmoid', kernel_initializer=self.init, 444 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 445 | name='pi')(mean_no_act) 446 | 447 | mean = Activation(MeanAct, name='mean')(mean_no_act) 448 | 449 | output = ColwiseMultLayer([mean, self.sf_layer]) 450 | output = SliceLayer(0, name='slice')([output, disp, pi]) 451 | 452 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 453 | self.loss = zinb.loss 454 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 455 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 456 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 457 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 458 | 459 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 460 | 461 | self.encoder = self.get_encoder() 462 | 463 | 464 | 465 | class ZINBSharedAutoencoder(ZINBAutoencoder): 466 | 467 | def build_output(self): 468 | pi = Dense(1, activation='sigmoid', kernel_initializer=self.init, 469 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 470 | name='pi')(self.decoder_output) 471 | 472 | disp = Dense(1, activation=DispAct, 473 | kernel_initializer=self.init, 474 | kernel_regularizer=l1_l2(self.l1_coef, 475 | self.l2_coef), 476 | name='dispersion')(self.decoder_output) 477 | 478 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 479 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 480 | 
name='mean')(self.decoder_output) 481 | output = ColwiseMultLayer([mean, self.sf_layer]) 482 | output = SliceLayer(0, name='slice')([output, disp, pi]) 483 | 484 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 485 | self.loss = zinb.loss 486 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 487 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 488 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 489 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 490 | 491 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 492 | 493 | self.encoder = self.get_encoder() 494 | 495 | 496 | class ZINBConstantDispAutoencoder(Autoencoder): 497 | 498 | def build_output(self): 499 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 500 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 501 | name='pi')(self.decoder_output) 502 | 503 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 504 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 505 | name='mean')(self.decoder_output) 506 | 507 | # NB dispersion layer 508 | disp = ConstantDispersionLayer(name='dispersion') 509 | mean = disp(mean) 510 | 511 | output = ColwiseMultLayer([mean, self.sf_layer]) 512 | 513 | zinb = ZINB(pi, theta=disp.theta_exp, ridge_lambda=self.ridge, debug=self.debug) 514 | self.loss = zinb.loss 515 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 516 | self.extra_models['dispersion'] = lambda :K.function([], [zinb.theta])([])[0].squeeze() 517 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 518 | self.extra_models['decoded'] = Model(inputs=self.input_layer, outputs=self.decoder_output) 519 | 520 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 521 | 522 | self.encoder = self.get_encoder() 523 | 524 | def predict(self, adata, mode='denoise', return_info=False, copy=False): 525 | colnames = adata.var_names.values 526 | rownames = adata.obs_names.values 527 | adata = adata.copy() if copy else adata 528 | 529 | if return_info: 530 | adata.var['X_dca_dispersion'] = self.extra_models['dispersion']() 531 | adata.obsm['X_dca_dropout'] = self.extra_models['pi'].predict(adata.X) 532 | 533 | super().predict(adata, mode, return_info, copy=False) 534 | return adata if copy else None 535 | 536 | def write(self, adata, file_path, mode='denoise', colnames=None): 537 | colnames = adata.var_names.values if colnames is None else colnames 538 | rownames = adata.obs_names.values 539 | 540 | super().write(adata, file_path, mode) 541 | 542 | if 'X_dca_dispersion' in adata.var_keys(): 543 | write_text_matrix(adata.var['X_dca_dispersion'].values.reshape(1, -1), 544 | os.path.join(file_path, 'dispersion.tsv'), 545 | colnames=colnames, transpose=True) 546 | 547 | if 'X_dca_dropout' in adata.obsm_keys(): 548 | write_text_matrix(adata.obsm['X_dca_dropout'], 549 | os.path.join(file_path, 'dropout.tsv'), 550 | colnames=colnames, transpose=True) 551 | 552 | 553 | class ZINBForkAutoencoder(ZINBAutoencoder): 554 | 555 | def build(self): 556 | 557 | self.input_layer = Input(shape=(self.input_size,), name='count') 558 | self.sf_layer = Input(shape=(1,), name='size_factors') 559 | last_hidden = self.input_layer 560 | 561 | if self.input_dropout > 0.0: 562 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 563 | 564 | for 
i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 565 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 566 | if i == center_idx: 567 | layer_name = 'center' 568 | stage = 'center' # let downstream know where we are 569 | elif i < center_idx: 570 | layer_name = 'enc%s' % i 571 | stage = 'encoder' 572 | else: 573 | layer_name = 'dec%s' % (i-center_idx) 574 | stage = 'decoder' 575 | 576 | # use encoder-specific l1/l2 reg coefs if given 577 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 578 | l1 = self.l1_enc_coef 579 | else: 580 | l1 = self.l1_coef 581 | 582 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 583 | l2 = self.l2_enc_coef 584 | else: 585 | l2 = self.l2_coef 586 | 587 | if i > center_idx: 588 | self.last_hidden_mean = Dense(hid_size, activation=None, kernel_initializer=self.init, 589 | kernel_regularizer=l1_l2(l1, l2), 590 | name='%s_last_mean'%layer_name)(last_hidden) 591 | self.last_hidden_disp = Dense(hid_size, activation=None, kernel_initializer=self.init, 592 | kernel_regularizer=l1_l2(l1, l2), 593 | name='%s_last_disp'%layer_name)(last_hidden) 594 | self.last_hidden_pi = Dense(hid_size, activation=None, kernel_initializer=self.init, 595 | kernel_regularizer=l1_l2(l1, l2), 596 | name='%s_last_pi'%layer_name)(last_hidden) 597 | 598 | if self.batchnorm: 599 | self.last_hidden_mean = BatchNormalization(center=True, scale=False)(self.last_hidden_mean) 600 | self.last_hidden_disp = BatchNormalization(center=True, scale=False)(self.last_hidden_disp) 601 | self.last_hidden_pi = BatchNormalization(center=True, scale=False)(self.last_hidden_pi) 602 | 603 | # Use separate act. layers to give user the option to get pre-activations 604 | # of layers when requested 605 | self.last_hidden_mean = Activation(self.activation, name='%s_mean_act'%layer_name)(self.last_hidden_mean) 606 | self.last_hidden_disp = Activation(self.activation, name='%s_disp_act'%layer_name)(self.last_hidden_disp) 607 | self.last_hidden_pi = Activation(self.activation, name='%s_pi_act'%layer_name)(self.last_hidden_pi) 608 | 609 | if hid_drop > 0.0: 610 | self.last_hidden_mean = Dropout(hid_drop, name='%s_mean_drop'%layer_name)(self.last_hidden_mean) 611 | self.last_hidden_disp = Dropout(hid_drop, name='%s_disp_drop'%layer_name)(self.last_hidden_disp) 612 | self.last_hidden_pi = Dropout(hid_drop, name='%s_pi_drop'%layer_name)(self.last_hidden_pi) 613 | 614 | else: 615 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 616 | kernel_regularizer=l1_l2(l1, l2), 617 | name=layer_name)(last_hidden) 618 | 619 | if self.batchnorm: 620 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 621 | 622 | # Use separate act. 
layers to give user the option to get pre-activations 623 | # of layers when requested 624 | if self.activation in advanced_activations: 625 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 626 | else: 627 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 628 | 629 | if hid_drop > 0.0: 630 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 631 | 632 | self.build_output() 633 | 634 | 635 | def build_output(self): 636 | pi = Dense(self.output_size, activation='sigmoid', kernel_initializer=self.init, 637 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 638 | name='pi')(self.last_hidden_pi) 639 | 640 | disp = Dense(self.output_size, activation=DispAct, 641 | kernel_initializer=self.init, 642 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 643 | name='dispersion')(self.last_hidden_disp) 644 | 645 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 646 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 647 | name='mean')(self.last_hidden_mean) 648 | 649 | output = ColwiseMultLayer([mean, self.sf_layer]) 650 | output = SliceLayer(0, name='slice')([output, disp, pi]) 651 | 652 | zinb = ZINB(pi, theta=disp, ridge_lambda=self.ridge, debug=self.debug) 653 | self.loss = zinb.loss 654 | self.extra_models['pi'] = Model(inputs=self.input_layer, outputs=pi) 655 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 656 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 657 | 658 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 659 | 660 | self.encoder = self.get_encoder() 661 | 662 | 663 | class NBForkAutoencoder(NBAutoencoder): 664 | 665 | def build(self): 666 | 667 | self.input_layer = Input(shape=(self.input_size,), name='count') 668 | self.sf_layer = Input(shape=(1,), name='size_factors') 669 | last_hidden = self.input_layer 670 | 671 | if self.input_dropout > 0.0: 672 | last_hidden = Dropout(self.input_dropout, name='input_dropout')(last_hidden) 673 | 674 | for i, (hid_size, hid_drop) in enumerate(zip(self.hidden_size, self.hidden_dropout)): 675 | center_idx = int(np.floor(len(self.hidden_size) / 2.0)) 676 | if i == center_idx: 677 | layer_name = 'center' 678 | stage = 'center' # let downstream know where we are 679 | elif i < center_idx: 680 | layer_name = 'enc%s' % i 681 | stage = 'encoder' 682 | else: 683 | layer_name = 'dec%s' % (i-center_idx) 684 | stage = 'decoder' 685 | 686 | # use encoder-specific l1/l2 reg coefs if given 687 | if self.l1_enc_coef != 0. and stage in ('center', 'encoder'): 688 | l1 = self.l1_enc_coef 689 | else: 690 | l1 = self.l1_coef 691 | 692 | if self.l2_enc_coef != 0. and stage in ('center', 'encoder'): 693 | l2 = self.l2_enc_coef 694 | else: 695 | l2 = self.l2_coef 696 | 697 | if i > center_idx: 698 | self.last_hidden_mean = Dense(hid_size, activation=None, kernel_initializer=self.init, 699 | kernel_regularizer=l1_l2(l1, l2), 700 | name='%s_last_mean'%layer_name)(last_hidden) 701 | self.last_hidden_disp = Dense(hid_size, activation=None, kernel_initializer=self.init, 702 | kernel_regularizer=l1_l2(l1, l2), 703 | name='%s_last_disp'%layer_name)(last_hidden) 704 | 705 | if self.batchnorm: 706 | self.last_hidden_mean = BatchNormalization(center=True, scale=False)(self.last_hidden_mean) 707 | self.last_hidden_disp = BatchNormalization(center=True, scale=False)(self.last_hidden_disp) 708 | 709 | # Use separate act. 
layers to give user the option to get pre-activations 710 | # of layers when requested 711 | self.last_hidden_mean = Activation(self.activation, name='%s_mean_act'%layer_name)(self.last_hidden_mean) 712 | self.last_hidden_disp = Activation(self.activation, name='%s_disp_act'%layer_name)(self.last_hidden_disp) 713 | 714 | if hid_drop > 0.0: 715 | self.last_hidden_mean = Dropout(hid_drop, name='%s_mean_drop'%layer_name)(self.last_hidden_mean) 716 | self.last_hidden_disp = Dropout(hid_drop, name='%s_disp_drop'%layer_name)(self.last_hidden_disp) 717 | 718 | else: 719 | last_hidden = Dense(hid_size, activation=None, kernel_initializer=self.init, 720 | kernel_regularizer=l1_l2(l1, l2), 721 | name=layer_name)(last_hidden) 722 | 723 | if self.batchnorm: 724 | last_hidden = BatchNormalization(center=True, scale=False)(last_hidden) 725 | 726 | # Use separate act. layers to give user the option to get pre-activations 727 | # of layers when requested 728 | if self.activation in advanced_activations: 729 | last_hidden = keras.layers.__dict__[self.activation](name='%s_act'%layer_name)(last_hidden) 730 | else: 731 | last_hidden = Activation(self.activation, name='%s_act'%layer_name)(last_hidden) 732 | 733 | if hid_drop > 0.0: 734 | last_hidden = Dropout(hid_drop, name='%s_drop'%layer_name)(last_hidden) 735 | 736 | self.build_output() 737 | 738 | 739 | def build_output(self): 740 | 741 | disp = Dense(self.output_size, activation=DispAct, 742 | kernel_initializer=self.init, 743 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 744 | name='dispersion')(self.last_hidden_disp) 745 | 746 | mean = Dense(self.output_size, activation=MeanAct, kernel_initializer=self.init, 747 | kernel_regularizer=l1_l2(self.l1_coef, self.l2_coef), 748 | name='mean')(self.last_hidden_mean) 749 | 750 | output = ColwiseMultLayer([mean, self.sf_layer]) 751 | output = SliceLayer(0, name='slice')([output, disp]) 752 | 753 | nb = NB(theta=disp, debug=self.debug) 754 | self.loss = nb.loss 755 | self.extra_models['dispersion'] = Model(inputs=self.input_layer, outputs=disp) 756 | self.extra_models['mean_norm'] = Model(inputs=self.input_layer, outputs=mean) 757 | 758 | self.model = Model(inputs=[self.input_layer, self.sf_layer], outputs=output) 759 | 760 | self.encoder = self.get_encoder() 761 | 762 | 763 | AE_types = {'normal': Autoencoder, 'poisson': PoissonAutoencoder, 764 | 'nb': NBConstantDispAutoencoder, 'nb-conddisp': NBAutoencoder, 765 | 'nb-shared': NBSharedAutoencoder, 'nb-fork': NBForkAutoencoder, 766 | 'zinb': ZINBConstantDispAutoencoder, 'zinb-conddisp': ZINBAutoencoder, 767 | 'zinb-shared': ZINBSharedAutoencoder, 'zinb-fork': ZINBForkAutoencoder, 768 | 'zinb-elempi': ZINBAutoencoderElemPi} 769 | 770 | -------------------------------------------------------------------------------- /dca/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scanpy as sc 3 | 4 | from .api import dca 5 | 6 | def test_api(): 7 | adata = sc.datasets.paul15() 8 | epochs = 1 9 | 10 | # simple tests for denoise 11 | ret = dca(adata, mode='denoise', copy=True, epochs=epochs, verbose=True) 12 | assert not np.allclose(ret.X[:10], adata.X[:10]) 13 | 14 | ret, _ = dca(adata, mode='denoise', ae_type='nb-conddisp', copy=True, epochs=epochs, 15 | return_model=True, return_info=True) 16 | assert not np.allclose(ret.X[:10], adata.X[:10]) 17 | assert 'X_dca_dispersion' in ret.obsm_keys() 18 | assert _ is not None 19 | 20 | ret = dca(adata, mode='denoise', ae_type='nb', copy=True, 
epochs=epochs, 21 | return_model=False, return_info=True) 22 | assert not np.allclose(ret.X[:10], adata.X[:10]) 23 | assert 'X_dca_dispersion' in ret.var_keys() 24 | 25 | ret = dca(adata, mode='denoise', ae_type='zinb', copy=True, epochs=epochs, 26 | return_model=False, return_info=True) 27 | assert not np.allclose(ret.X[:10], adata.X[:10]) 28 | assert 'X_dca_dropout' in ret.obsm_keys() 29 | assert 'dca_loss_history' in ret.uns_keys() 30 | 31 | ret = dca(adata, mode='denoise', ae_type='zinb-elempi', copy=True, epochs=epochs, 32 | return_model=False, return_info=True) 33 | assert not np.allclose(ret.X[:10], adata.X[:10]) 34 | assert 'X_dca_dropout' in ret.obsm_keys() 35 | assert 'dca_loss_history' in ret.uns_keys() 36 | 37 | ret = dca(adata, mode='denoise', ae_type='zinb-elempi', copy=True, epochs=epochs, 38 | return_model=False, return_info=True, network_kwds={'sharedpi': True}) 39 | assert not np.allclose(ret.X[:10], adata.X[:10]) 40 | assert 'X_dca_dropout' in ret.obsm_keys() 41 | assert 'dca_loss_history' in ret.uns_keys() 42 | 43 | # simple tests for latent 44 | hid_size = (10, 2, 10) 45 | ret = dca(adata, mode='latent', hidden_size=hid_size, copy=True, epochs=epochs) 46 | assert 'X_dca' in ret.obsm_keys() 47 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 48 | 49 | ret = dca(adata, mode='latent', ae_type='nb-conddisp', hidden_size=hid_size, copy=True, epochs=epochs) 50 | assert 'X_dca' in ret.obsm_keys() 51 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 52 | 53 | ret = dca(adata, mode='latent', ae_type='nb', hidden_size=hid_size, copy=True, epochs=epochs, return_info=True) 54 | assert 'X_dca' in ret.obsm_keys() 55 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 56 | 57 | ret = dca(adata, mode='latent', ae_type='zinb', hidden_size=hid_size, copy=True, epochs=epochs) 58 | assert 'X_dca' in ret.obsm_keys() 59 | assert ret.obsm['X_dca'].shape[1] == hid_size[1] 60 | -------------------------------------------------------------------------------- /dca/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Goekcen Eraslan 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import os 21 | import random 22 | 23 | from . 
import io 24 | from .network import AE_types 25 | from .hyper import hyper 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | import keras.optimizers as opt 30 | from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 31 | from keras import backend as K 32 | from keras.preprocessing.image import Iterator 33 | 34 | 35 | def train(adata, network, output_dir=None, optimizer='RMSprop', learning_rate=None, 36 | epochs=300, reduce_lr=10, output_subset=None, use_raw_as_output=True, 37 | early_stop=15, batch_size=32, clip_grad=5., save_weights=False, 38 | validation_split=0.1, tensorboard=False, verbose=True, threads=None, 39 | **kwds): 40 | 41 | tf.compat.v1.keras.backend.set_session( 42 | tf.compat.v1.Session( 43 | config=tf.compat.v1.ConfigProto( 44 | intra_op_parallelism_threads=threads, 45 | inter_op_parallelism_threads=threads, 46 | ) 47 | ) 48 | ) 49 | model = network.model 50 | loss = network.loss 51 | if output_dir is not None: 52 | os.makedirs(output_dir, exist_ok=True) 53 | 54 | if learning_rate is None: 55 | optimizer = opt.__dict__[optimizer](clipvalue=clip_grad) 56 | else: 57 | optimizer = opt.__dict__[optimizer](lr=learning_rate, clipvalue=clip_grad) 58 | 59 | model.compile(loss=loss, optimizer=optimizer) 60 | 61 | # Callbacks 62 | callbacks = [] 63 | 64 | if save_weights and output_dir is not None: 65 | checkpointer = ModelCheckpoint(filepath="%s/weights.hdf5" % output_dir, 66 | verbose=verbose, 67 | save_weights_only=True, 68 | save_best_only=True) 69 | callbacks.append(checkpointer) 70 | if reduce_lr: 71 | lr_cb = ReduceLROnPlateau(monitor='val_loss', patience=reduce_lr, verbose=verbose) 72 | callbacks.append(lr_cb) 73 | if early_stop: 74 | es_cb = EarlyStopping(monitor='val_loss', patience=early_stop, verbose=verbose) 75 | callbacks.append(es_cb) 76 | if tensorboard: 77 | tb_log_dir = os.path.join(output_dir, 'tb') 78 | tb_cb = TensorBoard(log_dir=tb_log_dir, histogram_freq=1, write_grads=True) 79 | callbacks.append(tb_cb) 80 | 81 | if verbose: model.summary() 82 | 83 | inputs = {'count': adata.X, 'size_factors': adata.obs.size_factors} 84 | 85 | if output_subset: 86 | gene_idx = [np.where(adata.raw.var_names == x)[0][0] for x in output_subset] 87 | output = adata.raw.X[:, gene_idx] if use_raw_as_output else adata.X[:, gene_idx] 88 | else: 89 | output = adata.raw.X if use_raw_as_output else adata.X 90 | 91 | loss = model.fit(inputs, output, 92 | epochs=epochs, 93 | batch_size=batch_size, 94 | shuffle=True, 95 | callbacks=callbacks, 96 | validation_split=validation_split, 97 | verbose=verbose, 98 | **kwds) 99 | 100 | return loss 101 | 102 | 103 | def train_with_args(args): 104 | 105 | tf.compat.v1.keras.backend.set_session( 106 | tf.compat.v1.Session( 107 | config=tf.compat.v1.ConfigProto( 108 | intra_op_parallelism_threads=args.threads, 109 | inter_op_parallelism_threads=args.threads, 110 | ) 111 | ) 112 | ) 113 | # set seed for reproducibility 114 | random.seed(42) 115 | np.random.seed(42) 116 | tf.random.set_seed(42) 117 | os.environ['PYTHONHASHSEED'] = '0' 118 | 119 | # do hyperpar optimization and exit 120 | if args.hyper: 121 | hyper(args) 122 | return 123 | 124 | adata = io.read_dataset(args.input, 125 | transpose=(not args.transpose), # assume gene x cell by default 126 | check_counts=args.checkcounts, 127 | test_split=args.testsplit) 128 | 129 | adata = io.normalize(adata, 130 | size_factors=args.sizefactors, 131 | logtrans_input=args.loginput, 132 | normalize_input=args.norminput) 133 | 134 | if args.denoisesubset: 135 | 
genelist = list(set(io.read_genelist(args.denoisesubset))) 136 | assert len(set(genelist) - set(adata.var_names.values)) == 0, \ 137 | 'Gene list does not overlap with genes from the dataset' 138 | output_size = len(genelist) 139 | else: 140 | genelist = None 141 | output_size = adata.n_vars 142 | 143 | hidden_size = [int(x) for x in args.hiddensize.split(',')] 144 | hidden_dropout = [float(x) for x in args.dropoutrate.split(',')] 145 | if len(hidden_dropout) == 1: 146 | hidden_dropout = hidden_dropout[0] 147 | 148 | assert args.type in AE_types, 'loss type not supported' 149 | input_size = adata.n_vars 150 | 151 | from tensorflow.python.framework.ops import disable_eager_execution 152 | disable_eager_execution() 153 | 154 | net = AE_types[args.type](input_size=input_size, 155 | output_size=output_size, 156 | hidden_size=hidden_size, 157 | l2_coef=args.l2, 158 | l1_coef=args.l1, 159 | l2_enc_coef=args.l2enc, 160 | l1_enc_coef=args.l1enc, 161 | ridge=args.ridge, 162 | hidden_dropout=hidden_dropout, 163 | input_dropout=args.inputdropout, 164 | batchnorm=args.batchnorm, 165 | activation=args.activation, 166 | init=args.init, 167 | debug=args.debug, 168 | file_path=args.outputdir) 169 | 170 | net.save() 171 | net.build() 172 | 173 | losses = train(adata[adata.obs.dca_split == 'train'], net, 174 | output_dir=args.outputdir, 175 | learning_rate=args.learningrate, 176 | epochs=args.epochs, batch_size=args.batchsize, 177 | early_stop=args.earlystop, 178 | reduce_lr=args.reducelr, 179 | output_subset=genelist, 180 | optimizer=args.optimizer, 181 | clip_grad=args.gradclip, 182 | save_weights=args.saveweights, 183 | tensorboard=args.tensorboard) 184 | 185 | if genelist: 186 | predict_columns = adata.var_names[[np.where(adata.var_names==x)[0][0] for x in genelist]] 187 | else: 188 | predict_columns = adata.var_names 189 | 190 | net.predict(adata, mode='full', return_info=True) 191 | net.write(adata, args.outputdir, mode='full', colnames=predict_columns) 192 | -------------------------------------------------------------------------------- /dca/utils.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import scipy as sp 7 | import tensorflow as tf 8 | from tensorflow.contrib.opt import ScipyOptimizerInterface 9 | 10 | 11 | nb_zero = lambda t, mu: (t/(mu+t))**t 12 | zinb_zero = lambda t, mu, p: p + ((1.-p)*((t/(mu+t))**t)) 13 | sigmoid = lambda x: 1. / (1.+np.exp(-x)) 14 | logit = lambda x: np.log(x + 1e-7) - np.log(1. - x + 1e-7) 15 | tf_logit = lambda x: tf.cast(tf.log(x + 1e-7) - tf.log(1. - x + 1e-7), 'float32') 16 | log_loss = lambda pred, label: np.sum(-(label*np.log(pred+1e-7)) - ((1.-label)*np.log(1.-pred+1e-7))) 17 | 18 | 19 | def _lrt(ll_full, ll_reduced, df_full, df_reduced): 20 | # Compute the difference in degrees of freedom. 21 | delta_df = df_full - df_reduced 22 | # Compute the deviance test statistic. 23 | delta_dev = 2 * (ll_full - ll_reduced) 24 | # Compute the p-values based on the deviance and its expectation under the chi-square distribution. 25 | pvals = 1.
- sp.stats.chi2(delta_df).cdf(delta_dev) 26 | 27 | return pvals 28 | 29 | 30 | def _fitquad(x, y): 31 | coef, res, _, _ = np.linalg.lstsq((x**2)[:, np.newaxis] , y-x, rcond=None) 32 | ss_exp = res[0] 33 | ss_tot = (np.var(y-x)*len(x)) 34 | r2 = 1 - (ss_exp / ss_tot) 35 | #print('Coefs:', coef) 36 | return np.array([coef[0], 1, 0]), r2 37 | 38 | 39 | def _tf_zinb_zero(mu, t=None): 40 | a, b = tf.Variable([-1.0], dtype='float32'), tf.Variable([0.0], dtype='float32') 41 | 42 | if t is None: 43 | t_log = tf.Variable([-10.], dtype='float32') 44 | t = tf.exp(t_log) 45 | 46 | p = tf.sigmoid((tf.log(mu+1e-7)*a) + b) 47 | pred = p + ((1.-p)*((t/(mu+t))**t)) 48 | pred = tf.cast(pred, 'float32') 49 | return pred, a, b, t 50 | 51 | 52 | def _optimize_zinb(mu, dropout, theta=None): 53 | pred, a, b, t = _tf_zinb_zero(mu, theta) 54 | #loss = tf.reduce_mean(tf.abs(tf_logit(pred) - tf_logit(dropout))) 55 | loss = tf.losses.log_loss(labels=dropout.astype('float32'), 56 | predictions=pred) 57 | 58 | optimizer = ScipyOptimizerInterface(loss, options={'maxiter': 100}) 59 | 60 | with tf.Session() as sess: 61 | sess.run(tf.global_variables_initializer()) 62 | optimizer.minimize(sess) 63 | ret_a = sess.run(a) 64 | ret_b = sess.run(b) 65 | if theta is None: 66 | ret_t = sess.run(t) 67 | else: 68 | ret_t = t 69 | 70 | return ret_a, ret_b, ret_t 71 | 72 | 73 | def plot_mean_dropout(ad, title, ax, opt_zinb_theta=False, legend_out=False): 74 | expr = ad.X 75 | mu = expr.mean(0) 76 | do = np.mean(expr == 0, 0) 77 | v = expr.var(axis=0) 78 | 79 | coefs, r2 = _fitquad(mu, v) 80 | theta = 1.0/coefs[0] 81 | 82 | # zinb fit 83 | coefs = _optimize_zinb(mu, do, theta=theta if not opt_zinb_theta else None) 84 | print(coefs) 85 | 86 | #pois_pred = np.exp(-mu) 87 | nb_pred = nb_zero(theta, mu) 88 | zinb_pred = zinb_zero(coefs[2], 89 | mu, 90 | sigmoid((np.log(mu+1e-7)*coefs[0])+coefs[1])) 91 | 92 | # calculate log loss for all distr. 93 | #pois_ll = log_loss(pois_pred, do) 94 | nb_ll = log_loss(nb_pred, do) 95 | zinb_ll = log_loss(zinb_pred, do) 96 | 97 | ax.plot(mu, do, 'o', c='black', markersize=1) 98 | ax.set(xscale="log") 99 | 100 | #sns.lineplot(mu, pois_pred, ax=ax, color='blue') 101 | sns.lineplot(mu, nb_pred, ax=ax, color='red') 102 | sns.lineplot(mu, zinb_pred, ax=ax, color='green') 103 | 104 | ax.set_title(title) 105 | ax.set_ylabel('Empirical dropout rate') 106 | ax.set_xlabel(r'Mean expression') 107 | 108 | 109 | leg_loc = 'best' if not legend_out else 'upper left' 110 | leg_bbox = None if not legend_out else (1.02, 1.) 
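# The red (NB) and green (ZINB) curves plotted above compare each model's
# implied zero fraction at a given mean, using the closed forms defined at
# the top of this module:
#   NB:   P(zero) = (theta / (mu + theta)) ** theta                  (nb_zero)
#   ZINB: P(zero) = pi + (1 - pi) * (theta / (mu + theta)) ** theta  (zinb_zero)
# with pi = sigmoid(a * log(mu) + b), where (a, b) are fitted by
# _optimize_zinb. The legend reports each fit's dispersion (as 1/theta) and
# its log loss against the empirical per-gene dropout rates.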
111 | ax.legend(['Genes', 112 | #r'Poisson $L=%.4f$' % pois_ll, 113 | r'NB($\theta=%.2f)\ L=%.4f$' % ((1./theta), nb_ll), 114 | r'ZINB($\theta=%.2f,\pi=\sigma(%.2f\mu%+.2f)) \ L=%.4f$' % (1.0/coefs[2], coefs[0], coefs[1], zinb_ll)], 115 | loc=leg_loc, bbox_to_anchor=leg_bbox) 116 | zinb_pval = _lrt(-zinb_ll, -nb_ll, 3, 1) 117 | print('p-value: %e' % zinb_pval) 118 | 119 | 120 | def plot_mean_var(ad, title, ax): 121 | ad = ad.copy() 122 | 123 | sc.pp.filter_cells(ad, min_counts=1) 124 | sc.pp.filter_genes(ad, min_counts=1) 125 | 126 | m = ad.X.mean(axis=0) 127 | v = ad.X.var(axis=0) 128 | 129 | coefs, r2 = _fitquad(m, v) 130 | 131 | ax.set(xscale="log", yscale="log") 132 | ax.plot(m, v, 'o', c='black', markersize=1) 133 | 134 | poly = np.poly1d(coefs) 135 | sns.lineplot(m, poly(m), ax=ax, color='red') 136 | 137 | ax.set_title(title) 138 | ax.set_ylabel('Variance') 139 | ax.set_xlabel(r'$\mu$') 140 | 141 | sns.lineplot(m, m, ax=ax, color='blue') 142 | ax.legend(['Genes', r'NB ($\theta=%.2f)\ r^2=%.3f$' % (coefs[0], r2), 'Poisson']) 143 | 144 | return coefs[0] 145 | 146 | 147 | def plot_zeroinf(ad, title, mean_var_plot=False, opt_theta=True): 148 | if mean_var_plot: 149 | f, axs = plt.subplots(1, 2, figsize=(15, 5)) 150 | plot_mean_var(ad, title, ax=axs[0]) 151 | plot_mean_dropout(ad, title, axs[1], opt_zinb_theta=opt_theta, legend_out=True) 152 | plt.tight_layout() 153 | else: 154 | f, ax = plt.subplots(1, 1, figsize=(10, 5)) 155 | plot_mean_dropout(ad, title, ax, opt_zinb_theta=opt_theta, legend_out=True) 156 | plt.tight_layout() 157 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make <target>' where <target> is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp."
87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/autoencoder.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/autoencoder.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/autoencoder" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/autoencoder" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
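# Example invocation (assuming sphinx-build is installed and on the PATH):
#
#   make html        # build the HTML docs into build/html
#   make linkcheck   # verify all external links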
177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # autoencoder documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Apr 9 12:35:06 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../autoencoder')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.todo', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.githubpages', 40 | 'sphinx.ext.napoleon', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 
47 | # You can specify multiple suffixes as a list of strings: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The encoding of source files. 53 | # 54 | # source_encoding = 'utf-8-sig' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'autoencoder' 61 | copyright = '2017, Gokcen Eraslan' 62 | author = 'Gokcen Eraslan' 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = '0.1' 70 | # The full version, including alpha/beta/rc tags. 71 | release = '0.1' 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # There are two options for replacing |today|: either, you set today to some 81 | # non-false value, then it is used: 82 | # 83 | # today = '' 84 | # 85 | # Else, today_fmt is used as the format for a strftime call. 86 | # 87 | # today_fmt = '%B %d, %Y' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # These patterns also affect html_static_path and html_extra_path 92 | exclude_patterns = [] 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. 96 | # 97 | # default_role = None 98 | 99 | # If true, '()' will be appended to :func: etc. cross-reference text. 100 | # 101 | # add_function_parentheses = True 102 | 103 | # If true, the current module name will be prepended to all description 104 | # unit titles (such as .. function::). 105 | # 106 | # add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | # 111 | # show_authors = False 112 | 113 | # The name of the Pygments (syntax highlighting) style to use. 114 | pygments_style = 'sphinx' 115 | 116 | # A list of ignored prefixes for module index sorting. 117 | # modindex_common_prefix = [] 118 | 119 | # If true, keep warnings as "system message" paragraphs in the built documents. 120 | # keep_warnings = False 121 | 122 | # If true, `todo` and `todoList` produce output, else they produce nothing. 123 | todo_include_todos = True 124 | 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | html_theme = 'alabaster' 132 | 133 | # Theme options are theme-specific and customize the look and feel of a theme 134 | # further. For a list of options available for each theme, see the 135 | # documentation. 136 | # 137 | # html_theme_options = {} 138 | 139 | # Add any paths that contain custom themes here, relative to this directory. 140 | # html_theme_path = [] 141 | 142 | # The name for this set of Sphinx documents. 143 | # "<project> v<release> documentation" by default. 144 | # 145 | # html_title = 'autoencoder v0.1' 146 | 147 | # A shorter title for the navigation bar. Default is the same as html_title.
148 | # 149 | # html_short_title = None 150 | 151 | # The name of an image file (relative to this directory) to place at the top 152 | # of the sidebar. 153 | # 154 | # html_logo = None 155 | 156 | # The name of an image file (relative to this directory) to use as a favicon of 157 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 158 | # pixels large. 159 | # 160 | # html_favicon = None 161 | 162 | # Add any paths that contain custom static files (such as style sheets) here, 163 | # relative to this directory. They are copied after the builtin static files, 164 | # so a file named "default.css" will overwrite the builtin "default.css". 165 | html_static_path = ['_static'] 166 | 167 | # Add any extra paths that contain custom files (such as robots.txt or 168 | # .htaccess) here, relative to this directory. These files are copied 169 | # directly to the root of the documentation. 170 | # 171 | # html_extra_path = [] 172 | 173 | # If not None, a 'Last updated on:' timestamp is inserted at every page 174 | # bottom, using the given strftime format. 175 | # The empty string is equivalent to '%b %d, %Y'. 176 | # 177 | # html_last_updated_fmt = None 178 | 179 | # If true, SmartyPants will be used to convert quotes and dashes to 180 | # typographically correct entities. 181 | # 182 | # html_use_smartypants = True 183 | 184 | # Custom sidebar templates, maps document names to template names. 185 | # 186 | # html_sidebars = {} 187 | 188 | # Additional templates that should be rendered to pages, maps page names to 189 | # template names. 190 | # 191 | # html_additional_pages = {} 192 | 193 | # If false, no module index is generated. 194 | # 195 | # html_domain_indices = True 196 | 197 | # If false, no index is generated. 198 | # 199 | # html_use_index = True 200 | 201 | # If true, the index is split into individual pages for each letter. 202 | # 203 | # html_split_index = False 204 | 205 | # If true, links to the reST sources are added to the pages. 206 | # 207 | # html_show_sourcelink = True 208 | 209 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 210 | # 211 | # html_show_sphinx = True 212 | 213 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 214 | # 215 | # html_show_copyright = True 216 | 217 | # If true, an OpenSearch description file will be output, and all pages will 218 | # contain a <link> tag referring to it. The value of this option must be the 219 | # base URL from which the finished HTML is served. 220 | # 221 | # html_use_opensearch = '' 222 | 223 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 224 | # html_file_suffix = None 225 | 226 | # Language to be used for generating the HTML full-text search index. 227 | # Sphinx supports the following languages: 228 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 229 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 230 | # 231 | # html_search_language = 'en' 232 | 233 | # A dictionary with options for the search language support, empty by default. 234 | # 'ja' uses this config value. 235 | # For 'zh', the user can customize the `jieba` dictionary path. 236 | # 237 | # html_search_options = {'type': 'default'} 238 | 239 | # The name of a javascript file (relative to the configuration directory) that 240 | # implements a search results scorer. If empty, the default will be used. 241 | # 242 | # html_search_scorer = 'scorer.js' 243 | 244 | # Output file base name for HTML help builder.
245 | htmlhelp_basename = 'autoencoderdoc' 246 | 247 | # -- Options for LaTeX output --------------------------------------------- 248 | 249 | latex_elements = { 250 | # The paper size ('letterpaper' or 'a4paper'). 251 | # 252 | # 'papersize': 'letterpaper', 253 | 254 | # The font size ('10pt', '11pt' or '12pt'). 255 | # 256 | # 'pointsize': '10pt', 257 | 258 | # Additional stuff for the LaTeX preamble. 259 | # 260 | # 'preamble': '', 261 | 262 | # Latex figure (float) alignment 263 | # 264 | # 'figure_align': 'htbp', 265 | } 266 | 267 | # Grouping the document tree into LaTeX files. List of tuples 268 | # (source start file, target name, title, 269 | # author, documentclass [howto, manual, or own class]). 270 | latex_documents = [ 271 | (master_doc, 'autoencoder.tex', 'autoencoder Documentation', 272 | 'Gokcen Eraslan', 'manual'), 273 | ] 274 | 275 | # The name of an image file (relative to this directory) to place at the top of 276 | # the title page. 277 | # 278 | # latex_logo = None 279 | 280 | # For "manual" documents, if this is true, then toplevel headings are parts, 281 | # not chapters. 282 | # 283 | # latex_use_parts = False 284 | 285 | # If true, show page references after internal links. 286 | # 287 | # latex_show_pagerefs = False 288 | 289 | # If true, show URL addresses after external links. 290 | # 291 | # latex_show_urls = False 292 | 293 | # Documents to append as an appendix to all manuals. 294 | # 295 | # latex_appendices = [] 296 | 297 | # If false, will not define \strong, \code, \titleref, \crossref ... but only 298 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clashes with user-added 299 | # packages. 300 | # 301 | # latex_keep_old_macro_names = True 302 | 303 | # If false, no module index is generated. 304 | # 305 | # latex_domain_indices = True 306 | 307 | 308 | # -- Options for manual page output --------------------------------------- 309 | 310 | # One entry per manual page. List of tuples 311 | # (source start file, name, description, authors, manual section). 312 | man_pages = [ 313 | (master_doc, 'autoencoder', 'autoencoder Documentation', 314 | [author], 1) 315 | ] 316 | 317 | # If true, show URL addresses after external links. 318 | # 319 | # man_show_urls = False 320 | 321 | 322 | # -- Options for Texinfo output ------------------------------------------- 323 | 324 | # Grouping the document tree into Texinfo files. List of tuples 325 | # (source start file, target name, title, author, 326 | # dir menu entry, description, category) 327 | texinfo_documents = [ 328 | (master_doc, 'autoencoder', 'autoencoder Documentation', 329 | author, 'autoencoder', 'One line description of project.', 330 | 'Miscellaneous'), 331 | ] 332 | 333 | # Documents to append as an appendix to all manuals. 334 | # 335 | # texinfo_appendices = [] 336 | 337 | # If false, no module index is generated. 338 | # 339 | # texinfo_domain_indices = True 340 | 341 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 342 | # 343 | # texinfo_show_urls = 'footnote' 344 | 345 | # If true, do not generate a @detailmenu in the "Top" node's menu. 346 | # 347 | # texinfo_no_detailmenu = False 348 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. autoencoder documentation master file, created by 2 | sphinx-quickstart on Sun Apr 9 12:35:06 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to autoencoder's documentation!
7 | =======================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 |
15 | API documentation
16 | =================
17 |
18 |
19 | .. automodule:: dca.api
20 | :members:
21 | .. automodule:: dca.loss
22 | :members:
23 | .. automodule:: dca.network
24 | :members:
25 | .. automodule:: dca.train
26 | :members:
27 | .. automodule:: dca.io
28 | :members:
29 |
30 |
31 |
32 | Indices and tables
33 | ==================
34 |
35 | * :ref:`genindex`
36 | * :ref:`modindex`
37 | * :ref:`search`
38 |
39 |
-------------------------------------------------------------------------------- /pytest.ini: --------------------------------------------------------------------------------
1 | [pytest]
2 | python_files=dca/test*.py
-------------------------------------------------------------------------------- /reproducibility/code/Figure4.R: --------------------------------------------------------------------------------
1 | # Load matrices ####
2 | withoutDropout <- read.csv("../data/francesconi/francesconi_withoutDropout.csv", row.names = 1)
3 | withDropout <- read.csv("../data/francesconi/francesconi_withDropout.csv", row.names = 1)
4 | dca <- read.csv("../data/francesconi/francesconi_dca.csv", row.names = 1)
5 | magic <- read.csv("../data/francesconi/francesconi_magic.csv", row.names = 1)
6 | scimpute <- read.csv("../data/francesconi/francesconi_scimpute.csv", row.names = 1)
7 | saver <- read.csv("../data/francesconi/francesconi_saver.csv", row.names = 1)
8 |
9 | # Generate heatmaps ####
10 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
11 | pvals <- unlist(lapply(cors, function(x) x$p.value))
12 | coefs <- unlist(lapply(cors, function(x) x$estimate))
13 | genes.up <- names(head(sort(pvals[coefs > 0]), 100))
14 | genes.down <- names(head(sort(pvals[coefs < 0]), 100))
15 |
16 | genHeatmap <- function(matr){
17 | library(gplots)
18 | load("../data/BlueYellowColormaps_V1.RData")
19 | genes <- c(genes.up, genes.down)
20 | rowOrd <- order(unlist(lapply(cors[genes], function(x) x$estimate)))
21 | matr <- matr[match(genes, rownames(withoutDropout)),]
22 | tmp <- data.matrix(matr)
23 | tmp <- t(apply(tmp, 1, function(x) (x - mean(x)) / sd(x)))
24 | tmp[which(tmp > 2)] <- 2
25 | tmp[which(tmp < (-2))] <- (-2)
26 | tmp[which(matr == 0)] <- NA
27 | heatmap.2(tmp[rowOrd,], Rowv = NA, Colv = NA, density.info = "none", trace = "none", col = yellow2blue, na.color = "grey", scale = "none", labRow = "", labCol = "")
28 | }
29 |
30 | genHeatmap(withoutDropout) # Panel A
31 | genHeatmap(withDropout) # Panel B
32 | genHeatmap(dca) # Panel C
33 |
34 | # Generate boxplot ####
35 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
36 | pvals <- unlist(lapply(cors, function(x) x$p.value))
37 | coefs <- unlist(lapply(cors, function(x) x$estimate))
38 |
39 | genes <- names(head(sort(unlist(lapply(cors, function(x) x$p.value))), 500))
40 |
41 | calc.cor <- function(x){
42 | ok <- match(genes, rownames(withoutDropout))
43 | C <- apply(x[ok,], 1, function(y) cor(y, 1:ncol(x)))
44 | return(abs(C))
45 | }
46 |
47 | imputed <- list(withoutDropout, withDropout, dca, saver, scimpute, magic)
48 | imputed <- lapply(imputed, data.matrix)
49 |
50 | imp <- lapply(imputed, calc.cor)
51 |
boxplot(imp, main = "Correlation with Time", ylab = "Pearson Correlation", names = c("Without noise", "With noise", "AE", "SAVER", "scImpute", "MAGIC"),
52 | cex.main = 2, cex.lab = 1.5, cex.axis = 1.5, outline = F)
53 |
54 |
55 | # Generate correlation plots ####
56 | genes <- c("tbx-36", "his-8")
57 |
58 | cors <- apply(withoutDropout, 1, function(x) cor.test(method = "pearson", x, 1:ncol(withoutDropout)))
59 | pvals <- unlist(lapply(cors, function(x) x$p.value))
60 | coefs <- unlist(lapply(cors, function(x) x$estimate))
61 |
62 | genes.up <- names(head(sort(pvals[coefs > 0]), 100))
63 | genes.down <- names(head(sort(pvals[coefs < 0]), 100))
64 |
65 | scale01 <- function(x) (x - min(x)) / (max(x) - min(x))
66 | genCorPlot <- function(gene1, gene2, matr){
67 | par(mfrow = c(1, 3))
68 | library(plotrix)
69 | farben <- color.scale(1:206, extremes = c("blue", "red"), alpha = 0.8)
70 | plot(scale01(exp(unlist(withoutDropout[gene1,]))), scale01(exp(unlist(withoutDropout[gene2,]))), col = farben, pch = 16, main = "Original", ylab = gene2, xlab = gene1)
71 | correl <- signif(cor(method = "spearman", scale01(exp(unlist(withoutDropout[gene1,]))), scale01(exp(unlist(withoutDropout[gene2,])))), 2)
72 | legend("topright", paste("Spearman Rho", correl))
73 |
74 | plot(scale01(unlist(withDropout[gene1,])), scale01(unlist(withDropout[gene2,])), col = farben, pch = 16, main = "Dropout", ylab = gene2, xlab = gene1)
75 | correl <- signif(cor(method = "spearman", scale01(unlist(withDropout[gene1,])), scale01(unlist(withDropout[gene2,]))), 2)
76 | legend("topright", paste("Spearman Rho", correl))
77 |
78 | plot(scale01(unlist(matr[gene1,])), scale01(unlist(matr[gene2,])), col = farben, pch = 16, main = "Denoised", ylab = gene2, xlab = gene1)
79 | correl <- signif(cor(method = "spearman", scale01(unlist(matr[gene1,])), scale01(unlist(matr[gene2,]))), 2)
80 | legend("topright", paste("Spearman Rho", correl))
81 |
82 | }
83 |
84 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = dca) # Panels E, F, G
85 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = saver)
86 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = magic)
87 | genCorPlot(gene1 = genes[2], gene2 = genes[1], matr = scimpute)
88 |
-------------------------------------------------------------------------------- /reproducibility/code/Figure5.R: --------------------------------------------------------------------------------
1 | # Load libraries ####
2 | library(DESeq2)
3 | library(plotrix)
4 | library(ggplot2)
5 | library(beeswarm)
6 |
7 | # Load DESeq2 results ####
8 | load("../data/chu/chu_deseq2_results.RData")
9 |
10 | # Generate fold change plots ####
11 | # Panels A and B
12 | pdf(useDingbats = F, "../figs/Fig5_A_B.pdf", width = 8, height = 4.5)
13 | par(mfrow = c(1, 2))
14 | diffs <- list(abs(res_original$log2FoldChange - res_bulk$log2FoldChange),
15 | abs(res_dca$log2FoldChange - res_bulk$log2FoldChange))
16 | farben <- color.scale(unlist(diffs), alpha = 0.8, extremes = c("darkblue", "darkred"))
17 | plot(res_original$log2FoldChange, res_bulk$log2FoldChange, main = "Original", ylab = "Bulk", xlab = "Estimated fold change", ylim = c(-5, 15), xlim = c(-30, 30), col = farben[1:1000], pch = 16)
18 | abline(0, 1)
19 | abline(v = 0, h = 0, col = "grey", lty = 2)
20 | legend("bottomright", paste("Rho:", signif(cor(res_original$log2FoldChange, res_bulk$log2FoldChange), 2)), bty = "n")
21 | plot(res_dca$log2FoldChange, res_bulk$log2FoldChange, main = "DCA denoised", ylab = "Bulk", xlab = "Estimated fold change", ylim = c(-5, 15), xlim
= c(-30, 30), col = farben[1001:2000], pch = 16) 22 | abline(0, 1) 23 | abline(v = 0, h = 0, col = "grey", lty = 2) 24 | legend("bottomright", paste("Rho:", signif(cor(res_dca$log2FoldChange, res_bulk$log2FoldChange), 2)), bty = "n") 25 | dev.off() 26 | 27 | # Load expression tables #### 28 | bulk <- data.matrix(read.csv("../data/chu/chu_bulk.csv", row.names = 1)) 29 | treat_bulk <- colnames(bulk) 30 | treat_bulk <- unlist(lapply(treat_bulk, function(x) strsplit(x, "_", fixed = T)[[1]][1])) 31 | ok <- which(treat_bulk %in% c("H1", "DEC")) 32 | treat_bulk <- treat_bulk[ok] 33 | bulk <- bulk[, ok] 34 | 35 | counts <- read.csv("../data/chu/chu_original.csv", row.names = 1) 36 | counts <- round(counts) 37 | treat <- unlist(lapply(colnames(counts), function(x) strsplit(x, "_", fixed = T)[[1]][1])) 38 | farben <- c("black", "yellow", "blue", "purple", "green", "red", "grey") 39 | names(farben) <- c("H1", "H9", "EC", "NPC", "DEC", "HFF", "TB") 40 | ok <- which(treat %in% c("H1", "DEC")) 41 | counts <- counts[, ok] 42 | treat <- treat[ok] 43 | dca <- data.matrix(read.csv("../data/chu/chu_dca.csv", row.names = 1)) 44 | original <- data.matrix(counts[rownames(dca),]) 45 | bulk <- data.matrix(bulk[rownames(dca),]) 46 | 47 | # Generate single gene plots #### 48 | # Panels C, D and E 49 | pdf(useDingbats = F, "../figs/Fig5_C_D_E.pdf", width = 9, height = 3.5) 50 | par(mfrow = c(1, 3)) 51 | gene <- "LEFTY1" 52 | boxplot(split(original[gene,], treat)[c("H1", "DEC")], outline = FALSE, main = "Original", ylim = c(0, 5000), ylab = gene) 53 | #beeswarm(split(original[gene,], treat)[c("H1", "DEC")], pch = 16, add = TRUE, cex = 0.8) 54 | boxplot(split(dca[gene,], treat)[c("H1", "DEC")], outline = FALSE, main = "DCA denoised", ylim = c(0, 5000), ylab = gene) 55 | #beeswarm(split(dca[gene,], treat)[c("H1", "DEC")], pch = 16, add = TRUE, cex = 0.8) 56 | boxplot(split(bulk[gene,], treat_bulk)[c("H1", "DEC")], outline = FALSE, main = "Bulk", ylab = gene) 57 | dev.off() 58 | 59 | # Generate boxplot #### 60 | # Panel F 61 | load("../data/chu/HundredTimes_20cells.RData") 62 | load("../data/chu/chu_deseq2_results.RData") 63 | 64 | res_bulk <- res_bulk[rownames(res_original), ] 65 | res_bulk$log2FoldChange <- res_bulk$log2FoldChange*(-1) 66 | tmp <- lapply(1:5, function(y) unlist(lapply(1:100, function(x) cor(res_bulk$log2FoldChange, hundredTimes[[x]][[y]]$log2FoldChange, use = "complete")))) 67 | 68 | colors <- list(c(192, 81, 158), c(73, 93, 115), c(152, 201, 125), c(117, 90, 36)) 69 | colors <- c("white", unlist(lapply(colors, function(x) rgb((x/sum(x))[1], (x/sum(x))[2], (x/sum(x))[3])))) 70 | 71 | pdf(useDingbats = F, "../figs/Fig5_F.pdf", height = 4, width = 3.5) 72 | boxplot(tmp, names = c("original", "DCA", "SAVER", "MAGIC", "scImpute"), ylab = "Pearson correlation", las = 2, col = colors, outline = F) 73 | dev.off() 74 | 75 | 76 | -------------------------------------------------------------------------------- /reproducibility/code/Figure6.R: -------------------------------------------------------------------------------- 1 | # Load pre-calculated Seurat object #### 2 | library(Seurat) 3 | load("../data/stoeckius/CBMC.seurat.RData") 4 | 5 | # Generate tSNE visualization showing celltype clustering (Fig Panel A) #### 6 | panelA <- TSNEPlot(cbmc, do.label = TRUE, pt.size = 0.5) 7 | panelA 8 | 9 | # Load imputed data #### 10 | dca <- read.csv("../data/stoeckius/stoeckius_dca.csv", row.names = 1) 11 | magic <- read.csv("../data/stoeckius/stoeckius_magic.csv", row.names = 1) 12 | saver <- 
read.csv("../data/stoeckius/stoeckius_saver.csv", row.names = 1) 13 | scimpute <- read.csv("../data/stoeckius/stoeckius_scimpute.csv", row.names = 1) 14 | 15 | # Define protein-mRNA pairs #### 16 | protein <- c("CD3", "CD19", "CD4", "CD8", "CD56", "CD16", "CD11c", "CD14") 17 | rna <- c("CD3E", "CD19", "CD4", "CD8A", "NCAM1", "FCGR3A", "ITGAX", "CD14") 18 | 19 | # Add imputed RNA levels to Seurat object #### 20 | tmp <- dca 21 | rownames(tmp) <- gsub("HUMAN", "IMPUTED", rownames(tmp)) 22 | cbmc <- SetAssayData(cbmc, assay.type = "IMPUTED", slot = "raw.data", new.data = data.matrix(tmp)) 23 | cbmc <- NormalizeData(cbmc, assay.type = "IMPUTED") 24 | cbmc <- ScaleData(cbmc, assay.type = "IMPUTED", display.progress = FALSE) 25 | 26 | tmp <- magic 27 | rownames(tmp) <- gsub("HUMAN", "MAGIC", rownames(tmp)) 28 | cbmc <- SetAssayData(cbmc, assay.type = "MAGIC", slot = "raw.data", new.data = data.matrix(tmp)) 29 | cbmc <- NormalizeData(cbmc, assay.type = "MAGIC") 30 | cbmc <- ScaleData(cbmc, assay.type = "MAGIC", display.progress = FALSE) 31 | 32 | tmp <- saver 33 | rownames(tmp) <- gsub("HUMAN", "SAVER", rownames(tmp)) 34 | cbmc <- SetAssayData(cbmc, assay.type = "SAVER", slot = "raw.data", new.data = data.matrix(tmp)) 35 | cbmc <- NormalizeData(cbmc, assay.type = "SAVER") 36 | cbmc <- ScaleData(cbmc, assay.type = "SAVER", display.progress = FALSE) 37 | 38 | tmp <- scimpute 39 | rownames(tmp) <- gsub("HUMAN", "SCIMPUTE", rownames(tmp)) 40 | cbmc <- SetAssayData(cbmc, assay.type = "SCIMPUTE", slot = "raw.data", new.data = data.matrix(tmp)) 41 | cbmc <- NormalizeData(cbmc, assay.type = "SCIMPUTE") 42 | cbmc <- ScaleData(cbmc, assay.type = "SCIMPUTE", display.progress = FALSE) 43 | 44 | # tSNE colored by imputed and original RNA expression (Fig Panel B) #### 45 | panelB1 <- FeaturePlot(cbmc, features.plot = c(paste0("CITE_", protein[1:4]), paste0("HUMAN_", rna[1:4]), paste0("IMPUTED_", rna[1:4])), 46 | min.cutoff = "q05", max.cutoff = "q95", nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5, do.return = T) 47 | panelB2 <- FeaturePlot(cbmc, features.plot = c(paste0("CITE_", protein[5:8]), paste0("HUMAN_", rna[5:8]), paste0("IMPUTED_", rna[5:8])), 48 | min.cutoff = "q05", max.cutoff = "q95", nCol = 4, cols.use = c("lightgrey", "blue"), pt.size = 0.5, do.return = T) 49 | 50 | # Example plot of CD3 expression in T cells (Fig Panel C) #### 51 | tmp <- SubsetData(cbmc, ident.use = c(0, 5)) 52 | rna.raw <- tmp@data["HUMAN_CD3E",] 53 | protein <- tmp@assay$CITE@scale.data["CITE_CD3",] 54 | rna.imputed <- tmp@assay$IMPUTED@scale.data["IMPUTED_CD3E",] 55 | table(rna.raw == 0)[["TRUE"]]/length(rna.raw) 56 | scale01 <- function(x){ 57 | x <- (x-min(x)) / (max(x) - min(x)) 58 | x - median(x) 59 | } 60 | aframe <- data.frame(Relative.expresion = c(scale01(protein), scale01(rna.raw), scale01(rna.imputed)), type = c(rep("Protein", length(protein)), rep("Original", length(protein)), rep("Denoised", length(protein)))) 61 | panelC <- ggplot(aframe, aes(Relative.expresion, colour = type)) + geom_density() + ggtitle("CD3 in T cells") 62 | panelC 63 | 64 | # Calculate likelihoods of co-occurrence (Fig Panel D) #### 65 | protein <- c("CD3", "CD19", "CD4", "CD8", "CD56", "CD16", "CD11c", "CD14") 66 | rna <- c("CD3E", "CD19", "CD4", "CD8A", "NCAM1", "FCGR3A", "ITGAX", "CD14")= 67 | l <- list(cor(t(cbmc@scale.data[paste0("HUMAN_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"), 68 | cor(t(cbmc@assay$IMPUTED@scale.data[paste0("IMPUTED_", rna),]), 
t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
69 | cor(t(cbmc@assay$MAGIC@scale.data[paste0("MAGIC_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
70 | cor(t(cbmc@assay$SAVER@scale.data[paste0("SAVER_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"),
71 | cor(t(cbmc@assay$SCIMPUTE@scale.data[paste0("SCIMPUTE_", rna),]), t(cbmc@assay$CITE@scale.data[paste0("CITE_", protein),]), method = "spearman"))
72 | l <- lapply(l, diag)
73 | boxplot(l, ylab = "Spearman Correlation", names = c("Original", "DCA", "MAGIC", "SAVER", "scImpute"), las = 2)
74 |
75 |
76 |
77 |
-------------------------------------------------------------------------------- /reproducibility/code/Figure8.R: --------------------------------------------------------------------------------
1 | # Load pre-calculated Seurat object ####
2 | library(Seurat)
3 | load("../data/stoeckius/CBMC.seurat.RData")
4 |
5 | # Load DCA denoised data ####
6 | dca <- read.csv("../data/stoeckius/stoeckius_dca.csv", row.names = 1)
7 |
8 | # Add imputed data to Seurat object ####
9 | tmp <- dca
10 | rownames(tmp) <- gsub("HUMAN", "IMPUTED", rownames(tmp))
11 | cbmc <- SetAssayData(cbmc, assay.type = "IMPUTED", slot = "raw.data", new.data = data.matrix(tmp))
12 | cbmc <- NormalizeData(cbmc, assay.type = "IMPUTED")
13 | cbmc <- ScaleData(cbmc, assay.type = "IMPUTED", display.progress = FALSE)
14 |
15 | # Subset to NK cells ####
16 | sub <- SubsetData(cbmc, ident.use = 3)
17 | sub <- ScaleData(sub, assay.type = "CITE", display.progress = FALSE)
18 | sub <- ScaleData(sub, display.progress = FALSE, vars.to.regress = "nUMI")
19 |
20 | # Generate tSNEs colored by protein levels ####
21 | FeaturePlot(sub, c("CITE_CD56", "CITE_CD16"), min.cutoff = "q01", max.cutoff = "q99", cols.use = c("grey", "blue")) # Panel A & B
22 |
23 | # Generate scatterplot of expression levels ####
24 | par(mfrow = c(1,3))
25 | library(mclust)
26 | tmp <- sub@assay$CITE@scale.data[c('CITE_CD56', 'CITE_CD16'),]
27 | m_prot <- Mclust(t(tmp), G = 2)
28 | plot(t(tmp), col = m_prot$classification, main = 'Protein', pch = 16) # Panel C
29 |
30 | tmp <- data.matrix(sub@data[c('HUMAN_NCAM1', 'HUMAN_FCGR3A'),])
31 | m_orig <- Mclust(t(tmp), G = 2)
32 | plot(t(tmp), col = m_prot$classification, main = 'Original RNA', pch = 16) # Panel D
33 |
34 | tmp <- data.matrix(sub@assay$IMPUTED@data[c('IMPUTED_NCAM1', 'IMPUTED_FCGR3A'),])
35 | m_imp <- Mclust(t(tmp), G = 2)
36 | plot(t(tmp), col = m_prot$classification, main = 'Denoised RNA', pch = 16) # Panel E
37 |
38 | fisher.test(table(m_prot$classification == 1, m_imp$classification == 2))
39 | fisher.test(table(m_prot$classification == 1, m_orig$classification == 2))
-------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingDCA.sh: --------------------------------------------------------------------------------
1 | dca ../data/chu/chu_original.csv ../data/chu/res_dca
2 | dca ../data/francesconi/francesconi_original.csv ../data/francesconi/res_dca
3 | dca --type nb-conddisp ../data/stoeckius/stoeckius_original.csv ../data/stoeckius/res_dca
-------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingMAGIC.py: --------------------------------------------------------------------------------
1 | import magic
2 | import os
3 |
4 | scdata = magic.mg.SCData.from_csv("../data/chu/chu_original.csv", cell_axis="columns",
data_type='sc-seq') 5 | scdata.run_magic() 6 | mdata = scdata.magic.data 7 | mdata=mdata.transpose() 8 | mdata.to_csv("../data/chu/chu_magic.csv") 9 | 10 | scdata = magic.mg.SCData.from_csv("../data/francesconi/francesconi_original.csv", cell_axis="columns", data_type='sc-seq') 11 | scdata.run_magic() 12 | mdata = scdata.magic.data 13 | mdata=mdata.transpose() 14 | mdata.to_csv("../data/francesconi/francesconi_magic.csv") 15 | 16 | scdata = magic.mg.SCData.from_csv("../data/stoeckius/stoeckius_original.csv", cell_axis="columns", data_type='sc-seq') 17 | scdata.run_magic() 18 | mdata = scdata.magic.data 19 | mdata=mdata.transpose() 20 | mdata.to_csv("../data/stoeckius/stoeckius_magic.csv") 21 | -------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingSAVER.R: -------------------------------------------------------------------------------- 1 | # Imputation using SAVER #### 2 | library(SAVER) 3 | library(doParallel) 4 | registerDoParallel(cores = 5) 5 | sc <- read.csv("../data/francesconi/francesconi_withDropout.csv", row.names = 1) 6 | sav <- saver(as.matrix(sc), parallel = TRUE) 7 | sav <- sav$estimate 8 | write.csv(sav, file = "../data/francesconi/francesconi_saver.csv", quote = F) 9 | 10 | sc <- read.csv("../data/chu/chu_original.csv", row.names = 1) 11 | sav <- saver(as.matrix(sc), parallel = TRUE) 12 | sav <- sav$estimate 13 | write.csv(sav, file = "../data/chu/chu_saver.csv", quote = F) 14 | 15 | sc <- read.csv("../data/stoeckius/stoeckius_original.csv", row.names = 1) 16 | sav <- saver(as.matrix(sc), parallel = TRUE) 17 | sav <- sav$estimate 18 | write.csv(sav, file = "../data/stoeckius/stoeckius_saver.csv", quote = F) 19 | -------------------------------------------------------------------------------- /reproducibility/code/ImputeUsingSCIMPUTE.R: -------------------------------------------------------------------------------- 1 | library(scImpute) 2 | 3 | scimpute(count_path = "../data/francesconi/francesconi_withDropout.csv", infile = "csv", outfile = "csv", out_dir = "../data/francesconi/scimpute", Kcluster = 1) 4 | scimpute(count_path = "../data/chu/chu_original.csv", infile = "csv", outfile = "csv", out_dir = "../data/chu/scimpute", Kcluster = 2) 5 | scimpute(count_path = "../data/stoeckius/stoeckius_original.csv", infile = "csv", outfile = "csv", out_dir = "../data/stoeckius/scimpute", Kcluster = 13) 6 | -------------------------------------------------------------------------------- /reproducibility/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # downloads 722M dataset file 4 | wget https://hmgubox.helmholtz-muenchen.de/f/1a014dc377f64b2b964c/?dl=1 -O datasets.zip 5 | mkdir data; cd data 6 | unzip ../datasets.zip 7 | -------------------------------------------------------------------------------- /scripts/seurat.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(Seurat, quietly = T)) 2 | suppressMessages(library(ggplot2, quietly = T)) 3 | suppressMessages(library(Rtsne, quietly = T)) 4 | 5 | normalize <- function(x) { 6 | sf <- rowSums(x) 7 | sf <- sf / median(sf) 8 | x <- x / sf 9 | x <- log(x+1) 10 | scale(x, center = T, scale = T) 11 | } 12 | 13 | 14 | `%+%` <- paste0 15 | args <- commandArgs(trailingOnly = T) 16 | stopifnot(length(args) == 1) 17 | arg <- args[[1]] 18 | 19 | if (!dir.exists(arg)) { 20 | files <- arg 21 | } else { 22 | files <- list.files(arg, recursive = T, pattern = 
'^counts\\..sv', full.names = T) 23 | } 24 | 25 | for (cnt.file in files) { 26 | print('Visualizing ' %+% cnt.file) 27 | 28 | output.dir <- dirname(cnt.file) 29 | tbl <- read.table(cnt.file, header = T) 30 | 31 | # Load labels if available ------------------------------------------------ 32 | 33 | if (file.exists(output.dir %+% '/info_cellinfo.tsv')) { 34 | labels <- read.table(output.dir %+% '/info_cellinfo.tsv', header=T)$Group 35 | } else if (file.exists(output.dir %+% '/../info_cellinfo.tsv')) { 36 | labels <- read.table(output.dir %+% '/../info_cellinfo.tsv', header=T)$Group 37 | } 38 | else labels <- NULL 39 | 40 | # Seurat PCA and tSNE ----------------------------------------------------- 41 | 42 | s <- CreateSeuratObject(tbl, min.cells = 1, min.genes = 1) 43 | print(s) 44 | s <- NormalizeData(s, display.progress = F) 45 | s <- ScaleData(s, display.progress = F) 46 | 47 | s <- RunPCA(s, pc.genes = rownames(s@data), do.print = F) 48 | s <- RunTSNE(s) 49 | s <- FindClusters(s, reduction.type = "pca", dims.use=1:5, save.SNN = T, print.output = 0) 50 | print('Number of clusters: ' %+% length(levels(s@ident))) 51 | 52 | DimPlot(s) 53 | ggsave(output.dir %+% '/seurat_PCA_all_CL.png') 54 | DimPlot(s, reduction.use = 'tsne') 55 | ggsave(output.dir %+% '/seurat_tSNE_all_CL.png') 56 | if (!is.null(labels)) { 57 | s@meta.data$ground.truth <- labels 58 | DimPlot(s, group.by='ground.truth') 59 | ggsave(output.dir %+% '/seurat_PCA_all_GT.png') 60 | DimPlot(s, reduction.use = 'tsne', group.by='ground.truth') 61 | ggsave(output.dir %+% '/seurat_tSNE_all_GT.png') 62 | } 63 | 64 | s <- FindVariableGenes(s, do.plot = F, display.progress = F) 65 | print('Number of variable genes: ' %+% length(s@var.genes)) 66 | s <- RunPCA(s, do.print = F) # use variable genes by default 67 | s <- RunTSNE(s) 68 | s <- FindClusters(s, reduction.type = "pca", dims.use = 1:5, save.SNN = T, print.output = 0, force.recalc = T) 69 | print('Number of clusters: ' %+% length(levels(s@ident))) 70 | 71 | DimPlot(s) 72 | ggsave(output.dir %+% '/seurat_PCA_var_CL.png') 73 | DimPlot(s, reduction.use = 'tsne') 74 | ggsave(output.dir %+% '/seurat_tSNE_var_CL.png') 75 | if (!is.null(labels)) { 76 | DimPlot(s, group.by='ground.truth') 77 | ggsave(output.dir %+% '/seurat_PCA_var_GT.png') 78 | DimPlot(s, reduction.use = 'tsne', group.by='ground.truth') 79 | ggsave(output.dir %+% '/seurat_tSNE_var_GT.png') 80 | } 81 | 82 | write.table(data.frame(label=unname(s@ident), cell=names(s@ident)), 83 | output.dir %+% '/seurat_cluster_labels.tsv', 84 | row.names = F, quote = F) 85 | 86 | saveRDS(s, output.dir %+% '/seurat.Rds') 87 | 88 | # PCA and tSNE with sf and lognorm ---------------------------------------- 89 | 90 | if (!is.null(labels)) { 91 | counts <- t(tbl) 92 | counts <- counts[, colSums(counts)>0] 93 | norm.counts <- normalize(counts) 94 | 95 | pca.counts <- prcomp(norm.counts, rank. = 2)$x 96 | qplot(pca.counts[,1], pca.counts[,2], color=labels, xlab='PC1', ylab='PC2') 97 | ggsave(output.dir %+% '/seurat_PCA_all_simplepre_GT.png') 98 | 99 | tsne.counts <- Rtsne(norm.counts)$Y 100 | qplot(tsne.counts[,1], tsne.counts[,2], color=labels, xlab='tsne1', ylab='tsne2') 101 | ggsave(output.dir %+% '/seurat_tSNE_all_simplepre_GT.png') 102 | 103 | if (file.exists(output.dir %+% '/info_truecounts.tsv')) { 104 | 105 | tr <- t(read.table(output.dir %+% '/info_truecounts.tsv')) 106 | tr<- tr[, colSums(tr)>0] 107 | tr.norm <- normalize(tr) 108 | pca.tr <- prcomp(tr.norm, rank. 
= 2)$x
109 | qplot(pca.tr[,1], pca.tr[,2], color=labels, xlab='pca1', ylab='pca2')
110 | ggsave(output.dir %+% '/seurat_TRUECOUNT_PCA_all_simplepre_GT.png')
111 |
112 | tsne.tr <- Rtsne(tr.norm)$Y
113 | qplot(tsne.tr[,1], tsne.tr[,2], color=labels, xlab='tsne1', ylab='tsne2')
114 | ggsave(output.dir %+% '/seurat_TRUECOUNT_tSNE_all_simplepre_GT.png')
115 |
116 | }
117 | }
118 |
119 | }
-------------------------------------------------------------------------------- /scripts/simulate.R: --------------------------------------------------------------------------------
1 | # Warning! R 3.4 and Bioconductor 3.5 are required for splatter!
2 | # library(BiocInstaller)
3 | # biocLite('splatter')
4 | library(splatter) # requires splatter >= 1.2.0
5 |
6 | save.sim <- function(sim, dir) {
7 | counts <- counts(sim)
8 | truecounts <- assays(sim)$TrueCounts
9 | drp <- 'Dropout' %in% names(assays(sim))
10 | if (drp) {
11 | dropout <- assays(sim)$Dropout
12 | mode(dropout) <- 'integer'
13 | }
14 | cellinfo <- colData(sim)
15 | geneinfo <- rowData(sim)
16 |
17 | # save count matrices
18 | write.table(counts, paste0(dir, '/counts.tsv'),
19 | sep='\t', row.names=T, col.names=T, quote=F)
20 | write.table(truecounts, paste0(dir, '/info_truecounts.tsv'),
21 | sep='\t', row.names=T, col.names=T, quote=F)
22 |
23 | if (drp) {
24 | # save ground truth dropout labels
25 | write.table(dropout, paste0(dir, '/info_dropout.tsv'),
26 | sep='\t', row.names=T, col.names=T, quote=F)
27 | }
28 |
29 | # save metadata
30 | write.table(cellinfo, paste0(dir, '/info_cellinfo.tsv'), sep='\t',
31 | row.names=F, quote=F)
32 | write.table(geneinfo, paste0(dir, '/info_geneinfo.tsv'), sep='\t',
33 | row.names=F, quote=F)
34 |
35 | saveRDS(sim, paste0(dir, '/sce.rds'))
36 | }
37 |
38 |
39 | for (dropout in c(0, 1, 3, 5)) {
40 | for (ngroup in c(1, 2, 3, 6)) {
41 | for(swap in c(F, T)) {
42 |
43 | nGenes <- 200
44 | batchCells <- 2000
45 |
46 | if (swap) {
47 | tmp <- nGenes
48 | nGenes <- batchCells
49 | batchCells <- tmp
50 | }
51 |
52 | # split nCells into roughly ngroup groups
53 | if(ngroup==1) {
54 | group.prob <- 1
55 | } else {
56 | group.prob <- rep(1, ngroup)/ngroup
57 | }
58 | method <- ifelse(ngroup == 1, 'single', 'groups')
59 |
60 | dirname <- paste0('real/group', ngroup, '/dropout', dropout, ifelse(swap, '/swap', ''))
61 | if (!dir.exists(dirname))
62 | dir.create(dirname, showWarnings=F, recursive=T)
63 |
64 | #### Estimate parameters from the real dataset
65 | data(sc_example_counts)
66 | params <- splatEstimate(sc_example_counts)
67 |
68 | # simulate scRNA data
69 | sim <- splatSimulate(params, group.prob=group.prob, nGenes=nGenes,
70 | dropout.present=(dropout!=0), dropout.shape=-1,
71 | dropout.mid=dropout, seed=42, method=method,
72 | bcv.common=1) # limit disp to get fewer true zeros
73 | save.sim(sim, dirname)
74 |
75 |
76 | dirname <- paste0('sim/group', ngroup, '/dropout', dropout, ifelse(swap, '/swap', ''))
77 | if (!dir.exists(dirname))
78 | dir.create(dirname, showWarnings=F, recursive=T)
79 |
80 | #### Simulate data without using real data
81 | sim <- splatSimulate(group.prob=group.prob, nGenes=nGenes, batchCells=batchCells,
82 | dropout.present=(dropout!=0), method=method,
83 | seed=42, dropout.shape=-1, dropout.mid=dropout)
84 | save.sim(sim, dirname)
85 | }
86 | }
87 | }
88 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
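# NOTE: the 'dca' console script declared in entry_points below is the command invoked by reproducibility/code/ImputeUsingDCA.sh.
4 |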
name='DCA', 5 | version='0.3.3', 6 | description='Count autoencoder for scRNA-seq denoising', 7 | author='Gokcen Eraslan', 8 | author_email="gokcen.eraslan@gmail.com", 9 | packages=['dca'], 10 | install_requires=['numpy>=1.7', 11 | 'keras>=2.4,<2.6', 12 | 'tensorflow>=2.0,<2.5', 13 | 'h5py', 14 | 'six>=1.10.0', 15 | 'scikit-learn', 16 | 'scanpy', 17 | 'kopt', 18 | 'pandas' #for preprocessing 19 | ], 20 | url='https://github.com/theislab/dca', 21 | entry_points={ 22 | 'console_scripts': [ 23 | 'dca = dca.__main__:main' 24 | ]}, 25 | license='Apache License 2.0', 26 | classifiers=['License :: OSI Approved :: Apache Software License', 27 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 28 | 'Programming Language :: Python :: 3.5'], 29 | ) 30 | --------------------------------------------------------------------------------
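Quick start. With the package installed (e.g. pip install . from the repository root), the console script defined in setup.py makes denoising a one-liner, exactly as used in reproducibility/code/ImputeUsingDCA.sh: dca input_counts.csv output_dir. For use inside Python, a minimal sketch follows; it assumes that dca/api.py exposes a dca() function taking an AnnData object of raw counts (cells as rows) and replacing adata.X with the denoised means in place — treat the exact signature and the file paths as illustrative, not authoritative.

import scanpy as sc
from dca.api import dca  # assumed entry point, based on the dca/api.py module above

# Read a raw (unnormalized) count matrix with cells as rows and genes as columns;
# the repository's example CSVs store genes in rows, so transpose those first.
adata = sc.read_csv("input_counts.csv")

# Drop all-zero genes: the model fits a per-gene noise distribution.
sc.pp.filter_genes(adata, min_counts=1)

# Assumed call: trains the count autoencoder and writes denoised means back to adata.X.
dca(adata)

adata.write("denoised_counts.h5ad")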