├── .DS_Store ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── .DS_Store ├── .ipynb_checkpoints │ ├── scGeneFit_example-checkpoint.ipynb │ ├── scGeneFit_functional_groups-checkpoint.ipynb │ └── scGeneFit_large_scale-checkpoint.ipynb ├── plot.pdf ├── scGeneFit_example.ipynb ├── scGeneFit_functional_groups.ipynb └── scGeneFit_large_scale.ipynb ├── imgs ├── .DS_Store ├── output_11_1.png ├── output_14_1.png ├── output_17_1.png ├── output_20_1.png ├── output_25_1.png ├── output_31_1.png ├── output_34_1.png ├── output_34_2.png ├── output_34_3.png ├── output_34_4.png ├── output_34_5.png ├── output_34_6.png ├── output_34_7.png └── output_38_1.png ├── scGeneFit ├── .DS_Store ├── __init__.py ├── data_files │ ├── .DS_Store │ ├── CITEseq-labels.mat │ ├── CITEseq.mat │ ├── CITEseq_names.mat │ ├── __init__.py │ ├── data source │ ├── zeisel_data.mat │ ├── zeisel_labels1.mat │ ├── zeisel_labels2.mat │ └── zeisel_names.mat └── functions.py └── setup.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Bianca Dumitrascu, Soledad Villar, Dustin Mixon, Barbara Engelhardt 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include scGeneFit/data_files/CITEseq_names.mat 3 | include scGeneFit/data_files/CITEseq-labels.mat 4 | include scGeneFit/data_files/CITEseq.mat 5 | include scGeneFit/data_files/CITseq.mat 6 | include scGeneFit/data_files/zeisel_data.mat 7 | include scGeneFit/data_files/zeisel_labels1.mat 8 | include scGeneFit/data_files/zeisel_labels2.mat 9 | include scGeneFit/data_files/zeisel_names.mat 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scGeneFit 2 | 3 | Python code for genetic marker selection using linear programming. 4 | 5 | The algorithm is described in https://www.biorxiv.org/content/10.1101/599654v1 6 | 7 | Dependencies: numpy, matplotlib, scipy, sklearn. 8 | 9 | Examples and source code: https://github.com/solevillar/scGeneFit-python 10 | 11 | The package main function is scGeneFit.functions.get_markers() 12 | 13 | get_markers(data, labels, num_markers, method='centers', epsilon=1, sampling_rate=1, n_neighbors=3, max_constraints=1000, redundancy=0.01, verbose=True) 14 | 15 | 16 | - data: Nxd numpy array with point coordinates, N: number of points, d: dimension 17 | - labels: list with labels (N labels, one per point) 18 | - num_markers: target number of markers to select (num_markersDelta, where Delta is chosen to be epsilon times the norm of the smallest constraint (default 1) 24 | **This is the most important parameter in this problem, it determines the scale of the constraints, the rest the rest of the parameters only determine the size of the LP to adapt to limited computational resources. We include a function that finds the optimal value of epsilon given a classifier and a training/test set. We provide an example of the optimization in scGeneFit_functional_groups.ipynb** 25 | - sampling_rate: (if method=='pairwise' or 'pairwise_centers') selects constraints from a random sample of proportion sampling_rate (default 1) 26 | - n_neighbors: (if method=='pairwise') chooses the constraints from n_neighbors nearest neighbors (default 3) 27 | - max_constraints: maximum number of constraints to consider (default 1000) 28 | - redundancy: (if method=='centers') in this case not all pairwise constraints are considered but just between centers of consecutive labels plus a random fraction of constraints given by redundancy. If redundancy==1 all constraints between pairs of centers are considered 29 | - verbose: whether it prints information like size of the LP or elapsed time (default True) 30 | 31 | 32 | 33 | ```python 34 | from scGeneFit.functions import * 35 | 36 | %matplotlib inline 37 | import numpy as np 38 | np.random.seed(0) 39 | ``` 40 | 41 | 42 | ```python 43 | 44 | ``` 45 | 46 | #### Auxiliary functions 47 | 48 | 49 | ```python 50 | from sklearn.neighbors import NearestCentroid 51 | clf=NearestCentroid() 52 | 53 | def performance(X_train, y_train, X_test, y_test, clf): 54 | clf.fit(X_train, y_train) 55 | return clf.score(X_test, y_test) 56 | ``` 57 | 58 | # CITEseq example 59 | 60 | Data included in package, from 61 | 62 | [1] Marlon Stoeckius, Christoph Hafemeister, William Stephenson, Brian Houck-Loomis, Pratip K Chattopadhyay, Harold Swerdlow, Rahul Satija, and Peter Smibert. 63 | Simultaneous epitope and transcriptome measurement insingle cells. Nature Methods, 14(9):865, 2017. 64 | 65 | 66 | ```python 67 | #load data from files 68 | [data, labels, names]= load_example_data("CITEseq") 69 | N,d=data.shape 70 | ``` 71 | 72 | ## Use of scGeneFit (center based constraints) 73 | 74 | 75 | ```python 76 | num_markers=25 77 | method='centers' 78 | redundancy=0.25 79 | 80 | markers= get_markers(data, labels, num_markers, method=method, redundancy=redundancy) 81 | 82 | accuracy=performance(data, labels, data, labels, clf) 83 | accuracy_markers=performance(data[:,markers], labels, data[:,markers], labels, clf) 84 | 85 | print("Accuracy (whole data,", d, " markers): ", accuracy) 86 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 87 | ``` 88 | 89 | Solving a linear program with 500 variables and 45 constraints 90 | Time elapsed: 0.3295409679412842 seconds 91 | Accuracy (whole data, 500 markers): 0.8660786816757572 92 | Accuracy (selected 25 markers) 0.7863525588952072 93 | 94 | 95 | 96 | ```python 97 | #TSNE plot 98 | a=plot_marker_selection(data, markers, names) 99 | ``` 100 | 101 | Computing TSNE embedding 102 | Elapsed time: 117.06255102157593 seconds 103 | 104 | 105 | 106 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_11_1.png) 107 | 108 | 109 | ## Use of scGeneFit (pairwise distance constraints) 110 | 111 | 112 | ```python 113 | num_markers=25 114 | method='pairwise' 115 | sampling_rate=0.1 #use 10 percent of the data to generate constraints 116 | n_neighbors=3 #3 constraints per point 117 | epsilon=1 #Delta is 10*norm of the smallest constraint 118 | max_constraints=1000 #use at most 1000 constraints (for efficiency) 119 | 120 | markers= get_markers(data, labels, num_markers, method=method, sampling_rate=sampling_rate, 121 | n_neighbors=n_neighbors, epsilon=epsilon, max_constraints=max_constraints) 122 | 123 | accuracy=performance(data, labels, data, labels, clf) 124 | accuracy_markers=performance(data[:,markers], labels, data[:,markers], labels, clf) 125 | 126 | print("Accuracy (whole data,", d, " markers): ", accuracy) 127 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 128 | ``` 129 | 130 | Solving a linear program with 500 variables and 1000 constraints 131 | Time elapsed: 6.737841844558716 seconds 132 | Accuracy (whole data, 500 markers): 0.8660786816757572 133 | Accuracy (selected 25 markers) 0.7710340025530927 134 | 135 | 136 | 137 | ```python 138 | #TSNE plot 139 | a=plot_marker_selection(data, markers, names) 140 | ``` 141 | 142 | Computing TSNE embedding 143 | Elapsed time: 118.96086025238037 seconds 144 | 145 | 146 | 147 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_14_1.png) 148 | 149 | 150 | ## Use of scGeneFit (pairwise center based constraints) 151 | 152 | 153 | ```python 154 | num_markers=25 155 | method='pairwise_centers' 156 | sampling_rate=0.1 #use 10 percent of the data to generate constraints 157 | n_neighbors=0 #neighbors are not used for the center constraints 158 | epsilon=10 #Delta is 10*norm of the smallest constraint 159 | max_constraints=1000 #use at most 5000 constraints (for efficiency) 160 | 161 | markers= get_markers(data, labels, num_markers, method=method, 162 | sampling_rate=sampling_rate, n_neighbors=n_neighbors, epsilon=epsilon, 163 | max_constraints=max_constraints) 164 | 165 | accuracy=performance(data, labels, data, labels, clf) 166 | accuracy_markers=performance(data[:,markers], labels, data[:,markers], labels, clf) 167 | 168 | print("Accuracy (whole data,", d, " markers): ", accuracy) 169 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 170 | ``` 171 | 172 | Solving a linear program with 500 variables and 1000 constraints 173 | Time elapsed: 4.070271015167236 seconds 174 | Accuracy (whole data, 500 markers): 0.8660786816757572 175 | Accuracy (selected 25 markers) 0.7864686085644655 176 | 177 | 178 | 179 | ```python 180 | #TSNE plot 181 | a=plot_marker_selection(data, markers, names) 182 | ``` 183 | 184 | Computing TSNE embedding 185 | Elapsed time: 118.61988186836243 seconds 186 | 187 | 188 | 189 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_17_1.png) 190 | 191 | 192 | ## One vs all markers 193 | 194 | 195 | ```python 196 | markers2=one_vs_all_selection(data,labels) 197 | 198 | accuracy=performance(data, labels, data, labels, clf) 199 | accuracy_markers=performance(data[:,markers2], labels, data[:,markers2], labels, clf) 200 | 201 | print("Accuracy (whole data,", d, " markers): ", accuracy) 202 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 203 | ``` 204 | 205 | Accuracy (whole data, 500 markers): 0.8660786816757572 206 | Accuracy (selected 25 markers) 0.7537426018335848 207 | 208 | 209 | 210 | ```python 211 | a=plot_marker_selection(data, markers2, names) 212 | ``` 213 | 214 | Computing TSNE embedding 215 | Elapsed time: 115.60354685783386 seconds 216 | 217 | 218 | 219 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_20_1.png) 220 | 221 | 222 | # Zeisel example 223 | Zeisel data included in package, from 224 | 225 | [2] Amit Zeisel, Ana B Munoz-Manchado, Simone Codeluppi, Peter Lonnerberg, Gioele La Manno, Anna Jureus, Sueli Marques, Hermany Munguba, Liqun He, Christer Betsholtz, et al. 226 | Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq. Science, 347(6226):1138–1142, 2015. 227 | 228 | This example exhibits a hierarchical clustering structure. We use the function get_markers_hierarchy that takes the hierarchical structure into consideration to select the constraints. 229 | 230 | 231 | 232 | ```python 233 | #load data from file 234 | [data, labels, names]=load_example_data("zeisel") 235 | N,d=data.shape 236 | ``` 237 | 238 | ## Use of scGeneFit (center based constraints) 239 | 240 | 241 | ```python 242 | num_markers=25 243 | method='centers' 244 | redundancy=0.1 245 | 246 | markers= get_markers_hierarchy(data, labels, num_markers, method=method, redundancy=redundancy) 247 | 248 | accuracy=performance(data, labels[0], data, labels[0], clf) 249 | accuracy_markers=performance(data[:,markers], labels[0], data[:,markers], labels[0], clf) 250 | 251 | print("Accuracy (whole data,", d, " markers): ", accuracy) 252 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 253 | 254 | ``` 255 | 256 | Solving a linear program with 4000 variables and 96 constraints 257 | Time elapsed: 67.69524931907654 seconds 258 | Accuracy (whole data, 4000 markers): 0.8745424292845257 259 | Accuracy (selected 25 markers) 0.8861896838602329 260 | 261 | 262 | 263 | ```python 264 | #TSNE plot 265 | a=plot_marker_selection(data, markers, names[0]) 266 | ``` 267 | 268 | Computing TSNE embedding 269 | Elapsed time: 71.29064297676086 seconds 270 | 271 | 272 | 273 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_25_1.png) 274 | 275 | 276 | ## Use of scGeneFit (pairwise distance constraints) 277 | 278 | 279 | ```python 280 | num_markers=25 281 | method='pairwise' 282 | sampling_rate=0.05 #use 5 percent of the data to generate constraints 283 | n_neighbors=3 #3 constraints per point 284 | epsilon=10 #Delta is 10*norm of the smallest constraint 285 | max_constraints=500 #use at most 500 constraints (for efficiency) 286 | use_centers=False #constraints given by pairwise distances 287 | 288 | markers= get_markers_hierarchy(data, labels, num_markers, method=method, 289 | sampling_rate=sampling_rate, n_neighbors=n_neighbors, epsilon=epsilon) 290 | 291 | accuracy=performance(data, labels[0], data, labels[0], clf) 292 | accuracy_markers=performance(data[:,markers], labels[0], data[:,markers], labels[0], clf) 293 | 294 | print("Accuracy (whole data,", d, " markers): ", accuracy) 295 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 296 | 297 | ``` 298 | 299 | Solving a linear program with 4000 variables and 1000 constraints 300 | Time elapsed: 40.95984506607056 seconds 301 | Accuracy (whole data, 4000 markers): 0.8745424292845257 302 | Accuracy (selected 25 markers) 0.8435940099833611 303 | 304 | 305 | 306 | ```python 307 | 308 | ``` 309 | 310 | ## Use of scGeneFit (pairwise center based constraints) 311 | 312 | 313 | ```python 314 | num_markers=25 315 | method='pairwise_centers' 316 | sampling_rate=0.05 #use 5 percent of the data to generate constraints 317 | n_neighbors=0 #neighbors are not used for the center constraints 318 | epsilon=10 #Delta is 10*norm of the smallest constraint 319 | max_constraints=500 #use at most 500 constraints (for efficiency) 320 | use_centers=True #constraints given by pairwise distances 321 | 322 | markers = get_markers_hierarchy(data, labels, num_markers, method=method, 323 | sampling_rate=sampling_rate, n_neighbors=n_neighbors, epsilon=epsilon) 324 | 325 | accuracy=performance(data, labels[0], data, labels[0], clf) 326 | accuracy_markers=performance(data[:,markers], labels[0], data[:,markers], labels[0], clf) 327 | 328 | print("Accuracy (whole data,", d, " markers): ", accuracy) 329 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 330 | ``` 331 | 332 | Solving a linear program with 4000 variables and 1000 constraints 333 | Time elapsed: 168.19283509254456 seconds 334 | Accuracy (whole data, 4000 markers): 0.8745424292845257 335 | Accuracy (selected 25 markers) 0.9237936772046589 336 | 337 | 338 | 339 | ```python 340 | #TSNE plot 341 | a=plot_marker_selection(data, markers, names[0]) 342 | ``` 343 | 344 | Computing TSNE embedding 345 | Elapsed time: 69.88537192344666 seconds 346 | 347 | 348 | 349 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_31_1.png) 350 | 351 | 352 | 353 | ```python 354 | 355 | ``` 356 | 357 | ### Example from second level of the hierarchy 358 | 359 | 360 | ```python 361 | for name in set(names[0]): 362 | idx=[s for s in range(len(names[0])) if names[0][s]==name] 363 | aux=plot_marker_selection(data[idx], markers, [names[1][s] for s in idx]) 364 | ``` 365 | 366 | Computing TSNE embedding 367 | Elapsed time: 8.925884008407593 seconds 368 | Computing TSNE embedding 369 | Elapsed time: 1.5634911060333252 seconds 370 | Computing TSNE embedding 371 | Elapsed time: 0.638862133026123 seconds 372 | Computing TSNE embedding 373 | Elapsed time: 7.366800785064697 seconds 374 | Computing TSNE embedding 375 | Elapsed time: 1.0175950527191162 seconds 376 | Computing TSNE embedding 377 | Elapsed time: 2.3961689472198486 seconds 378 | Computing TSNE embedding 379 | Elapsed time: 1.0149860382080078 seconds 380 | 381 | 382 | 383 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_1.png) 384 | 385 | 386 | 387 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_2.png) 388 | 389 | 390 | 391 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_3.png) 392 | 393 | 394 | 395 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_4.png) 396 | 397 | 398 | 399 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_5.png) 400 | 401 | 402 | 403 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_6.png) 404 | 405 | 406 | 407 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_34_7.png) 408 | 409 | 410 | 411 | ```python 412 | 413 | ``` 414 | 415 | ## One vs all markers 416 | 417 | 418 | ```python 419 | markers2=one_vs_all_selection(data,labels[0]) 420 | 421 | accuracy=performance(data, labels[0], data, labels[0], clf) 422 | accuracy_markers=performance(data[:,markers2], labels[0], data[:,markers2], labels[0], clf) 423 | 424 | print("Accuracy (whole data,", d, " markers): ", accuracy) 425 | print("Accuracy (selected", num_markers, "markers)", accuracy_markers) 426 | ``` 427 | 428 | Accuracy (whole data, 4000 markers): 0.8745424292845257 429 | Accuracy (selected 25 markers) 0.8569051580698835 430 | 431 | 432 | 433 | ```python 434 | a=plot_marker_selection(data, markers2, names[0]) 435 | ``` 436 | 437 | Computing TSNE embedding 438 | Elapsed time: 69.53578591346741 seconds 439 | 440 | 441 | 442 | ![png](https://raw.githubusercontent.com/solevillar/scGeneFit-python/master/imgs/output_38_1.png) 443 | 444 | 445 | 446 | ```python 447 | 448 | ``` 449 | -------------------------------------------------------------------------------- /examples/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/examples/.DS_Store -------------------------------------------------------------------------------- /examples/.ipynb_checkpoints/scGeneFit_large_scale-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "colab": { 18 | "base_uri": "https://localhost:8080/", 19 | "height": 71 20 | }, 21 | "colab_type": "code", 22 | "id": "4mgHnCPFcOH9", 23 | "outputId": "c1d5321f-bbb3-4ce0-bf18-d8e74886513a" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib inline\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import itertools\n", 31 | "from scGeneFit.functions import *\n", 32 | "np.random.seed(0)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "colab": {}, 40 | "colab_type": "code", 41 | "id": "qyu6Z11fcUtF" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "#Definition of functional groups\n", 46 | "#f_groups={functional_group_id: [mean1, mean2, ...], ...} where mean=[gene1,gene2, ...]\n", 47 | "f_groups={0:[[0,0], [1,0], [0,1], [1,1]], 1: [[0],[1]], 2: [[0,0], [0,1], [1,0], [1,1]], 3:[[0,0], [1,0], [0,1], [1,1]], 4: [[0],[1]], \n", 48 | " 5: [[0,0], [0,1], [1,0], [1,1]], 6:[[0,1], [1,0], [0,1], [1,1]], 7: [[2,2],[3,3]], 8: [[0,0], [0,4], [4,0], [4,4]],\n", 49 | " 9:[[0,0], [1,0], [0,1], [1,1]], 10: [[0],[1],[2],[4]], 11: [[0,0], [0,1], [1,0], [1,1]],\n", 50 | " 12:[[0,0], [1,0], [0,1], [1,1]], 13: [[0],[1]], 14: [[0,0], [0,1], [1,0], [1,1]]}\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "colab": {}, 58 | "colab_type": "code", 59 | "id": "7VVDyFMAdmXA" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "def create_data(f_groups, cell_types, sigma, n, repeat):\n", 64 | " aux=list(f_groups.keys())\n", 65 | " data=np.zeros( (0, sum([len(f_groups[aux[i]][0])*repeat[i] for i in range(len(aux))]) ))\n", 66 | " for cell in cell_types:\n", 67 | " expression=np.zeros((n, 0))\n", 68 | " for t in range(len(cell)):\n", 69 | " mean=f_groups[t][cell[t]]\n", 70 | " mean=np.concatenate([[mean[i]]*repeat[t] for i in range(len(mean))])\n", 71 | " expression=np.concatenate([expression, np.random.multivariate_normal(mean, sigma*np.identity(len(mean)), size=n)], axis=1)\n", 72 | " data=np.concatenate([data, expression])\n", 73 | " labels=np.concatenate([[i]*n for i in range(len(cell_types))])\n", 74 | " return data,labels" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "colab": {}, 82 | "colab_type": "code", 83 | "id": "BcqQ7aNMNQ6Y" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "#create data\n", 88 | "classes=40\n", 89 | "n=25\n", 90 | "n_groups=len(list(f_groups.keys()))\n", 91 | "cell_types = np.zeros((classes, n_groups))\n", 92 | "for i in range(n_groups):\n", 93 | " aux=np.random.permutation(classes)\n", 94 | " idx=list(f_groups.keys())[i]\n", 95 | " n_means=len(f_groups[idx])\n", 96 | " start=0\n", 97 | " offset=int(classes/n_means)\n", 98 | " for s in range(n_means):\n", 99 | " cell_types[aux[start:start+offset], i] = s\n", 100 | " start=start+offset+1\n", 101 | "cell_types=cell_types.astype(int)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": { 108 | "colab": { 109 | "base_uri": "https://localhost:8080/", 110 | "height": 134 111 | }, 112 | "colab_type": "code", 113 | "id": "7PbXSbXY1ia0", 114 | "outputId": "d543d938-4523-479f-9720-a06eb1272751" 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Text(0, 0.5, 'synthetic cells')" 121 | ] 122 | }, 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "image/png": "\n", 130 | "text/plain": [ 131 | "
" 132 | ] 133 | }, 134 | "metadata": { 135 | "needs_background": "light" 136 | }, 137 | "output_type": "display_data" 138 | } 139 | ], 140 | "source": [ 141 | "repeat=[10,10,10,10,10]+[np.random.randint(100, 800) for i in range(len(f_groups.keys())-5)]\n", 142 | "\n", 143 | "sc_gene, sc_labels=create_data(f_groups, cell_types, 0.5, n, repeat)\n", 144 | "plt.imshow(sc_gene)\n", 145 | "plt.xlabel('synthetic genes')\n", 146 | "plt.ylabel('synthetic cells')" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": { 153 | "colab": { 154 | "base_uri": "https://localhost:8080/", 155 | "height": 34 156 | }, 157 | "colab_type": "code", 158 | "id": "G_DEw3soIhOw", 159 | "outputId": "4a8141fa-7295-4f7f-a2f4-d49bcbbd4bab" 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "(1000, 8798)\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "#create data\n", 172 | "print(sc_gene.shape)\n", 173 | "X_train, y_train= sc_gene, sc_labels\n", 174 | "X_test, y_test= X_train, y_train\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": { 181 | "colab": {}, 182 | "colab_type": "code", 183 | "id": "JuCRszrAjyMc" 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "#Choose an evaluation method (e.g. classification accuracy)\n", 188 | "from sklearn.neighbors import NearestCentroid\n", 189 | "clf=NearestCentroid()\n", 190 | "\n", 191 | "def performance(X_train, y_train, X_test, y_test, clf):\n", 192 | " clf.fit(X_train, y_train)\n", 193 | " return clf.score(X_test, y_test)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 8, 199 | "metadata": { 200 | "colab": { 201 | "base_uri": "https://localhost:8080/", 202 | "height": 34 203 | }, 204 | "colab_type": "code", 205 | "id": "JP7aMd9QuBDP", 206 | "outputId": "249774f5-9fc4-4ea2-8580-4c18a75f2250" 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "0.775\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "markers_ova=one_vs_all_selection(X_train,y_train)\n", 219 | "one_vs_all_accuracy=performance(X_train[:,markers_ova], y_train, X_test[:,markers_ova], y_test, clf)\n", 220 | "print(one_vs_all_accuracy)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": { 227 | "colab": { 228 | "base_uri": "https://localhost:8080/", 229 | "height": 323 230 | }, 231 | "colab_type": "code", 232 | "id": "gFWlEWQUIhhZ", 233 | "outputId": "bd7805e9-192c-4c1d-f4e5-62eb626669ba" 234 | }, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Solving a linear program with 8798 variables and 43 constraints\n", 241 | "Time elapsed: 681.2142729759216 seconds\n", 242 | "markers: 25 accuracy: 0.623\n", 243 | "Solving a linear program with 8798 variables and 41 constraints\n", 244 | "Time elapsed: 908.7140011787415 seconds\n", 245 | "markers: 30 accuracy: 0.657\n", 246 | "Solving a linear program with 8798 variables and 40 constraints\n", 247 | "Time elapsed: 472.49677991867065 seconds\n", 248 | "markers: 35 accuracy: 0.782\n", 249 | "Solving a linear program with 8798 variables and 43 constraints\n", 250 | "Time elapsed: 451.2964618206024 seconds\n", 251 | "markers: 40 accuracy: 0.851\n", 252 | "Solving a linear program with 8798 variables and 42 constraints\n", 253 | "Time elapsed: 467.5282349586487 seconds\n", 254 | "markers: 45 accuracy: 0.912\n", 255 | "Solving a linear program with 8798 variables and 41 constraints\n", 256 | "Time elapsed: 444.65846705436707 seconds\n", 257 | "markers: 50 accuracy: 0.908\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "m_range=range(25,55,5)\n", 263 | "\n", 264 | "#obtain markers\n", 265 | "#one vs all:\n", 266 | "opt_epsilon=[0.05 for i in m_range]\n", 267 | "\n", 268 | "\n", 269 | "#scGeneFit\n", 270 | "markers_lp=[]\n", 271 | "accuracy_list=[]\n", 272 | "for m, epsilon in zip(m_range, opt_epsilon):\n", 273 | " aux=get_markers(X_train, y_train, m, method='centers', redundancy=.002, epsilon=epsilon)\n", 274 | " markers_lp= markers_lp + [aux]\n", 275 | " \n", 276 | " accuracy = performance(X_train[:,aux], y_train, X_test[:,aux], y_test, clf)\n", 277 | " print(\"markers:\", m, \"accuracy:\", accuracy)\n", 278 | " accuracy_list+=[accuracy]\n", 279 | " " 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 10, 285 | "metadata": { 286 | "colab": { 287 | "base_uri": "https://localhost:8080/", 288 | "height": 313 289 | }, 290 | "colab_type": "code", 291 | "id": "DiR-DQnF5cGE", 292 | "outputId": "cd167e0e-0bcd-4a0a-9263-6c5e6cbcc9e4" 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "image/png": "\n", 298 | "text/plain": [ 299 | "
" 300 | ] 301 | }, 302 | "metadata": { 303 | "needs_background": "light" 304 | }, 305 | "output_type": "display_data" 306 | } 307 | ], 308 | "source": [ 309 | "l=accuracy_list\n", 310 | "\n", 311 | "#plot\n", 312 | "plt.plot(m_range, l)+plt.plot(m_range,[one_vs_all_accuracy for i in l])\n", 313 | "plt.legend([\"scGeneFit\", \"one vs all (40 markers)\"])\n", 314 | "plt.xlabel(\"number of markers\")\n", 315 | "plt.ylabel(\"accuracy\")\n", 316 | "plt.savefig(\"plot.pdf\")\n", 317 | "\n" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "colab": { 323 | "authorship_tag": "ABX9TyPCgLeiVLppCy8MLEQwaxj9", 324 | "collapsed_sections": [], 325 | "include_colab_link": true, 326 | "machine_shape": "hm", 327 | "name": "scGeneFit_large_scale.ipynb", 328 | "provenance": [] 329 | }, 330 | "kernelspec": { 331 | "display_name": "Python 3 (ipykernel)", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.11.5" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 1 350 | } 351 | -------------------------------------------------------------------------------- /examples/plot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/examples/plot.pdf -------------------------------------------------------------------------------- /examples/scGeneFit_large_scale.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "colab": { 18 | "base_uri": "https://localhost:8080/", 19 | "height": 71 20 | }, 21 | "colab_type": "code", 22 | "id": "4mgHnCPFcOH9", 23 | "outputId": "c1d5321f-bbb3-4ce0-bf18-d8e74886513a" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib inline\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import itertools\n", 31 | "from scGeneFit.functions import *\n", 32 | "np.random.seed(0)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "colab": {}, 40 | "colab_type": "code", 41 | "id": "qyu6Z11fcUtF" 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "#Definition of functional groups\n", 46 | "#f_groups={functional_group_id: [mean1, mean2, ...], ...} where mean=[gene1,gene2, ...]\n", 47 | "f_groups={0:[[0,0], [1,0], [0,1], [1,1]], 1: [[0],[1]], 2: [[0,0], [0,1], [1,0], [1,1]], 3:[[0,0], [1,0], [0,1], [1,1]], 4: [[0],[1]], \n", 48 | " 5: [[0,0], [0,1], [1,0], [1,1]], 6:[[0,1], [1,0], [0,1], [1,1]], 7: [[2,2],[3,3]], 8: [[0,0], [0,4], [4,0], [4,4]],\n", 49 | " 9:[[0,0], [1,0], [0,1], [1,1]], 10: [[0],[1],[2],[4]], 11: [[0,0], [0,1], [1,0], [1,1]],\n", 50 | " 12:[[0,0], [1,0], [0,1], [1,1]], 13: [[0],[1]], 14: [[0,0], [0,1], [1,0], [1,1]]}\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "colab": {}, 58 | "colab_type": "code", 59 | "id": "7VVDyFMAdmXA" 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "def create_data(f_groups, cell_types, sigma, n, repeat):\n", 64 | " aux=list(f_groups.keys())\n", 65 | " data=np.zeros( (0, sum([len(f_groups[aux[i]][0])*repeat[i] for i in range(len(aux))]) ))\n", 66 | " for cell in cell_types:\n", 67 | " expression=np.zeros((n, 0))\n", 68 | " for t in range(len(cell)):\n", 69 | " mean=f_groups[t][cell[t]]\n", 70 | " mean=np.concatenate([[mean[i]]*repeat[t] for i in range(len(mean))])\n", 71 | " expression=np.concatenate([expression, np.random.multivariate_normal(mean, sigma*np.identity(len(mean)), size=n)], axis=1)\n", 72 | " data=np.concatenate([data, expression])\n", 73 | " labels=np.concatenate([[i]*n for i in range(len(cell_types))])\n", 74 | " return data,labels" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "colab": {}, 82 | "colab_type": "code", 83 | "id": "BcqQ7aNMNQ6Y" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "#create data\n", 88 | "classes=40\n", 89 | "n=25\n", 90 | "n_groups=len(list(f_groups.keys()))\n", 91 | "cell_types = np.zeros((classes, n_groups))\n", 92 | "for i in range(n_groups):\n", 93 | " aux=np.random.permutation(classes)\n", 94 | " idx=list(f_groups.keys())[i]\n", 95 | " n_means=len(f_groups[idx])\n", 96 | " start=0\n", 97 | " offset=int(classes/n_means)\n", 98 | " for s in range(n_means):\n", 99 | " cell_types[aux[start:start+offset], i] = s\n", 100 | " start=start+offset+1\n", 101 | "cell_types=cell_types.astype(int)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": { 108 | "colab": { 109 | "base_uri": "https://localhost:8080/", 110 | "height": 134 111 | }, 112 | "colab_type": "code", 113 | "id": "7PbXSbXY1ia0", 114 | "outputId": "d543d938-4523-479f-9720-a06eb1272751" 115 | }, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Text(0, 0.5, 'synthetic cells')" 121 | ] 122 | }, 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "image/png": "", 130 | "text/plain": [ 131 | "
" 132 | ] 133 | }, 134 | "metadata": {}, 135 | "output_type": "display_data" 136 | } 137 | ], 138 | "source": [ 139 | "repeat=[10,10,10,10,10]+[np.random.randint(100, 800) for i in range(len(f_groups.keys())-5)]\n", 140 | "\n", 141 | "sc_gene, sc_labels=create_data(f_groups, cell_types, 0.5, n, repeat)\n", 142 | "plt.imshow(sc_gene)\n", 143 | "plt.xlabel('synthetic genes')\n", 144 | "plt.ylabel('synthetic cells')" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": { 151 | "colab": { 152 | "base_uri": "https://localhost:8080/", 153 | "height": 34 154 | }, 155 | "colab_type": "code", 156 | "id": "G_DEw3soIhOw", 157 | "outputId": "4a8141fa-7295-4f7f-a2f4-d49bcbbd4bab" 158 | }, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "(1000, 8798)\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "#create data\n", 170 | "print(sc_gene.shape)\n", 171 | "X_train, y_train= sc_gene, sc_labels\n", 172 | "X_test, y_test= X_train, y_train\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 7, 178 | "metadata": { 179 | "colab": {}, 180 | "colab_type": "code", 181 | "id": "JuCRszrAjyMc" 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "#Choose an evaluation method (e.g. classification accuracy)\n", 186 | "from sklearn.neighbors import NearestCentroid\n", 187 | "clf=NearestCentroid()\n", 188 | "\n", 189 | "def performance(X_train, y_train, X_test, y_test, clf):\n", 190 | " clf.fit(X_train, y_train)\n", 191 | " return clf.score(X_test, y_test)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "metadata": { 198 | "colab": { 199 | "base_uri": "https://localhost:8080/", 200 | "height": 34 201 | }, 202 | "colab_type": "code", 203 | "id": "JP7aMd9QuBDP", 204 | "outputId": "249774f5-9fc4-4ea2-8580-4c18a75f2250" 205 | }, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "0.775\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "markers_ova=one_vs_all_selection(X_train,y_train)\n", 217 | "one_vs_all_accuracy=performance(X_train[:,markers_ova], y_train, X_test[:,markers_ova], y_test, clf)\n", 218 | "print(one_vs_all_accuracy)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 9, 224 | "metadata": { 225 | "colab": { 226 | "base_uri": "https://localhost:8080/", 227 | "height": 323 228 | }, 229 | "colab_type": "code", 230 | "id": "gFWlEWQUIhhZ", 231 | "outputId": "bd7805e9-192c-4c1d-f4e5-62eb626669ba" 232 | }, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Solving a linear program with 8798 variables and 43 constraints\n", 239 | "Time elapsed: 1.456153154373169 seconds\n", 240 | "markers: 25 accuracy: 0.623\n", 241 | "Solving a linear program with 8798 variables and 41 constraints\n", 242 | "Time elapsed: 5.00743293762207 seconds\n", 243 | "markers: 30 accuracy: 0.657\n", 244 | "Solving a linear program with 8798 variables and 40 constraints\n", 245 | "Time elapsed: 1.5490071773529053 seconds\n", 246 | "markers: 35 accuracy: 0.782\n", 247 | "Solving a linear program with 8798 variables and 43 constraints\n", 248 | "Time elapsed: 6.25360369682312 seconds\n", 249 | "markers: 40 accuracy: 0.851\n", 250 | "Solving a linear program with 8798 variables and 42 constraints\n", 251 | "Time elapsed: 5.604511976242065 seconds\n", 252 | "markers: 45 accuracy: 0.912\n", 253 | "Solving a linear program with 8798 variables and 41 constraints\n", 254 | "Time elapsed: 5.980199098587036 seconds\n", 255 | "markers: 50 accuracy: 0.908\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "m_range=range(25,55,5)\n", 261 | "\n", 262 | "#obtain markers\n", 263 | "#one vs all:\n", 264 | "opt_epsilon=[0.05 for i in m_range]\n", 265 | "\n", 266 | "\n", 267 | "#scGeneFit\n", 268 | "markers_lp=[]\n", 269 | "accuracy_list=[]\n", 270 | "for m, epsilon in zip(m_range, opt_epsilon):\n", 271 | " aux=get_markers(X_train, y_train, m, method='centers', redundancy=.002, epsilon=epsilon)\n", 272 | " markers_lp= markers_lp + [aux]\n", 273 | " \n", 274 | " accuracy = performance(X_train[:,aux], y_train, X_test[:,aux], y_test, clf)\n", 275 | " print(\"markers:\", m, \"accuracy:\", accuracy)\n", 276 | " accuracy_list+=[accuracy]\n", 277 | " " 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 10, 283 | "metadata": { 284 | "colab": { 285 | "base_uri": "https://localhost:8080/", 286 | "height": 313 287 | }, 288 | "colab_type": "code", 289 | "id": "DiR-DQnF5cGE", 290 | "outputId": "cd167e0e-0bcd-4a0a-9263-6c5e6cbcc9e4" 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "image/png": "", 296 | "text/plain": [ 297 | "
" 298 | ] 299 | }, 300 | "metadata": {}, 301 | "output_type": "display_data" 302 | } 303 | ], 304 | "source": [ 305 | "l=accuracy_list\n", 306 | "\n", 307 | "#plot\n", 308 | "plt.plot(m_range, l)+plt.plot(m_range,[one_vs_all_accuracy for i in l])\n", 309 | "plt.legend([\"scGeneFit\", \"one vs all (40 markers)\"])\n", 310 | "plt.xlabel(\"number of markers\")\n", 311 | "plt.ylabel(\"accuracy\")\n", 312 | "plt.savefig(\"plot.pdf\")\n", 313 | "\n" 314 | ] 315 | } 316 | ], 317 | "metadata": { 318 | "colab": { 319 | "authorship_tag": "ABX9TyPCgLeiVLppCy8MLEQwaxj9", 320 | "collapsed_sections": [], 321 | "include_colab_link": true, 322 | "machine_shape": "hm", 323 | "name": "scGeneFit_large_scale.ipynb", 324 | "provenance": [] 325 | }, 326 | "kernelspec": { 327 | "display_name": "Python 3 (ipykernel)", 328 | "language": "python", 329 | "name": "python3" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 3 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython3", 341 | "version": "3.11.5" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 1 346 | } 347 | -------------------------------------------------------------------------------- /imgs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/.DS_Store -------------------------------------------------------------------------------- /imgs/output_11_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_11_1.png -------------------------------------------------------------------------------- /imgs/output_14_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_14_1.png -------------------------------------------------------------------------------- /imgs/output_17_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_17_1.png -------------------------------------------------------------------------------- /imgs/output_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_20_1.png -------------------------------------------------------------------------------- /imgs/output_25_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_25_1.png -------------------------------------------------------------------------------- /imgs/output_31_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_31_1.png -------------------------------------------------------------------------------- /imgs/output_34_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_1.png -------------------------------------------------------------------------------- /imgs/output_34_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_2.png -------------------------------------------------------------------------------- /imgs/output_34_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_3.png -------------------------------------------------------------------------------- /imgs/output_34_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_4.png -------------------------------------------------------------------------------- /imgs/output_34_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_5.png -------------------------------------------------------------------------------- /imgs/output_34_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_6.png -------------------------------------------------------------------------------- /imgs/output_34_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_34_7.png -------------------------------------------------------------------------------- /imgs/output_38_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/imgs/output_38_1.png -------------------------------------------------------------------------------- /scGeneFit/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/.DS_Store -------------------------------------------------------------------------------- /scGeneFit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/__init__.py -------------------------------------------------------------------------------- /scGeneFit/data_files/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/.DS_Store -------------------------------------------------------------------------------- /scGeneFit/data_files/CITEseq-labels.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/CITEseq-labels.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/CITEseq.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/CITEseq.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/CITEseq_names.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/CITEseq_names.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | import importlib.resources as importlib_resources 4 | except ImportError: 5 | # In PY<3.7 fall-back to backported `importlib_resources`. 6 | import importlib_resources 7 | 8 | def get_data(filename): 9 | with importlib_resources.path(__name__, filename) as foo: 10 | return str(foo) -------------------------------------------------------------------------------- /scGeneFit/data_files/data source: -------------------------------------------------------------------------------- 1 | CITE-seq data from 2 | Marlon Stoeckius, Christoph Hafemeister, William Stephenson, Brian Houck-Loomis, Pratip K Chattopadhyay, Harold Swerdlow, Rahul Satija, and Peter Smibert. 3 | Simultaneous epitope and transcriptome measurement insingle cells. Nature Methods, 14(9):865, 2017. 4 | 5 | Zeisel data from 6 | Amit Zeisel, Ana B Munoz-Manchado, Simone Codeluppi, Peter Lonnerberg, Gioele La Manno, Anna Jureus, Sueli Marques, Hermany Munguba, Liqun He, Christer Betsholtz, et al. 7 | Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq. Science, 347(6226):1138–1142, 2015. 8 | -------------------------------------------------------------------------------- /scGeneFit/data_files/zeisel_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/zeisel_data.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/zeisel_labels1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/zeisel_labels1.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/zeisel_labels2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/zeisel_labels2.mat -------------------------------------------------------------------------------- /scGeneFit/data_files/zeisel_names.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/solevillar/scGeneFit-python/19c8a697519ea3e247916696b8fbec3e86c0a6e8/scGeneFit/data_files/zeisel_names.mat -------------------------------------------------------------------------------- /scGeneFit/functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import scipy 4 | import time 5 | import sklearn 6 | import sklearn.manifold 7 | import scipy.io 8 | from . import data_files 9 | 10 | 11 | 12 | def get_markers(data, labels, num_markers, method='centers', epsilon=1, sampling_rate=1, n_neighbors=3, max_constraints=1000, redundancy=0.01, verbose=True): 13 | """marker selection algorithm 14 | data: Nxd numpy array with point coordinates, N: number of points, d: dimension 15 | labels: list with labels (N labels, one per point) 16 | num_markers: target number of markers to select. num_markersDelta, where Delta is chosen to be epsilon times the norm of the smallest constraint (default 1) 19 | (This is the most important parameter in this problem, it determines the scale of the constraints, 20 | the rest the rest of the parameters only determine the size of the LP) 21 | sampling_rate: (if method=='pairwise' or 'pairwise_centers') selects constraints from a random sample of proportion sampling_rate (default 1) 22 | n_neighbors: (if method=='pairwise') chooses the constraints from n_neighbors nearest neighbors (default 3) 23 | max_constraints: maximum number of constraints to consider (default 1000) 24 | redundancy: (if method=='centers') in this case not all pairwise constraints are considered 25 | but just between centers of consecutive labels plus a random fraction of constraints given by redundancy 26 | if redundancy==1 all constraints between pairs of centers are considered """ 27 | d = data.shape[1] 28 | t = time.time() 29 | samples, samples_labels, idx = __sample(data, labels, sampling_rate) 30 | 31 | if method == 'pairwise_centers': 32 | constraints, smallest_norm = __select_constraints_centers( 33 | data, labels, samples, samples_labels) 34 | elif method == 'pairwise': 35 | constraints, smallest_norm = __select_constraints_pairwise( 36 | data, labels, samples, samples_labels, n_neighbors) 37 | else: 38 | constraints, smallest_norm = __select_constraints_summarized(data, labels, redundancy) 39 | 40 | num_cons = constraints.shape[0] 41 | if num_cons > max_constraints: 42 | p = np.random.permutation(num_cons)[0:max_constraints] 43 | constraints = constraints[p, :] 44 | if verbose: 45 | print('Solving a linear program with {} variables and {} constraints'.format( 46 | constraints.shape[1], constraints.shape[0])) 47 | sol = __lp_markers(constraints, num_markers, smallest_norm * epsilon) 48 | if verbose: 49 | print('Time elapsed: {} seconds'.format(time.time() - t)) 50 | x = sol['x'][0:d] 51 | markers = sorted(range(len(x)), key=lambda i: x[i], reverse=True)[ 52 | : num_markers] 53 | return markers 54 | 55 | 56 | def get_markers_hierarchy(data, labels, num_markers, method='centers', sampling_rate=0.1, n_neighbors=3, epsilon=10, max_constraints=1000, redundancy=0.01, verbose=True): 57 | """marker selection algorithm with hierarchical labels 58 | data: Nxd numpy array with point coordinates, N: number of points, d: dimension 59 | labels: list with T lists of labels, where T is the number of layers in the hierarchy (N labels per list, one per point) 60 | num_markers: target number of markers to select. num_markers max_constraints: 106 | p = np.random.permutation(num_cons)[0:max_constraints] 107 | constraints = constraints[p, :] 108 | if verbose: 109 | print('Solving a linear program with {} variables and {} constraints'.format(constraints.shape[1], constraints.shape[0])) 110 | sol = __lp_markers(constraints, num_markers, smallest_norm * epsilon) 111 | if verbose: 112 | print('Time elapsed: {} seconds'.format(time.time() - t)) 113 | x = sol['x'][0:d] 114 | markers = sorted(range(len(x)), key=lambda i: x[i], reverse=True)[ 115 | : num_markers] 116 | return markers 117 | 118 | 119 | def __sample(data, labels, sampling_rate): 120 | """subsample data""" 121 | indices = [] 122 | for i in set(labels): 123 | idxs = [x for x in range(len(labels)) if labels[x] == i] 124 | n = len(idxs) 125 | s = int(np.ceil(len(idxs) * sampling_rate)) 126 | aux = np.random.permutation(n)[0:s] 127 | indices += [idxs[x] for x in aux] 128 | return [data[i] for i in indices], [labels[i] for i in indices], indices 129 | 130 | 131 | def __select_constraints_summarized(data, labels, redundancy=0.01): 132 | """selects constraints of the form c_a-c_(a+1) where c_i's are the empirical centers of different classes""" 133 | constraints = [] 134 | centers = {} 135 | smallest_norm = np.inf 136 | labels_set = list(set(labels)) 137 | k = len(labels_set) 138 | for idx in labels_set: 139 | X = [data[x, :] for x in range(len(labels)) if labels[x] == idx] 140 | centers[idx] = np.array(X).mean(axis=0) 141 | for i in range(len(labels_set)): 142 | v = centers[labels_set[i]]-centers[labels_set[(i+1) % k]] 143 | constraints += [v] 144 | if np.linalg.norm(v) ** 2 < smallest_norm: 145 | smallest_norm = np.linalg.norm(v) ** 2 146 | for j in range(len(labels_set)): 147 | if j != i and j != (i+1) % k: 148 | if np.random.rand() < redundancy: 149 | v = centers[labels_set[j]]-centers[labels_set[(j+1) % k]] 150 | constraints += [v] 151 | if np.linalg.norm(v) ** 2 < smallest_norm: 152 | smallest_norm = np.linalg.norm(v) ** 2 153 | constraints = np.array(constraints) 154 | return -constraints * constraints, smallest_norm 155 | 156 | 157 | def __select_constraints_pairwise(data, labels, samples, samples_labels, n_neighbors): 158 | """select constraints of the form x-y where x,y have different labels""" 159 | constraints = [] 160 | # nearest neighbors are selected from the entire set 161 | neighbors = {} 162 | data_by_label = {} 163 | smallest_norm = np.inf 164 | for i in set(labels): 165 | X = [data[x, :] for x in range(len(labels)) if labels[x] == i] 166 | data_by_label[i] = X 167 | neighbors[i] = sklearn.neighbors.NearestNeighbors( 168 | n_neighbors=n_neighbors).fit(np.array(X)) 169 | # compute nearest neighbor for samples 170 | for i in neighbors.keys(): 171 | Y = [samples[x] 172 | for x in range(len(samples_labels)) if samples_labels[x] == i] 173 | for j in neighbors.keys(): 174 | if i != j: 175 | idx = neighbors[j].kneighbors(Y)[1] 176 | for s in range(len(Y)): 177 | for t in idx[s]: 178 | v = Y[s] - data_by_label[j][t] 179 | constraints += [v] 180 | if np.linalg.norm(v) ** 2 < smallest_norm: 181 | smallest_norm = np.linalg.norm(v) ** 2 182 | constraints = np.array(constraints) 183 | return -constraints * constraints, smallest_norm 184 | 185 | 186 | def __select_constraints_centers(data, labels, samples, samples_labels): 187 | """select constraints of the form (x-ct')^2 - (x-ct)^2> Delta^2 y where x belongs to cluster with center ct""" 188 | constraints = [] 189 | # nearest neighbors are selected from the entire set 190 | centers_by_label = {} 191 | smallest_norm = np.inf 192 | for i in set(labels): 193 | X = np.array([data[x, :] 194 | for x in range(len(labels)) if labels[x] == i]) 195 | centers_by_label[i] = np.sum(X, axis=0) / X.shape[0] 196 | # compute nearest neighbor for samples 197 | for p in range(len(samples)): 198 | # distance to it's own center 199 | aux0 = (samples[p] - centers_by_label[samples_labels[p]]) * \ 200 | (samples[p] - centers_by_label[samples_labels[p]]) 201 | for i in set(labels): 202 | if samples_labels[p] != i: 203 | # distance to other centers 204 | aux1 = (samples[p] - centers_by_label[i]) * \ 205 | (samples[p] - centers_by_label[i]) 206 | constraints += [aux0 - aux1] 207 | if np.linalg.norm(aux0 - aux1) < smallest_norm: 208 | smallest_norm = np.linalg.norm(aux0-aux1) 209 | constraints = np.array(constraints) 210 | return constraints, smallest_norm 211 | 212 | 213 | def __lp_markers(constraints, num_markers, epsilon): 214 | m, d = constraints.shape 215 | c = np.concatenate((np.zeros(d), np.ones(m))) 216 | l = np.zeros(d + m) 217 | u = np.concatenate((np.ones(d), np.array([None for i in range(m)]))) 218 | aux1 = np.concatenate((constraints, -np.identity(m)), axis=1) 219 | aux2 = np.concatenate((np.ones((1, d)), np.zeros((1, m))), axis=1) 220 | A = np.concatenate((aux1, aux2), axis=0) 221 | b = np.concatenate((-epsilon * np.ones(m), np.array([num_markers]))) 222 | bounds = [(l[i], u[i]) for i in range(d + m)] 223 | sol = scipy.optimize.linprog(c, A, b, None, None, bounds) 224 | return sol 225 | 226 | 227 | def circles_example(N=30, d=5): 228 | num_markers = 2 229 | X = np.concatenate((np.array([[np.sin(2 * np.pi * i / N), np.cos(2 * np.pi * i / N)] for i in range(N)]), 230 | np.random.random((N, d - 2))), axis=1) 231 | Y = np.concatenate((np.array([[2 * np.sin(2 * np.pi * i / N), 2 * np.cos(2 * np.pi * i / N)] for i in range(N)]), 232 | np.random.random((N, d - 2))), axis=1) 233 | data = np.concatenate((X, Y), axis=0) 234 | labels = np.concatenate((np.zeros(10), np.ones(10))) 235 | fig = plt.figure() 236 | ax = fig.add_subplot(121, projection='3d') 237 | ax.scatter(data[0:N, 0], data[0:N, 1], data[0:N, 2], c='r', marker='o') 238 | ax.scatter(data[N + 1:2 * N, 0], data[N + 1:2 * N, 1], 239 | data[N + 1:2 * N, 2], c='g', marker='x') 240 | plt.show() 241 | sol = get_markers(data, labels, num_markers, 1, 3, 10) 242 | x = sol['x'][0:d] 243 | markers = sorted(range(len(x)), key=lambda i: x[i], reverse=True)[ 244 | :num_markers] 245 | for i in range(d): 246 | if i not in markers: 247 | data[:, i] = np.zeros(2 * N) 248 | ax2 = fig.add_subplot(122, projection='3d') 249 | ax2.scatter(data[0:N, 0], data[0:N, 1], data[0:N, 2], c='r', marker='o') 250 | ax2.scatter(data[N + 1:2 * N, 0], data[N + 1:2 * N, 1], 251 | data[N + 1:2 * N, 2], c='g', marker='x') 252 | plt.show() 253 | 254 | 255 | def plot_marker_selection(data, markers, names, perplexity=40): 256 | print('Computing TSNE embedding') 257 | # code fix to deal with exceptions if there is a particular cell class type with n < 40 258 | # automatically re-scales perplexity in these cases 259 | if len(data) < 40: 260 | perplexity = len(data) - 1 261 | else: 262 | perplexity = 40 263 | t = time.time() 264 | X_original = sklearn.manifold.TSNE( 265 | n_components=2, perplexity=perplexity).fit_transform(data) 266 | X_embedded = sklearn.manifold.TSNE(n_components=2, perplexity=perplexity).fit_transform( 267 | data[:, markers]) 268 | print('Elapsed time: {} seconds'.format(time.time() - t)) 269 | cmap = plt.cm.jet 270 | unique_names = list(set(names)) 271 | num_labels = len(unique_names) 272 | colors = [cmap(int(i * 256 / num_labels)) for i in range(num_labels)] 273 | aux = [colors[unique_names.index(name)] for name in names] 274 | 275 | fig = plt.figure() 276 | ax = fig.add_subplot(121) 277 | for g in unique_names: 278 | i = [s for s in range(len(names)) if names[s] == g] 279 | ax.scatter(X_original[i, 0], X_original[i, 1], 280 | c=[aux[i[0]]], s=5, label=names[i[0]]) 281 | ax.set_title('Original data') 282 | ax2 = fig.add_subplot(122) 283 | for g in np.unique(names): 284 | i = [s for s in range(len(names)) if names[s] == g] 285 | ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], 286 | c=[aux[i[0]]], s=5, label=names[i[0]]) 287 | ax2.set_title('{} markers'.format(len(markers))) 288 | plt.legend(bbox_to_anchor=(1, 1)) 289 | plt.subplots_adjust(right=0.7) 290 | return fig 291 | 292 | 293 | def one_vs_all_selection(data, labels, num_bins=20): 294 | data_by_label = {} 295 | unique_labels = list(set(labels)) 296 | number_classes = len(unique_labels) 297 | [N, d] = data.shape 298 | for lab in unique_labels: 299 | X = [data[x, :] for x in range(len(labels)) if labels[x] == lab] 300 | data_by_label[lab] = X 301 | markers = [None for i in range(number_classes)] 302 | bins = data.max() / num_bins * range(num_bins + 1) 303 | for idx in range(number_classes): 304 | c = unique_labels[idx] 305 | current_class = np.array(data_by_label[c]) 306 | others = np.concatenate([data_by_label[lab] 307 | for lab in unique_labels if lab != c]) 308 | big_dist = 0 309 | for gene in range(d): 310 | if gene not in markers[0:idx]: 311 | [h1, b1] = np.histogram(current_class[:, gene], bins) 312 | h1 = np.array(h1).reshape(1, -1) / current_class.shape[0] 313 | [h2, b2] = np.histogram(others[:, gene], bins) 314 | h2 = np.array(h2).reshape(1, -1) / others.shape[0] 315 | dist = -sklearn.metrics.pairwise.additive_chi2_kernel(h1, h2) 316 | if dist > big_dist: 317 | markers[idx] = gene 318 | big_dist = dist 319 | return markers 320 | 321 | 322 | def optimize_epsilon(data_train, labels_train, data_test, labels_test, num_markers, method='centers', fixed_parameters={}, bounds=[(0.2 , 10)], x0=[1], max_fun_evaluations=20, n_experiments=5, clf=None, hierarchy=False, verbose=True): 323 | """ 324 | Finds the optimal value of epsilon using scipy.optimize.dual_annealing 325 | """ 326 | if clf==None: 327 | clf=sklearn.neighbors.NearestCentroid() 328 | Instance=__ScGeneInstance(data_train, labels_train, data_test, labels_test, clf, num_markers, method, fixed_parameters, n_experiments, hierarchy) 329 | print('Optimizing epsilon for', num_markers, 'markers and', method, 'method.') 330 | res = scipy.optimize.dual_annealing(Instance.error_epsilon, bounds=bounds, x0=x0, maxfun=max_fun_evaluations, no_local_search=True) 331 | return [res.x, 1-res.fun] 332 | 333 | class __ScGeneInstance: 334 | def __init__(self, X_train, y_train, X_test, y_test, clf, num_markers, method, fixed_parameters, n_experiments, hierarchy): 335 | self.X_train=X_train 336 | self.y_train=y_train 337 | self.X_test=X_test 338 | self.y_test=y_test 339 | self.clf=clf 340 | self.num_markers=num_markers 341 | self.method=method 342 | self.fixed_parameters=fixed_parameters 343 | self.n_experiments=n_experiments 344 | self.hierarchy=hierarchy 345 | def error_epsilon(self, epsilon): 346 | return 1-self.accuracy(epsilon) 347 | 348 | def accuracy(self, epsilon): 349 | #compute avg over n_experiments random samples for stability 350 | if self.hierarchy: 351 | markers=[get_markers_hierarchy(self.X_train, self.y_train, self.num_markers, self.method, epsilon=epsilon, verbose=False, **self.fixed_parameters) for i in range(self.n_experiments)] 352 | else: 353 | markers=[get_markers(self.X_train, self.y_train, self.num_markers, self.method, epsilon=epsilon, verbose=False, **self.fixed_parameters) for i in range(self.n_experiments)] 354 | val=[self.performance( markers[i] ) for i in range(self.n_experiments)] 355 | return np.mean(val) 356 | 357 | def performance(self, markers): 358 | if self.hierarchy: 359 | self.clf.fit(self.X_train[:,markers], self.y_train[0]) 360 | return self.clf.score(self.X_test[:,markers], self.y_test[0]) 361 | else: 362 | self.clf.fit(self.X_train[:,markers], self.y_train) 363 | return self.clf.score(self.X_test[:,markers], self.y_test) 364 | 365 | def load_example_data(name): 366 | if name=="CITEseq": 367 | a = scipy.io.loadmat(data_files.get_data("CITEseq.mat")) 368 | data= a['G'].T 369 | N,d=data.shape 370 | #transformation from integer entries 371 | data=np.log(data+np.ones(data.shape)) 372 | for i in range(N): 373 | data[i,:]=data[i,:]/np.linalg.norm(data[i,:]) 374 | #load labels from file 375 | a = scipy.io.loadmat(data_files.get_data("CITEseq-labels.mat")) 376 | l_aux = a['labels'] 377 | labels = np.array([i for [i] in l_aux]) 378 | #load names from file 379 | a = scipy.io.loadmat(data_files.get_data("CITEseq_names.mat")) 380 | names=[a['citeseq_names'][i][0][0] for i in range(N)] 381 | return [data, labels, names] 382 | elif name=="zeisel": 383 | #load data from file 384 | a = scipy.io.loadmat(data_files.get_data("zeisel_data.mat")) 385 | data= a['zeisel_data'].T 386 | N,d=data.shape 387 | 388 | #load labels (first level of the hierarchy) from file 389 | a = scipy.io.loadmat(data_files.get_data("zeisel_labels1.mat")) 390 | l_aux = a['zeisel_labels1'] 391 | l_0=[l_aux[i][0] for i in range(l_aux.shape[0])] 392 | #load labels (second level of the hierarchy) from file 393 | a = scipy.io.loadmat(data_files.get_data("zeisel_labels2.mat")) 394 | l_aux = a['zeisel_labels2'] 395 | l_1=[l_aux[i][0] for i in range(l_aux.shape[0])] 396 | #construct an array with hierarchy labels 397 | labels=np.array([l_0, l_1]) 398 | 399 | # load names from file 400 | a = scipy.io.loadmat(data_files.get_data("zeisel_names.mat")) 401 | names0=[a['zeisel_names'][i][0][0] for i in range(N)] 402 | names1=[a['zeisel_names'][i][1][0] for i in range(N)] 403 | return [data, labels, [names0,names1]] 404 | else: 405 | print("currently available options are only 'CITEseq' and 'zeisel'") 406 | 407 | 408 | 409 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='scGeneFit', 8 | version='1.0.0', 9 | author="Soledad Villar", 10 | author_email="soledad.villar@nyu.edu", 11 | description="Genetic marker selection with linear programming", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/solevillar/scGeneFit-python", 15 | packages=setuptools.find_packages(), 16 | include_package_data=True, 17 | install_requires=['numpy', 'matplotlib', 'scipy', 'scikit-learn'], 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | python_requires='>=3.6', 24 | ) 25 | --------------------------------------------------------------------------------