├── .gitignore ├── AUTHORS.txt ├── LICENCE.txt ├── MANIFEST ├── MANIFEST.in ├── README.md ├── adenine ├── __init__.py ├── ade_config.py ├── cluster │ ├── __init__.py │ ├── agglomerative.py │ └── optics.py ├── core │ ├── __init__.py │ ├── analyze_results.py │ ├── define_pipeline.py │ ├── job_distribution.py │ ├── pipelines.py │ ├── plotting.py │ └── template │ │ ├── __init__.py │ │ ├── d3_template.py │ │ └── svg-crowbar.js ├── examples │ ├── ade_config.py │ └── data │ │ ├── X.csv │ │ ├── X.npy │ │ ├── X_missing.csv │ │ ├── Y_missing_test.csv │ │ ├── y.csv │ │ └── y.npy ├── externals │ ├── __init__.py │ └── hierarchical.py ├── test │ ├── X_missing.csv │ ├── Y_missing_test.csv │ ├── carttest.py │ ├── imputing_test.py │ └── imputing_test_lite.py └── utils │ ├── GEO2csv.py │ ├── __init__.py │ ├── data_source.py │ ├── extensions.py │ ├── extra.py │ ├── scores.py │ └── templates.py ├── doc ├── GiHubProjectPage.txt ├── Makefile ├── devPlan │ ├── plan.pdf │ └── plan.tex └── source │ ├── adenine_logo.pdf │ ├── adenine_logo.png │ ├── conf.py │ ├── dependencies.txt │ ├── drawing.svg │ ├── index.rst │ ├── modules.rst │ ├── slipGURUTheme │ ├── layout.html │ ├── static │ │ ├── logos.png │ │ └── slipGuru.css │ └── theme.conf │ ├── sphinxext │ ├── numpydoc │ │ ├── LICENSE.txt │ │ ├── MANIFEST.in │ │ ├── PKG-INFO │ │ ├── README.txt │ │ ├── __init__.py │ │ ├── comment_eater.py │ │ ├── compiler_unparse.py │ │ ├── docscrape.py │ │ ├── docscrape_sphinx.py │ │ ├── numpydoc.py │ │ ├── phantom_import.py │ │ ├── plot_directive.py │ │ ├── setup.cfg │ │ ├── setup.py │ │ ├── tests │ │ │ └── test_docscrape.py │ │ └── traitsdoc.py │ └── sphinxcontrib │ │ ├── __init__.py │ │ ├── programoutput.py │ │ └── spelling.py │ └── tutorial.rst ├── icon.png ├── requirements.txt ├── scripts ├── ade_GEO2csv.py ├── ade_analysis.py └── ade_run.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX stuff 2 | *.DS_Store 3 | 4 | # Archivers 5 | **/*.tar.gz 6 | # -------------------------- Python -------------------------- # 7 | 8 | # Jupyter Notebook checkpoints 9 | *-checkpoint.ipynb 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # Temp 17 | *~ 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | # Numpy files 40 | #*.npy 41 | 42 | # Dump files 43 | *.pkl 44 | 45 | # Images 46 | *.png 47 | !*adenine_logo.png 48 | !icon.png 49 | 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *,cover 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # -------------------------- TeX -------------------------- # 85 | 86 | *.aux 87 | *.glo 88 | *.idx 89 | *.log 90 | *.toc 91 | *.ist 92 | *.acn 93 | *.acr 94 | *.alg 95 | *.bbl 96 | *.blg 97 | *.dvi 98 | *.glg 99 | *.gls 100 | *.ilg 101 | *.ind 102 | *.lof 103 | *.lot 104 | *.maf 105 | *.mtc 106 | *.mtc1 107 | *.out 108 | *.synctex.gz 109 | 110 | # -------------------------- results -------------------------- # 111 | **/results/**/* 112 | 113 | # --- LaTeX --- # 114 | ## Core latex/pdflatex auxiliary files: 115 | *.aux 116 | *.lof 117 | *.log 118 | *.lot 119 | **.fls 120 | *.out 121 | *.toc 122 | *.fmt 123 | *.fot 124 | *.cb 125 | *.cb2 126 | 127 | ## Intermediate documents: 128 | *.dvi 129 | *-converted-to.* 130 | # these rules might exclude image files for figures etc. 131 | # *.ps 132 | # *.eps 133 | #*.pdf 134 | 135 | 136 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 137 | *.bbl 138 | *.bcf 139 | *.blg 140 | *-blx.aux 141 | *-blx.bib 142 | *.brf 143 | *.run.xml 144 | 145 | ## Build tool auxiliary files: 146 | *.fdb_latexmk 147 | .synctex 148 | .synctex.gz 149 | .synctex.gz(busy) 150 | *.pdfsync 151 | 152 | ## Auxiliary and intermediate files from other packages: 153 | # algorithms 154 | *.alg 155 | *.loa 156 | 157 | # achemso 158 | acs-*.bib 159 | 160 | # amsthm 161 | *.thm 162 | 163 | # beamer 164 | *.nav 165 | *.snm 166 | *.vrb 167 | 168 | # cprotect 169 | *.cpt 170 | 171 | # fixme 172 | *.lox 173 | 174 | #(r)(e)ledmac/(r)(e)ledpar 175 | *.end 176 | *.?end 177 | *.[1-9] 178 | *.[1-9][0-9] 179 | *.[1-9][0-9][0-9] 180 | *.[1-9]R 181 | *.[1-9][0-9]R 182 | *.[1-9][0-9][0-9]R 183 | *.eledsec[1-9] 184 | *.eledsec[1-9]R 185 | *.eledsec[1-9][0-9] 186 | *.eledsec[1-9][0-9]R 187 | *.eledsec[1-9][0-9][0-9] 188 | *.eledsec[1-9][0-9][0-9]R 189 | 190 | # glossaries 191 | *.acn 192 | *.acr 193 | *.glg 194 | *.glo 195 | *.gls 196 | *.glsdefs 197 | 198 | # gnuplottex 199 | *-gnuplottex-* 200 | 201 | # hyperref 202 | *.brf 203 | 204 | # knitr 205 | *-concordance.tex 206 | # TODO Comment the next line if you want to keep your tikz graphics files 207 | *.tikz 208 | *-tikzDictionary 209 | 210 | # listings 211 | *.lol 212 | 213 | # makeidx 214 | *.idx 215 | *.ilg 216 | *.ind 217 | *.ist 218 | 219 | # minitoc 220 | *.maf 221 | *.mlf 222 | *.mlt 223 | *.mtc 224 | *.mtc[0-9] 225 | *.mtc[1-9][0-9] 226 | 227 | # minted 228 | _minted* 229 | *.pyg 230 | 231 | # morewrites 232 | *.mw 233 | 234 | # mylatexformat 235 | *.fmt 236 | 237 | # nomencl 238 | *.nlo 239 | 240 | # sagetex 241 | *.sagetex.sage 242 | *.sagetex.py 243 | *.sagetex.scmd 244 | 245 | # sympy 246 | *.sout 247 | *.sympy 248 | sympy-plots-for-*.tex/ 249 | 250 | # pdfcomment 251 | *.upa 252 | *.upb 253 | 254 | # pythontex 255 | *.pytxcode 256 | pythontex-files-*/ 257 | 258 | # thmtools 259 | *.loe 260 | 261 | # TikZ & PGF 262 | *.dpth 263 | *.md5 264 | *.auxlock 265 | 266 | # todonotes 267 | *.tdo 268 | 269 | # xindy 270 | *.xdy 271 | 272 | # xypic precompiled matrices 273 | *.xyc 274 | 275 | # endfloat 276 | *.ttt 277 | *.fff 278 | 279 | # Latexian 280 | TSWLatexianTemp* 281 | 282 | ## Editors: 283 | 
# WinEdt 284 | *.bak 285 | *.sav 286 | 287 | # Texpad 288 | .texpadtmp 289 | 290 | # Kile 291 | *.backup 292 | 293 | # KBibTeX 294 | *~[0-9]* 295 | -------------------------------------------------------------------------------- /AUTHORS.txt: -------------------------------------------------------------------------------- 1 | Samuele Fiorini [samuele dot fiorini at dibris dot unige dot it] 2 | Federico Tomasi [federico dot tomasi at dibris dot unige dot it] 3 | Annalisa Barla [annalisa dot barla at unige dot it] 4 | -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | ======================================================================================= 2 | Samuele Fiorini [samuele dot fiorini at dibris dot unige dot it] 3 | Federico Tomasi [federico dot tomasi at dibris dot unige dot it] 4 | Annalisa Barla [annalisa dot barla at unige dot it] 5 | 6 | This file is part of adenine. 7 | 8 | The code is released under the BSD 2-Clause (FreeBSD) License. 9 | 10 | Copyright (c) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla. 11 | All rights reserved. 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | - Redistributions of source code must retain the above copyright notice, 17 | this list of conditions and the following disclaimer. 18 | - Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 23 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 24 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 26 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 29 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 | POSSIBILITY OF SUCH DAMAGE. 
32 | ======================================================================================= 33 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | README.md 3 | setup.cfg 4 | setup.py 5 | adenine/__init__.py 6 | adenine/ade_config.py 7 | adenine/core/__init__.py 8 | adenine/core/analyze_results.py 9 | adenine/core/define_pipeline.py 10 | adenine/core/job_distribution.py 11 | adenine/core/pipelines.py 12 | adenine/core/plotting.py 13 | adenine/externals/__init__.py 14 | adenine/externals/hierarchical.py 15 | adenine/utils/__init__.py 16 | adenine/utils/data_source.py 17 | adenine/utils/extensions.py 18 | adenine/utils/extra.py 19 | adenine/utils/scores.py 20 | adenine/utils/templates.py 21 | scripts/ade_analysis.py 22 | scripts/ade_run.py 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.txt 3 | include adenine/examples 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

4 | 
5 | -----------------
6 | 
7 | # Adenine: A data exploration pipeline
8 | 
9 | **adenine** is a machine learning and data mining Python library for exploratory data analysis.
10 | 
11 | The main structure of **adenine** can be summarized in the following 4 steps.
12 | 
13 | 1. **Imputing:** Does your dataset have missing entries? In the first step you can fill the missing values by choosing among different strategies: feature-wise median, mean, most frequent value or k-NN imputing.
14 | 
15 | 2. **Preprocessing:** Have you ever wondered what would have changed if your data had been preprocessed in a different way? Or is data preprocessing a good idea after all? **adenine** includes several preprocessing procedures, such as data recentering, Min-Max scaling, standardization and normalization. **adenine** also allows you to compare the results of the analysis obtained with different preprocessing strategies.
16 | 
17 | 3. **Dimensionality Reduction:** In the context of data exploration, this phase becomes particularly helpful for high-dimensional data. This step includes manifold learning (such as isomap, multidimensional scaling, etc.) and unsupervised feature learning (principal component analysis, kernel PCA, Bernoulli RBM, etc.) techniques.
18 | 
19 | 4. **Clustering:** This step aims at grouping data into clusters in an unsupervised manner. Several techniques, such as k-means, spectral or hierarchical clustering, are offered.
20 | 
21 | The final output of **adenine** is a compact, textual and graphical representation of the results obtained from the pipelines built with each possible combination of the algorithms selected at each step.
22 | 
23 | **adenine** can run on multiple cores/machines* and it is fully `scikit-learn` compliant.
24 | 
25 | ## Installation
26 | 
27 | **adenine** supports Python 2.7.
28 | 
29 | ### Pip installation
30 | `$ pip install adenine`
31 | 
32 | ### Installing from sources
33 | ```bash
34 | $ git clone https://github.com/slipguru/adenine
35 | $ cd adenine
36 | $ python setup.py install
37 | ```
38 | 
39 | ## Try Adenine
40 | 
41 | ### 1. Create your configuration file
42 | Start from the provided template and edit your configuration file with your favourite text editor
43 | ```bash
44 | $ ade_run.py -c my-config-file.py
45 | $ vim my-config-file.py
46 | ...
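# note: the `-c` flag above is expected to create `my-config-file.py` from the
# default adenine configuration template; you then edit it before running adenine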
47 | ```
48 | ```python
49 | from adenine.utils import data_source
50 | 
51 | # -------------------------- EXPERIMENT INFO ------------------------- #
52 | exp_tag = '_experiment'
53 | output_root_folder = 'results'
54 | plotting_context = 'notebook' # one of {paper, notebook, talk, poster}
55 | file_format = 'pdf' # or 'png'
56 | 
57 | # ---------------------------- INPUT DATA ---------------------------- #
58 | # Load an example dataset or specify your input data in tabular format
59 | X, y, feat_names, index = data_source.load('iris')
60 | 
61 | # ----------------------- PIPELINES DEFINITION ------------------------ #
62 | # --- Missing Values Imputing --- #
63 | step0 = {'Impute': [True, {'missing_values': 'NaN',
64 |                            'strategy': ['nearest_neighbors']}]}
65 | 
66 | # --- Data Preprocessing --- #
67 | step1 = {'MinMax': [True, {'feature_range': [(0, 1)]}]}
68 | 
69 | # --- Unsupervised feature learning --- #
70 | step2 = {'KernelPCA': [True, {'kernel': ['linear', 'rbf', 'poly']}],
71 |          'Isomap': [False, {'n_neighbors': 5}],
72 |          'MDS': [True, {'metric': True}],
73 |          'tSNE': [False],
74 |          'RBM': [True, {'n_components': 256}]
75 |          }
76 | 
77 | # --- Clustering --- #
78 | # affinity can be precomputed for AP, Spectral and Hierarchical
79 | step3 = {'KMeans': [True, {'n_clusters': [3, 'auto']}],
80 |          'Spectral': [False, {'n_clusters': [3]}],
81 |          'Hierarchical': [False, {'n_clusters': [3],
82 |                                    'affinity': ['euclidean'],
83 |                                    'linkage': ['ward', 'average']}]
84 |          }
85 | ```
86 | 
87 | ### 2. Run the pipelines
88 | ```bash
89 | $ ade_run.py my-config-file.py
90 | ```
91 | 
92 | ### 3. Automatically generate beautiful publication-ready plots and textual results
93 | ```bash
94 | $ ade_analysis.py results/ade_experiment_
95 | ```
96 | 
97 | ## Need more info?
98 | Check out the project [homepage](http://slipguru.github.io/adenine/index.html)
99 | 
100 | ## *Got large-scale data?
101 | 
102 | **adenine** takes advantage of `mpi4py` to distribute the execution of the pipelines on HPC architectures:
103 | ```bash
104 | $ mpirun -np <number-of-MPI-tasks> --hosts <comma-separated-list-of-hosts> ade_run.py my-config-file.py
105 | ```
106 | 
107 | ## Citation
108 | 
109 | If you use **adenine** in a scientific publication, we would appreciate citations:
110 | ```tex
111 | @{coming soon}
112 | ```
113 | 
-------------------------------------------------------------------------------- 
/adenine/__init__.py: 
-------------------------------------------------------------------------------- 
1 | ######################################################################
2 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla
3 | #
4 | # FreeBSD License
5 | ######################################################################
6 | 
7 | __version__ = "0.1.4"
8 | 
9 | from adenine import utils
10 | from adenine import core
11 | from adenine.core import main
12 | 
-------------------------------------------------------------------------------- 
/adenine/ade_config.py: 
-------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """Configuration file for adenine."""
4 | 
5 | from adenine.utils import data_source
6 | 
7 | # -------------------------- EXPERIMENT INFO ------------------------- #
8 | exp_tag = '_experiment'
9 | output_root_folder = 'results'
10 | plotting_context = 'notebook' # one of {paper, notebook, talk, poster}
11 | file_format = 'pdf' # or 'png'
12 | use_compression = False # use gzip to compress the results
13 | 
14 | # ---------------------------- INPUT DATA ---------------------------- #
15 | # Load an example dataset or specify your input data in tabular format
16 | data_file = 'data.csv'
17 | labels_file = 'labels.csv' # OPTIONAL
18 | samples_on = 'rows' # if samples lie on columns use 'cols' or 'col'
19 | data_sep = ',' # the data separator. e.g., ',', '\t', ' ', ...
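# As an alternative to a custom tabular file, one of the bundled example
# datasets can be loaded here instead (same call as shown in the README), e.g.:
# X, y, feat_names, index = data_source.load('iris')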
20 | X, y, feat_names, index = data_source.load('custom', 21 | data_file, labels_file, 22 | samples_on=samples_on, 23 | sep=data_sep) 24 | 25 | # ----------------------- PIPELINES DEFINITION ------------------------ # 26 | # --- Missing values imputing --- # 27 | step0 = {'Impute': [False, {'missing_values': 'NaN', 28 | 'strategy': ['median', 29 | 'mean', 30 | 'nearest_neighbors']}]} 31 | 32 | # --- Data preprocessing --- # 33 | step1 = {'None': [False], 'Recenter': [False], 'Standardize': [False], 34 | 'Normalize': [False, {'norm': ['l1', 'l2']}], 35 | 'MinMax': [False, {'feature_range': [(0, 1), (-1, 1)]}]} 36 | 37 | # --- Unsupervised features learning --- # 38 | # affinity ca be precumputed for SE 39 | step2 = {'PCA': [False, {'n_components': 3}], 40 | 'IncrementalPCA': [False], 41 | 'RandomizedPCA': [False], 42 | 'KernelPCA': [False, {'kernel': ['linear', 'rbf', 'poly']}], 43 | 'Isomap': [False, {'n_neighbors': 5}], 44 | 'LLE': [False, {'n_neighbors': 5, 45 | 'method': ['standard', 'modified', 46 | 'hessian', 'ltsa']}], 47 | 'SE': [False, {'affinity': ['nearest_neighbors', 'rbf']}], 48 | 'MDS': [False, {'metric': True}], 49 | 'tSNE': [False], 50 | 'RBM': [False, {'n_components': 256}], 51 | 'None': [False] 52 | } 53 | 54 | # --- Clustering --- # 55 | # affinity ca be precumputed for AP, Spectral and Hierarchical 56 | step3 = {'KMeans': [False, {'n_clusters': [3, 'auto']}], 57 | 'AP': [False, {'preference': ['auto']}], 58 | 'MS': [False], 59 | 'Spectral': [False, {'n_clusters': [3, 8]}], 60 | 'Hierarchical': [False, {'n_clusters': [3, 8], 61 | 'affinity': ['manhattan', 'euclidean'], 62 | 'linkage': ['ward', 'complete', 'average']}] 63 | } 64 | -------------------------------------------------------------------------------- /adenine/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | from adenine.cluster.optics import Optics 2 | from adenine.cluster.agglomerative import AgglomerativeClustering 3 | -------------------------------------------------------------------------------- /adenine/cluster/agglomerative.py: -------------------------------------------------------------------------------- 1 | """Agglomerative clustering class extension.""" 2 | import logging 3 | import numpy as np 4 | from sklearn.externals.joblib import Memory 5 | from adenine.externals import AgglomerativeClustering 6 | 7 | 8 | class AgglomerativeClustering(AgglomerativeClustering): 9 | """Extension of sklearn Agglomerative Clustering. 10 | 11 | This Agglomerative Clustering class, if required, can perform automatic 12 | discovery of the number of clusters. 13 | """ 14 | 15 | def __init__(self, n_clusters=2, affinity="euclidean", 16 | memory=Memory(cachedir=None, verbose=0), 17 | connectivity=None, n_components=None, 18 | compute_full_tree='auto', linkage='ward', 19 | pooling_func=np.mean, return_distance=False): 20 | """Agglomerative Clustering. 21 | 22 | Recursively merges the pair of clusters that minimally increases 23 | a given linkage distance. 24 | 25 | Read more in the :ref:`User Guide `. 26 | 27 | Parameters 28 | ---------- 29 | n_clusters : int, default=2 30 | The number of clusters to find. 31 | 32 | connectivity : array-like or callable, optional 33 | Connectivity matrix. Defines for each sample the neighboring 34 | samples following a given structure of the data. 35 | This can be a connectivity matrix itself or a callable that 36 | transforms the data into a connectivity matrix, such as derived 37 | from kneighbors_graph. 
Default is None, i.e, the 38 | hierarchical clustering algorithm is unstructured. 39 | 40 | affinity : string or callable, default: "euclidean" 41 | Metric used to compute the linkage. Can be "euclidean", "l1", "l2", 42 | "manhattan", "cosine", or 'precomputed'. 43 | If linkage is "ward", only "euclidean" is accepted. 44 | 45 | memory : Instance of joblib.Memory or string (optional) 46 | Used to cache the output of the computation of the tree. 47 | By default, no caching is done. If a string is given, it is the 48 | path to the caching directory. 49 | 50 | n_components : int (optional) 51 | Number of connected components. If None the number of connected 52 | components is estimated from the connectivity matrix. 53 | NOTE: This parameter is now directly determined from the 54 | connectivity matrix and will be removed in 0.18 55 | 56 | compute_full_tree : bool or 'auto' (optional) 57 | Stop early the construction of the tree at n_clusters. This is 58 | useful to decrease computation time if the number of clusters is 59 | not small compared to the number of samples. This option is 60 | useful only when specifying a connectivity matrix. Note also that 61 | when varying the number of clusters and using caching, it may 62 | be advantageous to compute the full tree. 63 | 64 | linkage : {"ward", "complete", "average"}, optional, default: "ward" 65 | Which linkage criterion to use. The linkage criterion determines 66 | which distance to use between sets of observation. The algorithm 67 | will merge the pairs of cluster that minimize this criterion. 68 | 69 | - ward minimizes the variance of the clusters being merged. 70 | - average uses the average of the distances of each observation of 71 | the two sets. 72 | - complete or maximum linkage uses the maximum distances between 73 | all observations of the two sets. 74 | 75 | pooling_func : callable, default=np.mean 76 | This combines the values of agglomerated features into a single 77 | value, and should accept an array of shape [M, N] and the keyword 78 | argument ``axis=1``, and reduce it to an array of size [M]. 79 | 80 | Attributes 81 | ---------- 82 | labels_ : array [n_samples] 83 | cluster labels for each point 84 | 85 | n_leaves_ : int 86 | Number of leaves in the hierarchical tree. 87 | 88 | n_components_ : int 89 | The estimated number of connected components in the graph. 90 | 91 | children_ : array-like, shape (n_nodes-1, 2) 92 | The children of each non-leaf node. Values less than `n_samples` 93 | correspond to leaves of the tree which are the original samples. 94 | A node `i` greater than or equal to `n_samples` is a non-leaf 95 | node and has children `children_[i - n_samples]`. Alternatively 96 | at the i-th iteration, children[i][0] and children[i][1] 97 | are merged to form node `n_samples + i` 98 | 99 | """ 100 | super(AgglomerativeClustering, self). __init__( 101 | n_clusters, affinity, 102 | memory, connectivity, n_components, 103 | compute_full_tree, linkage, 104 | pooling_func, return_distance) 105 | 106 | def fit(self, X, **kwargs): 107 | """Fit the hierarchical clustering on the data. 108 | 109 | Parameters 110 | ---------- 111 | X : array-like, shape = [n_samples, n_features] 112 | The samples a.k.a. observations. 
113 | 114 | Returns 115 | ------- 116 | self 117 | """ 118 | if self.n_clusters == 'auto': 119 | # assign an arbitrary high number for the max number of clusters 120 | self.n_clusters = int(.75 * X.shape[0]) 121 | super(AgglomerativeClustering, self).fit(X, **kwargs) 122 | try: 123 | # use self.distances 124 | # TODO 125 | raise NotImplementedError() 126 | except AttributeError: 127 | logging.error("Automatic discovery of the number of clusters " 128 | "cannot be performed. AgglomerativeClustering from " 129 | "adenine.external does not contain a " 130 | "`self.distances` attribute. Try to update adenine.") 131 | # hence, when optimal_clusters is defined, use it 132 | optimal_clusters = -1 # TODO 133 | self.n_clusters = optimal_clusters 134 | # perform the standard fit 135 | super(AgglomerativeClustering, self).fit(X, **kwargs) 136 | -------------------------------------------------------------------------------- /adenine/core/__init__.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 3 | # 4 | # FreeBSD License 5 | ###################################################################### 6 | 7 | from adenine.core.job_distribution import main 8 | -------------------------------------------------------------------------------- /adenine/core/job_distribution.py: -------------------------------------------------------------------------------- 1 | """Master slave.""" 2 | from __future__ import print_function 3 | import os 4 | import imp 5 | import logging 6 | import shutil 7 | import gzip 8 | import numpy as np 9 | 10 | from collections import deque 11 | from six.moves import cPickle as pkl 12 | 13 | from adenine.core import define_pipeline 14 | from adenine.core.pipelines import pipe_worker 15 | from adenine.utils import extra 16 | 17 | try: 18 | from mpi4py import MPI 19 | 20 | COMM = MPI.COMM_WORLD 21 | RANK = COMM.Get_rank() 22 | NAME = MPI.Get_processor_name() 23 | 24 | IS_MPI_JOB = COMM.Get_size() > 1 25 | 26 | except ImportError: 27 | # print("mpi4py module not found. MPI job distribution disabled.") 28 | COMM = None 29 | RANK = 0 30 | NAME = 'localhost' 31 | 32 | IS_MPI_JOB = False 33 | 34 | # MAX_RESUBMISSIONS = 2 35 | # constants to use as tags in communications 36 | DO_WORK = 100 37 | EXIT = 200 38 | 39 | 40 | def master_single_machine(pipes, X): 41 | """Fit and transform/predict some pipelines on some data (single machine). 42 | 43 | This function fits each pipeline in the input list on the provided data. 44 | The results are dumped into a pkl file as a dictionary of dictionaries of 45 | the form {'pipe_id': {'stepID' : [alg_name, level, params, data_out, 46 | data_in, model_obj, voronoi_suitable_object], ...}, ...}. The model_obj is 47 | the sklearn model which has been fit on the dataset, the 48 | voronoi_suitable_object is the very same model but fitted on just the first 49 | two dimensions of the dataset. If a pipeline fails for some reasons the 50 | content of the stepID key is a list of np.nan. 51 | 52 | Parameters 53 | ----------- 54 | pipes : list of list of tuples 55 | Each tuple contains a label and a sklearn Pipeline object. 56 | X : array of float, shape : n_samples x n_features, default : () 57 | The input data matrix. 58 | 59 | Returns 60 | ----------- 61 | pipes_dump : dict 62 | Dictionary with the results of the computation. 
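Example
-----------
A minimal sketch, assuming a configuration module laid out like
ade_config.py (these are the same calls used by ``master`` below)::

    pipes = define_pipeline.parse_steps(
        [config.step0, config.step1, config.step2, config.step3])
    pipes_dump = master_single_machine(pipes, config.X)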
63 | """ 64 | import multiprocessing as mp 65 | jobs = [] 66 | manager = mp.Manager() 67 | pipes_dump = manager.dict() 68 | 69 | # Submit jobs 70 | for i, pipe in enumerate(pipes): 71 | pipe_id = 'pipe' + str(i) 72 | proc = mp.Process(target=pipe_worker, 73 | args=(pipe_id, pipe, pipes_dump, X)) 74 | jobs.append(proc) 75 | proc.start() 76 | logging.info("Job: %s submitted", pipe_id) 77 | 78 | # Collect results 79 | count = 0 80 | for proc in jobs: 81 | proc.join() 82 | count += 1 83 | logging.info("%d jobs collected", count) 84 | 85 | # import joblib as jl 86 | # jl.Parallel(n_jobs=-1) \ 87 | # (jl.delayed(pipe_worker)( 88 | # 'pipe' + str(i), pipe, pipes_dump, X) for i, pipe in enumerate( 89 | # pipes)) 90 | 91 | return dict(pipes_dump) 92 | 93 | 94 | @extra.timed 95 | def master(config): 96 | """Distribute pipelines with mpi4py or multiprocessing.""" 97 | # Pipeline definition 98 | pipes = define_pipeline.parse_steps( 99 | [config.step0, config.step1, 100 | config.step2, config.step3]) 101 | 102 | if not IS_MPI_JOB: 103 | return master_single_machine(pipes, config.X) 104 | 105 | # RUN PIPELINES 106 | nprocs = COMM.Get_size() 107 | # print(NAME + ": start running slaves", nprocs, NAME) 108 | queue = deque(list(enumerate(pipes))) 109 | 110 | pipe_dump = dict() 111 | count = 0 112 | n_pipes = len(queue) 113 | 114 | # seed the slaves by sending work to each processor 115 | for rankk in range(1, min(nprocs, n_pipes)): 116 | pipe_tuple = queue.popleft() 117 | COMM.send(pipe_tuple, dest=rankk, tag=DO_WORK) 118 | # print(NAME + ": send to rank", rankk) 119 | 120 | # loop until there's no more work to do. If queue is empty skips the loop. 121 | while queue: 122 | pipe_tuple = queue.popleft() 123 | # receive result from slave 124 | status = MPI.Status() 125 | pipe_id, step_dump = COMM.recv( 126 | source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) 127 | pipe_dump[pipe_id] = step_dump 128 | count += 1 129 | # send to the same slave new work 130 | COMM.send(pipe_tuple, dest=status.source, tag=DO_WORK) 131 | 132 | # there's no more work to do, so receive all the results from the slaves 133 | for rankk in range(1, min(nprocs, n_pipes)): 134 | # print(NAME + ": master - waiting from", rankk) 135 | status = MPI.Status() 136 | pipe_id, step_dump = COMM.recv( 137 | source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) 138 | pipe_dump[pipe_id] = step_dump 139 | count += 1 140 | 141 | # tell all the slaves to exit by sending an empty message with the EXIT_TAG 142 | for rankk in range(1, nprocs): 143 | # print(NAME + ": master - killing", rankk) 144 | COMM.send(0, dest=rankk, tag=EXIT) 145 | 146 | # print(NAME + ": terminating master") 147 | return pipe_dump 148 | 149 | 150 | def slave(X): 151 | """Pipeline evaluation. 152 | 153 | Parameters 154 | ---------- 155 | X : array of float, shape : n_samples x n_features, default : () 156 | The input data matrix. 157 | """ 158 | try: 159 | while True: 160 | status_ = MPI.Status() 161 | received = COMM.recv(source=0, tag=MPI.ANY_TAG, status=status_) 162 | # check the tag of the received message 163 | if status_.tag == EXIT: 164 | return 165 | # do the work 166 | i, pipe = received 167 | # print(NAME + ": slave received", RANK, i) 168 | pipe_id = 'pipe' + str(i) 169 | step_dump = pipe_worker( 170 | pipe_id, pipe, None, X) 171 | COMM.send((pipe_id, step_dump), dest=0, tag=0) 172 | 173 | except StandardError as exc: 174 | print("Quitting ... 
TB:", str(exc)) 175 | 176 | 177 | def main(config_file): 178 | """Generate the pipelines.""" 179 | 180 | if RANK == 0: 181 | # Load the configuration file 182 | config_path = os.path.abspath(config_file) 183 | 184 | # For some reason, it must be atomic 185 | imp.acquire_lock() 186 | config = imp.load_source('ade_config', config_path) 187 | imp.release_lock() 188 | 189 | # this barrier prevents the slave to re-download the same GEO 190 | # dataset if not locally present 191 | if IS_MPI_JOB: 192 | # Wait for all jobs to end 193 | COMM.barrier() 194 | 195 | if RANK != 0: 196 | # Load the configuration file 197 | config_path = os.path.abspath(config_file) 198 | 199 | # For some reason, it must be atomic 200 | imp.acquire_lock() 201 | config = imp.load_source('ade_config', config_path) 202 | imp.release_lock() 203 | 204 | if hasattr(config, 'use_compression'): 205 | use_compression = config.use_compression 206 | else: 207 | use_compression = False 208 | 209 | extra.set_module_defaults( 210 | config, { 211 | 'step0': {'Impute': [False]}, 212 | 'step1': {'None': [True]}, 213 | 'step2': {'None': [True]}, 214 | 'step3': {'None': [False]}, 215 | 'exp_tag': 'debug', 216 | 'output_root_folder': 'results', 217 | 'verbose': False}) 218 | 219 | # Read the variables from the config file 220 | X = config.X 221 | 222 | if RANK == 0: 223 | # Get the experiment tag and the output root folder 224 | exp_tag, root = config.exp_tag, config.output_root_folder 225 | if not os.path.exists(root): 226 | os.makedirs(root) 227 | 228 | filename = '_'.join(('ade', exp_tag, extra.get_time())) 229 | logfile = os.path.join(root, filename + '.log') 230 | logging.basicConfig(filename=logfile, level=logging.INFO, filemode='w', 231 | format='%(levelname)s (%(name)s): %(message)s') 232 | root_logger = logging.getLogger() 233 | lsh = logging.StreamHandler() 234 | lsh.setLevel(logging.DEBUG if config.verbose else logging.ERROR) 235 | lsh.setFormatter( 236 | logging.Formatter('%(levelname)s (%(name)s): %(message)s')) 237 | root_logger.addHandler(lsh) 238 | pipes_dump = master(config) 239 | else: 240 | slave(X) 241 | 242 | if IS_MPI_JOB: 243 | # Wait for all jobs to end 244 | COMM.barrier() 245 | 246 | if RANK == 0: 247 | # Output Name 248 | outfile = filename 249 | outfolder = os.path.join(root, outfile) 250 | 251 | # Create exp folder into the root folder 252 | os.makedirs(outfolder) 253 | 254 | # pkl Dump 255 | logging.info('Saving Adenine results...') 256 | if use_compression: 257 | with gzip.open(os.path.join(outfolder, outfile + '.pkl.tz'), 258 | 'wb') as out: 259 | pkl.dump(pipes_dump, out) 260 | logging.info("Dump : %s", os.path.join(outfolder, outfile + '.pkl.tz')) 261 | else: 262 | with open(os.path.join(outfolder, outfile + '.pkl'), 'wb') as out: 263 | pkl.dump(pipes_dump, out) 264 | logging.info("Dump : %s", os.path.join(outfolder, outfile + '.pkl')) 265 | 266 | # Retrieve info from the config file 267 | _index = config.index if hasattr(config, 'index') \ 268 | else np.arange(X.shape[0]) 269 | _y = config.y if hasattr(config, 'y') else None 270 | if use_compression: 271 | with gzip.open(os.path.join(outfolder, '__data.pkl.tz'), 'wb') as out: 272 | pkl.dump({'X': X, 'y': _y, 'index': _index}, out) 273 | logging.info("Dump : %s", os.path.join(outfolder, '__data.pkl.tz')) 274 | else: 275 | with open(os.path.join(outfolder, '__data.pkl'), 'wb') as out: 276 | pkl.dump({'X': X, 'y': _y, 'index': _index}, out) 277 | logging.info("Dump : %s", os.path.join(outfolder, '__data.pkl')) 278 | 279 | # Copy the ade_config just used into 
the outFolder 280 | shutil.copy(config_path, os.path.join(outfolder, 'ade_config.py')) 281 | 282 | root_logger.handlers[0].close() 283 | 284 | # Move the logging file into the outFolder 285 | shutil.move(logfile, outfolder) 286 | -------------------------------------------------------------------------------- /adenine/core/pipelines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | import copy 11 | import logging 12 | import numpy as np 13 | 14 | 15 | def create(pdef): 16 | """Scikit-learn Pipelines objects creation (deprecated). 17 | 18 | This function creates a list of sklearn Pipeline objects starting from the 19 | list of list of tuples given in input that could be created using the 20 | adenine.core.define_pipeline module. 21 | 22 | Parameters 23 | ----------- 24 | pdef : list of list of tuples 25 | This arguments contains the specification needed by sklearn in order 26 | to create a working Pipeline object. 27 | 28 | Returns 29 | ----------- 30 | pipes : list of sklearn.pipeline.Pipeline objects 31 | The list of Piplines, each of them can be fitted and trasformed 32 | with some data. 33 | """ 34 | from sklearn.pipeline import Pipeline 35 | return [Pipeline(p) for p in pdef] 36 | 37 | 38 | def which_level(label): 39 | """Define the step level according to the input step label [DEPRECATED]. 40 | 41 | This function return the level (i.e.: imputing, preproc, dimred, clustring, 42 | None) according to the step label provided as input. 43 | 44 | Parameters 45 | ----------- 46 | label : string 47 | This is the step level as it is reported in the ade_config file. 48 | 49 | Returns 50 | ----------- 51 | level : {imputing, preproc, dimred, clustering, None} 52 | The appropriate level of the input step. 53 | """ 54 | if not isinstance(label, basestring): 55 | raise ValueError("String expected") 56 | 57 | label = label.lower() 58 | if label.startswith('impute'): 59 | level = 'imputing' 60 | elif label in ('recenter', 'standardize', 'normalize', 'minmax'): 61 | level = 'preproc' 62 | elif label in ('pca', 'incrementalpca', 'randomizedpca', 'kernelpca', 63 | 'isomap', 'lle', 'se', 'mds', 'tsne', 'rbm'): 64 | level = 'dimred' 65 | elif label in ('kmeans', 'ap', 'ms', 'spectral', 66 | 'hierarchical'): 67 | level = 'clustering' 68 | else: 69 | level = 'None' 70 | return level 71 | 72 | 73 | def evaluate(level, step, X): 74 | """Transform or predict according to the input level. 75 | 76 | This function uses the transform or the predict method on the input 77 | sklearn-like step according to its level (i.e. imputing, preproc, dimred, 78 | clustering, none). 79 | 80 | Parameters 81 | ----------- 82 | level : {'imputing', 'preproc', 'dimred', 'clustering', 'None'} 83 | The step level. 84 | 85 | step : sklearn-like object 86 | This might be an Imputer, or a PCA, or a KMeans (and so on...) 87 | sklearn-like object. 88 | 89 | X : array of float, shape : n_samples x n_features 90 | The input data matrix. 91 | 92 | Returns 93 | ----------- 94 | res : array of float 95 | A matrix projection in case of dimred, a label vector in case of 96 | clustering, and so on. 
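Example
-----------
A minimal sketch with hypothetical, already-fitted steps (``fitted_pca`` and
``fitted_kmeans`` are placeholder names, not objects defined in this module)::

    proj = evaluate('dimred', fitted_pca, X)            # -> fitted_pca.transform(X)
    labels = evaluate('clustering', fitted_kmeans, X)   # -> fitted_kmeans.labels_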
97 | """ 98 | if level in ('imputing', 'preproc', 'dimred', 'None'): 99 | if hasattr(step, 'embedding_'): 100 | res = step.embedding_ 101 | else: 102 | res = step.transform(X) 103 | elif level == 'clustering': 104 | if hasattr(step, 'labels_'): 105 | res = step.labels_ # e.g. in case of spectral clustering 106 | elif hasattr(step, 'affinity') and step.affinity == 'precomputed': 107 | if not hasattr(step.estimator, 'labels_'): 108 | step.estimator.fit(X) 109 | res = step.estimator.labels_ 110 | else: 111 | res = step.predict(X) 112 | return res 113 | 114 | 115 | def pipe_worker(pipe_id, pipe, pipes_dump, X): 116 | """Parallel pipelines execution. 117 | 118 | Parameters 119 | ----------- 120 | pipe_id : string 121 | Pipeline identifier. 122 | 123 | pipe : list of tuples 124 | Tuple containing a label and a sklearn Pipeline object. 125 | 126 | pipes_dump : multiprocessing.Manager.dict 127 | Dictionary containing the results of the parallel execution. 128 | 129 | X : array of float, shape : n_samples x n_features, default : () 130 | The input data matrix. 131 | """ 132 | step_dump = dict() 133 | 134 | # COPY X as X_curr (to avoid that the next pipeline 135 | # works on the results of the previuos one) 136 | X_curr = np.array(X) 137 | for j, step in enumerate(pipe): 138 | # step[0] -> step_label | step[1] -> model, sklearn (or sklearn-like) 139 | # object 140 | step_id = 'step' + str(j) 141 | # 1. define which level of step is this (i.e.: imputing, preproc, 142 | # dimred, clustering, none) 143 | level = step[-1] 144 | # 2. fit the model (whatever it is) 145 | if step[1].get_params().get('method') == 'hessian': 146 | # check hessian lle constraints 147 | n_components = step[1].get_params().get('n_components') 148 | n_neighbors = 1 + (n_components * (n_components + 3) / 2) 149 | step[1].set_params(n_neighbors=n_neighbors) 150 | try: 151 | step[1].fit(X_curr) 152 | 153 | # 3. evaluate (i.e. transform or predict according to the level) 154 | # X_curr = evaluate(level, step[1], X_curr) 155 | X_next = evaluate(level, step[1], X_curr) 156 | # 3.1 if the model is suitable for voronoi tessellation: fit also 157 | # on 2D 158 | mdl_voronoi = None 159 | if hasattr(step[1], 'cluster_centers_'): 160 | mdl_voronoi = copy.copy(step[1].best_estimator_ if hasattr( 161 | step[1], 'best_estimator_') else step[1]) 162 | if not hasattr(step[1], 'affinity') or step[1].affinity != 'precomputed': 163 | mdl_voronoi.fit(X_curr[:, :2]) 164 | else: 165 | mdl_voronoi.fit(X_curr) 166 | 167 | # 4. save the results in a dictionary of dictionaries of the form: 168 | # save memory and do not dump data after preprocessing (unused in 169 | # analysys) 170 | if level in ('preproc', 'imputing'): 171 | result = [step[0], level, step[1].get_params(), 172 | np.empty(0), np.empty(0), step[1], mdl_voronoi] 173 | X_curr = np.array(X_next) # update the matrix 174 | 175 | # save memory dumping X_curr only in case of clustering 176 | elif level == 'dimred': 177 | result = [step[0], level, step[1].get_params(), 178 | X_next, np.empty(0), step[1], mdl_voronoi] 179 | X_curr = X_next # update the matrix 180 | 181 | # clustering 182 | elif level == 'clustering': 183 | result = [step[0], level, step[1].get_params(), 184 | X_next, X_curr, step[1], mdl_voronoi] 185 | if level != 'None': 186 | step_dump[step_id] = result 187 | 188 | except (AssertionError, ValueError) as e: 189 | logging.critical("Pipeline %s failed at step %s. 
" 190 | "Traceback: %s", pipe_id, step[0], e) 191 | 192 | 193 | # Monkey-patch, see: https://github.com/scikit-learn/scikit-learn/issues/7562 194 | # and wait for the next numpy update 195 | # step_dump['step2'][-2] = None 196 | 197 | if pipes_dump is None: 198 | return step_dump 199 | 200 | pipes_dump[pipe_id] = step_dump 201 | -------------------------------------------------------------------------------- /adenine/core/template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/adenine/core/template/__init__.py -------------------------------------------------------------------------------- /adenine/core/template/d3_template.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # note: "format" this string to load data in csv format with % string 3 | D3_TREE = r""" 4 | 5 | 6 | 32 | 35 | 36 | 37 | 92 | 93 | """ 94 | -------------------------------------------------------------------------------- /adenine/core/template/svg-crowbar.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | var doctype = ''; 3 | 4 | window.URL = (window.URL || window.webkitURL); 5 | 6 | var body = document.body; 7 | 8 | var prefix = { 9 | xmlns: "http://www.w3.org/2000/xmlns/", 10 | xlink: "http://www.w3.org/1999/xlink", 11 | svg: "http://www.w3.org/2000/svg" 12 | } 13 | 14 | initialize(); 15 | 16 | function initialize() { 17 | var documents = [window.document], 18 | SVGSources = []; 19 | iframes = document.querySelectorAll("iframe"), 20 | objects = document.querySelectorAll("object"); 21 | 22 | [].forEach.call(iframes, function(el) { 23 | try { 24 | if (el.contentDocument) { 25 | documents.push(el.contentDocument); 26 | } 27 | } catch(err) { 28 | console.log(err) 29 | } 30 | }); 31 | 32 | [].forEach.call(objects, function(el) { 33 | try { 34 | if (el.contentDocument) { 35 | documents.push(el.contentDocument); 36 | } 37 | } catch(err) { 38 | console.log(err) 39 | } 40 | }); 41 | 42 | documents.forEach(function(doc) { 43 | var styles = getStyles(doc); 44 | var newSources = getSources(doc, styles); 45 | // because of prototype on NYT pages 46 | for (var i = 0; i < newSources.length; i++) { 47 | SVGSources.push(newSources[i]); 48 | }; 49 | }) 50 | if (SVGSources.length > 1) { 51 | createPopover(SVGSources); 52 | } else if (SVGSources.length > 0) { 53 | download(SVGSources[0]); 54 | } else { 55 | alert("The Crowbar couldn’t find any SVG nodes."); 56 | } 57 | } 58 | 59 | function createPopover(sources) { 60 | cleanup(); 61 | 62 | sources.forEach(function(s1) { 63 | sources.forEach(function(s2) { 64 | if (s1 !== s2) { 65 | if ((Math.abs(s1.top - s2.top) < 38) && (Math.abs(s1.left - s2.left) < 38)) { 66 | s2.top += 38; 67 | s2.left += 38; 68 | } 69 | } 70 | }) 71 | }); 72 | 73 | var buttonsContainer = document.createElement("div"); 74 | body.appendChild(buttonsContainer); 75 | 76 | buttonsContainer.setAttribute("class", "svg-crowbar"); 77 | buttonsContainer.style["z-index"] = 1e7; 78 | buttonsContainer.style["position"] = "absolute"; 79 | buttonsContainer.style["top"] = 0; 80 | buttonsContainer.style["left"] = 0; 81 | 82 | 83 | 84 | var background = document.createElement("div"); 85 | body.appendChild(background); 86 | 87 | background.setAttribute("class", "svg-crowbar"); 88 | background.style["background"] = "rgba(255, 255, 255, 0.7)"; 89 | background.style["position"] = 
"fixed"; 90 | background.style["left"] = 0; 91 | background.style["top"] = 0; 92 | background.style["width"] = "100%"; 93 | background.style["height"] = "100%"; 94 | 95 | sources.forEach(function(d, i) { 96 | var buttonWrapper = document.createElement("div"); 97 | buttonsContainer.appendChild(buttonWrapper); 98 | buttonWrapper.setAttribute("class", "svg-crowbar"); 99 | buttonWrapper.style["position"] = "absolute"; 100 | buttonWrapper.style["top"] = (d.top + document.body.scrollTop) + "px"; 101 | buttonWrapper.style["left"] = (document.body.scrollLeft + d.left) + "px"; 102 | buttonWrapper.style["padding"] = "4px"; 103 | buttonWrapper.style["border-radius"] = "3px"; 104 | buttonWrapper.style["color"] = "white"; 105 | buttonWrapper.style["text-align"] = "center"; 106 | buttonWrapper.style["font-family"] = "'Helvetica Neue'"; 107 | buttonWrapper.style["background"] = "rgba(0, 0, 0, 0.8)"; 108 | buttonWrapper.style["box-shadow"] = "0px 4px 18px rgba(0, 0, 0, 0.4)"; 109 | buttonWrapper.style["cursor"] = "move"; 110 | buttonWrapper.textContent = "SVG #" + i + ": " + (d.id ? "#" + d.id : "") + (d.class ? "." + d.class : ""); 111 | 112 | var button = document.createElement("button"); 113 | buttonWrapper.appendChild(button); 114 | button.setAttribute("data-source-id", i) 115 | button.style["width"] = "150px"; 116 | button.style["font-size"] = "12px"; 117 | button.style["line-height"] = "1.4em"; 118 | button.style["margin"] = "5px 0 0 0"; 119 | button.textContent = "Download"; 120 | 121 | button.onclick = function(el) { 122 | // console.log(el, d, i, sources) 123 | download(d); 124 | }; 125 | 126 | }); 127 | 128 | } 129 | 130 | function cleanup() { 131 | var crowbarElements = document.querySelectorAll(".svg-crowbar"); 132 | 133 | [].forEach.call(crowbarElements, function(el) { 134 | el.parentNode.removeChild(el); 135 | }); 136 | } 137 | 138 | 139 | function getSources(doc, styles) { 140 | var svgInfo = [], 141 | svgs = doc.querySelectorAll("svg"); 142 | 143 | styles = (styles === undefined) ? 
"" : styles; 144 | 145 | [].forEach.call(svgs, function (svg) { 146 | 147 | svg.setAttribute("version", "1.1"); 148 | 149 | var defsEl = document.createElement("defs"); 150 | svg.insertBefore(defsEl, svg.firstChild); //TODO .insert("defs", ":first-child") 151 | // defsEl.setAttribute("class", "svg-crowbar"); 152 | 153 | var styleEl = document.createElement("style") 154 | defsEl.appendChild(styleEl); 155 | styleEl.setAttribute("type", "text/css"); 156 | 157 | 158 | // removing attributes so they aren't doubled up 159 | svg.removeAttribute("xmlns"); 160 | svg.removeAttribute("xlink"); 161 | 162 | // These are needed for the svg 163 | if (!svg.hasAttributeNS(prefix.xmlns, "xmlns")) { 164 | svg.setAttributeNS(prefix.xmlns, "xmlns", prefix.svg); 165 | } 166 | 167 | if (!svg.hasAttributeNS(prefix.xmlns, "xmlns:xlink")) { 168 | svg.setAttributeNS(prefix.xmlns, "xmlns:xlink", prefix.xlink); 169 | } 170 | 171 | var source = (new XMLSerializer()).serializeToString(svg).replace('', ''); 172 | var rect = svg.getBoundingClientRect(); 173 | svgInfo.push({ 174 | top: rect.top, 175 | left: rect.left, 176 | width: rect.width, 177 | height: rect.height, 178 | class: svg.getAttribute("class"), 179 | id: svg.getAttribute("id"), 180 | childElementCount: svg.childElementCount, 181 | source: [doctype + source] 182 | }); 183 | }); 184 | return svgInfo; 185 | } 186 | 187 | function download(source) { 188 | var filename = "untitled"; 189 | 190 | if (source.id) { 191 | filename = source.id; 192 | } else if (source.class) { 193 | filename = source.class; 194 | } else if (window.document.title) { 195 | filename = window.document.title.replace(/[^a-z0-9]/gi, '-').toLowerCase(); 196 | } 197 | 198 | var url = window.URL.createObjectURL(new Blob(source.source, { "type" : "text\/xml" })); 199 | 200 | var a = document.createElement("a"); 201 | body.appendChild(a); 202 | a.setAttribute("class", "svg-crowbar"); 203 | a.setAttribute("download", filename + ".svg"); 204 | a.setAttribute("href", url); 205 | a.style["display"] = "none"; 206 | a.click(); 207 | 208 | setTimeout(function() { 209 | window.URL.revokeObjectURL(url); 210 | }, 10); 211 | } 212 | 213 | function getStyles(doc) { 214 | var styles = "", 215 | styleSheets = doc.styleSheets; 216 | 217 | if (styleSheets) { 218 | for (var i = 0; i < styleSheets.length; i++) { 219 | processStyleSheet(styleSheets[i]); 220 | } 221 | } 222 | 223 | function processStyleSheet(ss) { 224 | if (ss.cssRules) { 225 | for (var i = 0; i < ss.cssRules.length; i++) { 226 | var rule = ss.cssRules[i]; 227 | if (rule.type === 3) { 228 | // Import Rule 229 | processStyleSheet(rule.styleSheet); 230 | } else { 231 | // hack for illustrator crashing on descendent selectors 232 | if (rule.selectorText) { 233 | if (rule.selectorText.indexOf(">") === -1) { 234 | styles += "\n" + rule.cssText; 235 | } 236 | } 237 | } 238 | } 239 | } 240 | } 241 | return styles; 242 | } 243 | 244 | })(); 245 | -------------------------------------------------------------------------------- /adenine/examples/ade_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | from adenine.utils import data_source 11 | from adenine.utils import extra 12 | 13 | # 
-------------------------- EXPERMIENT INFO ------------------------- # 14 | exp_tag = 'debug' 15 | output_root_folder = 'results' 16 | file_format = 'png' # or 'png' 17 | plotting_context = 'paper' # one of {paper, notebook, talk, poster} 18 | 19 | # ---------------------------- INPUT DATA ---------------------------- # 20 | X, y, feat_names, class_names = data_source.load('iris') 21 | #X, y, feat_names, class_names = data_source.load('gauss', n_samples=300) 22 | # X, y, feat_names, class_names = data_source.load('circles') 23 | # X, y, feat_names, class_names = data_source.load('digits') 24 | # X, y, feat_names, class_names = data_source.load('diabetes') 25 | # X, y, feat_names, class_names = data_source.load('boston') 26 | # X, y, feat_names, class_names = data_source.load('custom', 'data/X.npy', 'data/y.npy') 27 | # X, y, feat_names, class_names = data_source.load('custom', 'data/X.csv', 'data/y.csv') 28 | 29 | # X, y, feat_names, class_names = data_source.load('custom', '/home/fede/src/adenine/adenine/examples/TM_matrix.csv') 30 | # X = extra.ensure_symmetry(X) 31 | # X = 1. - X # i want affinity 32 | 33 | # ----------------------- PIPELINE DEFINITION ------------------------ # 34 | 35 | # --- Missing Values Imputing --- # 36 | # step0 = {'Impute': [False, {'missing_values': 'NaN', 37 | # 'strategy': ['median','mean','nearest_neighbors']}]} 38 | 39 | # --- Data Preprocessing --- # 40 | step1 = {'None': [False], 'Recenter': [False], 'Standardize': [False], 41 | 'Normalize': [True, {'norm': ['l2']}], 42 | 'MinMax': [False, {'feature_range': [(0,1), (-1,1)]}]} 43 | 44 | # --- Dimensionality Reduction & Manifold Learning --- # 45 | step2 = {'PCA': [True, {'n_components': 2}], 46 | 'IncrementalPCA': [False, {'n_components': 3}], 47 | 'RandomizedPCA': [False, {'n_components': 3}], 48 | 'KernelPCA': [False, {'n_components': 2, 49 | 'kernel': ['linear','rbf','poly'], 'gamma': 2}], 50 | 'Isomap': [False, {'n_components': 3, 'n_neighbors': 5}], 51 | 'LLE': [False, {'n_components': 3, 'n_neighbors': 5, # xxx 52 | 'method': ['standard','modified','hessian','ltsa']}], 53 | 'SE': [False, {'n_components': 3, 'affinity': ['nearest_neighbors','rbf']}], # can be 'precomputed' 54 | 'MDS': [False, {'n_components': 3, 'metric': [True, False]}], 55 | 'tSNE': [False, {'n_components': 3}], 56 | 'RMB': [True, {'n_components': 256}], 57 | 'None': [False, {}] 58 | } 59 | 60 | # --- Clustering --- # 61 | step3 = {'KMeans': [False, {'n_clusters': [2]}], # cannot be 'precomputed' 62 | 'AP': [False, {'preference': ['auto']}], # can be 'precomputed' 63 | 'MS': [False], # cannot be 'precomputed' 64 | 'Spectral': [False, {'n_clusters': [2]}], # can be 'precomputed' 65 | 'Hierarchical': [False, {'n_clusters': [3], 66 | #'affinity': ['manhattan','euclidean'], 67 | 'affinity': ['euclidean'], 68 | #'linkage': ['ward','complete','average']}] 69 | 'linkage': ['ward']}] 70 | } 71 | -------------------------------------------------------------------------------- /adenine/examples/data/X.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/adenine/examples/data/X.npy -------------------------------------------------------------------------------- /adenine/examples/data/X_missing.csv: -------------------------------------------------------------------------------- 1 | nan,-1.156862383698490815e+00,2.325969437444890264e-01,-1.226910178531959300e-01 2 | nan,nan,1.662453908639822120e-01,nan 3 | 
nan,1.078908257010945171e+00,nan,-4.521696668277273012e-01 4 | nan,nan,nan,nan 5 | -7.075918621950672005e-01,nan,1.107003123417464430e+00,3.925003704893688106e-01 6 | nan,-9.307682348162302777e-01,nan,-1.238600366412410531e-01 7 | -6.394675777546744433e-01,nan,nan,nan 8 | -1.463084369363933934e+00,nan,1.495331971007898719e+00,nan 9 | nan,-9.088301530861653266e-01,-3.489239577414761095e-01,nan 10 | 4.314432145948063901e-01,1.981623502664172642e+00,-4.459802189090978919e-01,nan 11 | 6.142043420674385412e-02,-6.118179692142385884e-01,nan,nan 12 | 1.653774270442252892e+00,nan,-7.105778390461989780e-01,-4.253239641661243908e-01 13 | nan,7.938380876396712305e-01,nan,9.734189421253153229e-01 14 | nan,nan,-9.325734485898492521e-01,nan 15 | -4.692381743406294770e-01,nan,nan,-9.854055694219750194e-02 16 | 2.856917897910776771e-01,5.442976535996231213e-01,-9.027838452675212011e-01,nan 17 | 3.228887691158054407e-01,1.178646684094214914e+00,-8.115360478215190021e-01,nan 18 | nan,-8.872088861442941621e-01,nan,nan 19 | 2.767596836546558081e-01,nan,nan,3.134103300122419861e-02 20 | nan,nan,9.208184217167506569e-01,nan 21 | nan,1.446536613951119543e+00,nan,nan 22 | 6.346579181673047687e-01,nan,nan,nan 23 | 1.392886349437185034e+00,nan,-1.198599027981311460e+00,-1.132697590062041293e+00 24 | -1.434715432544310998e+00,1.026936160278541399e+00,1.124128303209935176e+00,nan 25 | nan,2.330603277258347372e-01,nan,nan 26 | nan,nan,1.142952665043716731e+00,nan 27 | nan,1.140373046240707788e+00,nan,4.270122455127892680e-01 28 | nan,-1.361512094713794196e+00,nan,nan 29 | 1.209893933270316246e+00,nan,-1.359040178533376775e+00,nan 30 | -4.502313522982358540e-01,nan,nan,nan 31 | 2.195009994421553146e-01,-6.037814919824179283e-01,nan,-3.282889270210308762e-03 32 | -9.284702738379502218e-01,nan,9.642905063476822081e-01,nan 33 | -1.210009835227054298e+00,9.288008161700008758e-01,7.662876912390784723e-01,6.563682753521845603e-01 34 | nan,nan,-1.080027626003194241e+00,nan 35 | nan,-7.893344721439290446e-01,3.208092641934174316e-01,-2.744291585393637267e-01 36 | 1.641059936670823949e-01,nan,-4.154673123178226346e-02,-1.690398094574822596e-01 37 | nan,7.584241169204418709e-01,nan,nan 38 | 6.565432358878454944e-02,nan,nan,nan 39 | nan,9.926811177344899706e-01,nan,nan 40 | -1.041542225925210924e-01,nan,nan,nan 41 | nan,9.143221607270377582e-01,nan,1.288919718901406775e+00 42 | nan,5.019338363269474357e-01,7.100040463856538420e-01,1.545841940359383937e+00 43 | -6.600020519878190273e-01,1.104240175105188904e+00,1.132483378627092474e+00,1.465827932390227684e+00 44 | 1.744402579749481652e-01,-1.587272191140688182e+00,2.497575268980154195e-01,-8.134691613104610974e-02 45 | nan,nan,nan,nan 46 | nan,7.501720661828157333e-01,nan,nan 47 | -3.342554750911648775e-01,-7.137329100818909922e-01,-2.407084197162502048e-01,-8.340083131232416125e-02 48 | nan,nan,nan,-9.673348217456705367e-01 49 | nan,nan,-2.391048052682311353e-01,nan 50 | nan,9.300718567010340943e-01,nan,nan 51 | 1.017586261554328741e+00,nan,nan,nan 52 | 1.303392086200408251e+00,1.176009676693779760e+00,nan,nan 53 | nan,4.836988853116996889e-01,-1.092880083369777822e+00,nan 54 | -9.319851610608677062e-01,1.397977267254582046e+00,nan,1.535659733771689517e+00 55 | -6.408969054855628844e-01,nan,nan,nan 56 | nan,-9.323550811636887037e-01,nan,3.632022074493482244e-01 57 | nan,nan,-3.625792307095718203e-01,nan 58 | 9.145070118313443075e-01,nan,nan,-6.360363342469680381e-01 59 | -1.007658038296989078e+00,nan,1.179926256982412269e+00,7.463843472733622253e-01 60 | 
-1.187303971214693110e+00,1.107806472262462982e+00,nan,1.585263871893016763e+00 61 | -9.031651321481354300e-01,nan,nan,1.321148563212554361e+00 62 | nan,7.999924887361942183e-01,-5.778430086333533611e-01,nan 63 | 1.059243736095132560e+00,1.522616402375389422e+00,-9.552541500540825403e-01,nan 64 | nan,1.609635090789223621e+00,nan,nan 65 | -1.606690560441900173e-01,-1.229925236813150580e+00,-3.819256182206764993e-01,7.356022877628196621e-01 66 | nan,7.062859368497632628e-01,nan,nan 67 | -1.411870672836224694e+00,nan,7.086113942358228668e-01,1.492798947017852651e+00 68 | -3.040514043122001242e-01,-5.813862246900292075e-01,nan,-2.108471835999596866e-01 69 | 8.507789005424097883e-01,8.773006104066740640e-01,-7.045392379849533260e-01,nan 70 | 3.074991953218891294e-01,-1.102530028510551263e+00,-5.869166677052294334e-01,1.138990811361808436e-01 71 | nan,nan,1.461701841444241090e+00,nan 72 | nan,nan,nan,nan 73 | nan,nan,-1.098651949849287046e+00,nan 74 | -1.095979133848796971e+00,1.455316797130397521e+00,1.227837823199790623e+00,nan 75 | nan,nan,nan,nan 76 | -6.120563267013116177e-02,nan,nan,-3.142630129259655902e-01 77 | nan,-6.724793646662509117e-01,-3.051652072034404806e-01,5.981953113846391057e-01 78 | -1.329461562127501661e+00,nan,1.783776000666230654e+00,nan 79 | -1.430891473540749415e+00,nan,nan,nan 80 | 1.398979808841118500e+00,1.213372537922308148e+00,-1.139778282564449130e+00,nan 81 | nan,nan,nan,nan 82 | nan,1.053890414736040837e+00,nan,-8.991149068911545861e-01 83 | nan,nan,nan,-1.715028446120915873e-01 84 | -2.975404547273184930e-01,-9.113023565308768781e-01,1.227527126258068924e-01,nan 85 | nan,nan,-8.837576863617241374e-01,nan 86 | 5.783526399497983528e-01,-1.568427441620841467e+00,nan,nan 87 | -2.836003143929887171e-01,nan,nan,nan 88 | nan,nan,6.143182346112907588e-02,nan 89 | nan,nan,nan,nan 90 | -8.537457775861589937e-02,9.917736724082389932e-01,-4.268716727368595532e-01,nan 91 | nan,-1.019031454530859415e+00,nan,nan 92 | nan,nan,1.042925054651744121e+00,9.984537828953213845e-01 93 | -1.995090873945649934e-01,nan,5.535149367168700207e-01,nan 94 | 2.228127146357133381e-01,nan,nan,nan 95 | nan,-1.382705804273485883e+00,nan,nan 96 | nan,nan,nan,nan 97 | 3.381358892249306525e-01,-1.326733296875020063e+00,nan,4.291618094613102175e-01 98 | nan,1.592912086579896691e+00,nan,5.686762988479203695e-01 99 | nan,nan,-7.593116778742476924e-01,nan 100 | nan,1.163667252112682515e+00,-6.318787198098960722e-01,nan 101 | -------------------------------------------------------------------------------- /adenine/examples/data/Y_missing_test.csv: -------------------------------------------------------------------------------- 1 | 1.000000000000000000e+00 2 | 1.000000000000000000e+00 3 | 2.000000000000000000e+00 4 | 0.000000000000000000e+00 5 | 0.000000000000000000e+00 6 | 1.000000000000000000e+00 7 | 0.000000000000000000e+00 8 | 0.000000000000000000e+00 9 | 1.000000000000000000e+00 10 | 2.000000000000000000e+00 11 | 1.000000000000000000e+00 12 | 2.000000000000000000e+00 13 | 0.000000000000000000e+00 14 | 2.000000000000000000e+00 15 | 1.000000000000000000e+00 16 | 2.000000000000000000e+00 17 | 2.000000000000000000e+00 18 | 1.000000000000000000e+00 19 | 1.000000000000000000e+00 20 | 0.000000000000000000e+00 21 | 2.000000000000000000e+00 22 | 2.000000000000000000e+00 23 | 2.000000000000000000e+00 24 | 0.000000000000000000e+00 25 | 2.000000000000000000e+00 26 | 0.000000000000000000e+00 27 | 0.000000000000000000e+00 28 | 1.000000000000000000e+00 29 | 2.000000000000000000e+00 30 | 0.000000000000000000e+00 31 
| 1.000000000000000000e+00 32 | 0.000000000000000000e+00 33 | 0.000000000000000000e+00 34 | 2.000000000000000000e+00 35 | 1.000000000000000000e+00 36 | 1.000000000000000000e+00 37 | 0.000000000000000000e+00 38 | 1.000000000000000000e+00 39 | 2.000000000000000000e+00 40 | 1.000000000000000000e+00 41 | 0.000000000000000000e+00 42 | 0.000000000000000000e+00 43 | 0.000000000000000000e+00 44 | 1.000000000000000000e+00 45 | 0.000000000000000000e+00 46 | 0.000000000000000000e+00 47 | 1.000000000000000000e+00 48 | 2.000000000000000000e+00 49 | 1.000000000000000000e+00 50 | 2.000000000000000000e+00 51 | 2.000000000000000000e+00 52 | 2.000000000000000000e+00 53 | 2.000000000000000000e+00 54 | 0.000000000000000000e+00 55 | 0.000000000000000000e+00 56 | 1.000000000000000000e+00 57 | 1.000000000000000000e+00 58 | 2.000000000000000000e+00 59 | 0.000000000000000000e+00 60 | 0.000000000000000000e+00 61 | 0.000000000000000000e+00 62 | 2.000000000000000000e+00 63 | 2.000000000000000000e+00 64 | 0.000000000000000000e+00 65 | 1.000000000000000000e+00 66 | 2.000000000000000000e+00 67 | 0.000000000000000000e+00 68 | 1.000000000000000000e+00 69 | 2.000000000000000000e+00 70 | 1.000000000000000000e+00 71 | 0.000000000000000000e+00 72 | 1.000000000000000000e+00 73 | 2.000000000000000000e+00 74 | 0.000000000000000000e+00 75 | 2.000000000000000000e+00 76 | 1.000000000000000000e+00 77 | 1.000000000000000000e+00 78 | 0.000000000000000000e+00 79 | 0.000000000000000000e+00 80 | 2.000000000000000000e+00 81 | 1.000000000000000000e+00 82 | 2.000000000000000000e+00 83 | 1.000000000000000000e+00 84 | 1.000000000000000000e+00 85 | 2.000000000000000000e+00 86 | 1.000000000000000000e+00 87 | 0.000000000000000000e+00 88 | 1.000000000000000000e+00 89 | 2.000000000000000000e+00 90 | 2.000000000000000000e+00 91 | 1.000000000000000000e+00 92 | 0.000000000000000000e+00 93 | 1.000000000000000000e+00 94 | 1.000000000000000000e+00 95 | 1.000000000000000000e+00 96 | 2.000000000000000000e+00 97 | 1.000000000000000000e+00 98 | 0.000000000000000000e+00 99 | 1.000000000000000000e+00 100 | 2.000000000000000000e+00 101 | -------------------------------------------------------------------------------- /adenine/examples/data/y.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/adenine/examples/data/y.npy -------------------------------------------------------------------------------- /adenine/externals/__init__.py: -------------------------------------------------------------------------------- 1 | from .hierarchical import AgglomerativeClustering 2 | -------------------------------------------------------------------------------- /adenine/test/Y_missing_test.csv: -------------------------------------------------------------------------------- 1 | 0.000000000000000000e+00 2 | 0.000000000000000000e+00 3 | 0.000000000000000000e+00 4 | 0.000000000000000000e+00 5 | 0.000000000000000000e+00 6 | 0.000000000000000000e+00 7 | 0.000000000000000000e+00 8 | 0.000000000000000000e+00 9 | 0.000000000000000000e+00 10 | 0.000000000000000000e+00 11 | 0.000000000000000000e+00 12 | 0.000000000000000000e+00 13 | 0.000000000000000000e+00 14 | 0.000000000000000000e+00 15 | 0.000000000000000000e+00 16 | 0.000000000000000000e+00 17 | 0.000000000000000000e+00 18 | 0.000000000000000000e+00 19 | 0.000000000000000000e+00 20 | 0.000000000000000000e+00 21 | 0.000000000000000000e+00 22 | 0.000000000000000000e+00 23 | 0.000000000000000000e+00 24 | 
0.000000000000000000e+00 25 | 0.000000000000000000e+00 26 | 0.000000000000000000e+00 27 | 0.000000000000000000e+00 28 | 0.000000000000000000e+00 29 | 0.000000000000000000e+00 30 | 0.000000000000000000e+00 31 | 0.000000000000000000e+00 32 | 0.000000000000000000e+00 33 | 0.000000000000000000e+00 34 | 0.000000000000000000e+00 35 | 0.000000000000000000e+00 36 | 0.000000000000000000e+00 37 | 0.000000000000000000e+00 38 | 0.000000000000000000e+00 39 | 0.000000000000000000e+00 40 | 0.000000000000000000e+00 41 | 0.000000000000000000e+00 42 | 0.000000000000000000e+00 43 | 0.000000000000000000e+00 44 | 0.000000000000000000e+00 45 | 0.000000000000000000e+00 46 | 0.000000000000000000e+00 47 | 0.000000000000000000e+00 48 | 0.000000000000000000e+00 49 | 0.000000000000000000e+00 50 | 0.000000000000000000e+00 51 | 1.000000000000000000e+00 52 | 1.000000000000000000e+00 53 | 1.000000000000000000e+00 54 | 1.000000000000000000e+00 55 | 1.000000000000000000e+00 56 | 1.000000000000000000e+00 57 | 1.000000000000000000e+00 58 | 1.000000000000000000e+00 59 | 1.000000000000000000e+00 60 | 1.000000000000000000e+00 61 | 1.000000000000000000e+00 62 | 1.000000000000000000e+00 63 | 1.000000000000000000e+00 64 | 1.000000000000000000e+00 65 | 1.000000000000000000e+00 66 | 1.000000000000000000e+00 67 | 1.000000000000000000e+00 68 | 1.000000000000000000e+00 69 | 1.000000000000000000e+00 70 | 1.000000000000000000e+00 71 | 1.000000000000000000e+00 72 | 1.000000000000000000e+00 73 | 1.000000000000000000e+00 74 | 1.000000000000000000e+00 75 | 1.000000000000000000e+00 76 | 1.000000000000000000e+00 77 | 1.000000000000000000e+00 78 | 1.000000000000000000e+00 79 | 1.000000000000000000e+00 80 | 1.000000000000000000e+00 81 | 1.000000000000000000e+00 82 | 1.000000000000000000e+00 83 | 1.000000000000000000e+00 84 | 1.000000000000000000e+00 85 | 1.000000000000000000e+00 86 | 1.000000000000000000e+00 87 | 1.000000000000000000e+00 88 | 1.000000000000000000e+00 89 | 1.000000000000000000e+00 90 | 1.000000000000000000e+00 91 | 1.000000000000000000e+00 92 | 1.000000000000000000e+00 93 | 1.000000000000000000e+00 94 | 1.000000000000000000e+00 95 | 1.000000000000000000e+00 96 | 1.000000000000000000e+00 97 | 1.000000000000000000e+00 98 | 1.000000000000000000e+00 99 | 1.000000000000000000e+00 100 | 1.000000000000000000e+00 101 | 2.000000000000000000e+00 102 | 2.000000000000000000e+00 103 | 2.000000000000000000e+00 104 | 2.000000000000000000e+00 105 | 2.000000000000000000e+00 106 | 2.000000000000000000e+00 107 | 2.000000000000000000e+00 108 | 2.000000000000000000e+00 109 | 2.000000000000000000e+00 110 | 2.000000000000000000e+00 111 | 2.000000000000000000e+00 112 | 2.000000000000000000e+00 113 | 2.000000000000000000e+00 114 | 2.000000000000000000e+00 115 | 2.000000000000000000e+00 116 | 2.000000000000000000e+00 117 | 2.000000000000000000e+00 118 | 2.000000000000000000e+00 119 | 2.000000000000000000e+00 120 | 2.000000000000000000e+00 121 | 2.000000000000000000e+00 122 | 2.000000000000000000e+00 123 | 2.000000000000000000e+00 124 | 2.000000000000000000e+00 125 | 2.000000000000000000e+00 126 | 2.000000000000000000e+00 127 | 2.000000000000000000e+00 128 | 2.000000000000000000e+00 129 | 2.000000000000000000e+00 130 | 2.000000000000000000e+00 131 | 2.000000000000000000e+00 132 | 2.000000000000000000e+00 133 | 2.000000000000000000e+00 134 | 2.000000000000000000e+00 135 | 2.000000000000000000e+00 136 | 2.000000000000000000e+00 137 | 2.000000000000000000e+00 138 | 2.000000000000000000e+00 139 | 2.000000000000000000e+00 140 | 2.000000000000000000e+00 141 | 
2.000000000000000000e+00 142 | 2.000000000000000000e+00 143 | 2.000000000000000000e+00 144 | 2.000000000000000000e+00 145 | 2.000000000000000000e+00 146 | 2.000000000000000000e+00 147 | 2.000000000000000000e+00 148 | 2.000000000000000000e+00 149 | 2.000000000000000000e+00 150 | 2.000000000000000000e+00 151 | -------------------------------------------------------------------------------- /adenine/test/carttest.py: -------------------------------------------------------------------------------- 1 | from adenine.utils.extra import modified_cartesian 2 | 3 | A = [(1,0), (2,0)] 4 | B = [(3,0),(4,0)] 5 | C = [] 6 | D = [(5,0),(6,0)] 7 | 8 | modified_cartesian(A,B,C,D) 9 | -------------------------------------------------------------------------------- /adenine/test/imputing_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | from __future__ import division 11 | 12 | import numpy as np 13 | 14 | from adenine.utils import data_source 15 | from adenine.utils.extensions import Imputer 16 | 17 | 18 | def test(missing_rate): 19 | """ 20 | Testing the KNN data imputing. 21 | """ 22 | Xreal, y, feat_names, class_names = data_source.load('iris') 23 | # Xreal, y, feat_names, class_names = data_source.load('gauss', n_samples=100) 24 | n, p = Xreal.shape 25 | print("{} x {} matrix loaded".format(n, p)) 26 | 27 | # Choose the missing rate 28 | # missing_rate = 0.5 29 | n_missing = int(missing_rate * (n*p)) 30 | 31 | # Create holes in the matrix 32 | np.random.seed(42) 33 | idx = np.random.permutation(n*p) 34 | xx = Xreal.ravel().copy() 35 | xx[idx[:n_missing]] = np.nan 36 | X = np.reshape(xx, (n, p)) 37 | print("{} values deleted".format(n_missing)) 38 | 39 | # Save data 40 | np.savetxt('X_missing.csv', X, delimiter=',') 41 | np.savetxt('Y_missing_test.csv', y, delimiter=',') 42 | 43 | # Start test 44 | strategies = ["mean", "median", "most_frequent", "nearest_neighbors"] 45 | 46 | imp = Imputer(strategy=strategies[3]) 47 | Ximp = imp.fit_transform(X) 48 | 49 | if len(np.where(np.isnan(Ximp))[0]) == 0: 50 | print("All values were imputed according to: {}-strategy".format(imp.strategy)) 51 | else: 52 | print("Empty values: {}".format(len(np.where(np.isnan(Ximp))[0]))) 53 | 54 | # Check results 55 | dist = np.sqrt(np.sum((Xreal[imp.X_mask,:].ravel() - Ximp.ravel())**2)) 56 | print("dist(Xreal - Ximp) = {}".format(dist)) 57 | 58 | # print(Ximp) 59 | 60 | 61 | def main(): 62 | for missing_rate in np.linspace(0.01, 0.3, 2): 63 | print("\nmissing rate: {}".format(missing_rate)) 64 | test(missing_rate) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /adenine/test/imputing_test_lite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | from __future__ import division 11 | 12 | import numpy as np 13 | 14 | from adenine.utils import 
data_source 15 | from adenine.utils.extensions import Imputer 16 | 17 | 18 | def test(missing_rate): 19 | """ 20 | Testing the KNN data imputing. 21 | """ 22 | np.random.seed(42) 23 | Xreal, y, feat_names, class_names = data_source.load('iris') 24 | # Xreal, y, feat_names, class_names = data_source.load('gauss', n_samples=80) 25 | n, p = Xreal.shape 26 | print("{} x {} matrix loaded".format(n, p)) 27 | 28 | # Choose the missing rate 29 | # missing_rate = 0.5 30 | n_missing = int(missing_rate * (n*p)) 31 | 32 | # Create holes in the matrix 33 | idx = np.random.permutation(n*p) 34 | xx = Xreal.ravel().copy() 35 | xx[idx[:n_missing]] = np.nan 36 | X = np.reshape(xx, (n, p)) 37 | # X[0,:] = np.nan 38 | print("{} values deleted".format(n_missing)) 39 | 40 | # Save data 41 | np.savetxt('X_missing.csv', X, delimiter=',') 42 | np.savetxt('Y_missing_test.csv', y, delimiter=',') 43 | 44 | # Start test 45 | strategies = ["mean", "median", "most_frequent", "nearest_neighbors"] 46 | 47 | imp = Imputer(strategy=strategies[3]) 48 | Ximp = imp.fit_transform(X) 49 | # Xtr = X[:50, :] 50 | # Xts = X[50:, :] 51 | # imp.fit(Xtr) 52 | # Ximp = imp.transform(Xts) 53 | 54 | if len(np.where(np.isnan(Ximp))[0]) == 0: 55 | print("All values were imputed according to: {}-strategy".format(imp.strategy)) 56 | else: 57 | print("Empty values: {}".format(len(np.where(np.isnan(Ximp))[0]))) 58 | 59 | # Check results 60 | dist = np.sqrt(np.sum((Xreal[imp.X_mask,:].ravel() - Ximp.ravel())**2)) 61 | print("dist(Xreal - Ximp) = {}".format(dist)) 62 | 63 | 64 | 65 | def main(): 66 | # for missing_rate in np.linspace(0.01, 0.3, 2): 67 | missing_rate = 0.3 68 | print("\nmissing rate: {}".format(missing_rate)) 69 | test(missing_rate) 70 | 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /adenine/utils/GEO2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """This module contains utility functions for GEO DataSets wrangling.""" 5 | 6 | ###################################################################### 7 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 8 | # 9 | # FreeBSD License 10 | ###################################################################### 11 | 12 | import GEOparse 13 | import logging 14 | import os 15 | import pandas as pd 16 | from sklearn import datasets 17 | from six.moves import filter 18 | 19 | 20 | def get_GEO(accession_number, phenotype_name='title', return_gse=False): 21 | """Get the GEO data from its accession number. 22 | 23 | Parameters 24 | ----------- 25 | accession_number : string 26 | 'GSEXXXXX' is any GEO accession ID loaded by `GEOparse`. 
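    phenotype_name : string, default : 'title'
        Column of `gse.phenotype_data` used to build the target (label) vector.

    return_gse : bool, default : False
        If True, the `GEOparse` GSE object is also returned together with the
        data bunch.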
27 | 28 | Returns 29 | ----------- 30 | data : sklearn.datasets.base.Bunch 31 | the dataset bunch 32 | gse : GEOparse.GEOTypes.GSE 33 | the GEOparse object 34 | """ 35 | gse = GEOparse.get_GEO(geo=accession_number, destdir=os.curdir, 36 | silent=True, include_data=True, 37 | how='full') 38 | xx = gse.pivot_samples('VALUE').transpose() 39 | index = xx.index.tolist() 40 | feature_names = xx.columns.tolist() 41 | yy = gse.phenotype_data[phenotype_name] 42 | data = datasets.base.Bunch(data=xx.values, target=yy.values, 43 | feature_names=feature_names, 44 | index=index) 45 | 46 | 47 | print('* Desired labels can be found with --label_field = ') 48 | for k in gse.phenotype_data.keys(): 49 | print('\t{}'.format(k)) 50 | 51 | out = [data] 52 | if return_gse: 53 | out.append(gse) 54 | 55 | return out 56 | 57 | 58 | def label_mapper(raw_labels, new_labels): 59 | """Map some raw labels into new labels. 60 | 61 | When dealing with GEO DataSets it is very common that each GSM sample has 62 | a different phenotye (e.g. 'Brain - 001', 'Brain - 002', ...). This 63 | function maps these raw labels into new homogeneous labels. 64 | 65 | Parameters 66 | ----------- 67 | raw_labels : list of strings 68 | list of unpreprocessed labels 69 | new_labels : list of strings 70 | list of labels to map 71 | 72 | Returns 73 | ----------- 74 | y : array of float, shape : n_samples 75 | the modified label vector 76 | 77 | Examples 78 | ----------- 79 | >>> raw_labels = ['Brain - 001', 'Brain - 002', 'Muscle - 001', 'Muscle - 002'] 80 | >>> label_mapper(raw_labels, ['Brain', 'Muscle']) 81 | ['Brain', 'Brain', 'Muscle', 'Muscle'] 82 | """ 83 | y = [] 84 | for rl in raw_labels: 85 | for nl in new_labels: 86 | if nl in rl: 87 | y.append(nl) 88 | break 89 | else: 90 | y.append(rl) 91 | # print('No mapping rule for %s', rl) 92 | return y 93 | 94 | 95 | def GEO_select_samples(data, labels, selected_labels, index, 96 | feature_names=None): 97 | """GEO DataSets data selection tool. 98 | 99 | Modify the labels with `label_mapper` then return only the samples with 100 | labels in selected_labels. 101 | 102 | Parameters 103 | ----------- 104 | data : array of float, shape : n_samples x n_features 105 | the dataset 106 | labels : numpy array (n_samples,) 107 | the labels vector 108 | selected_labels : list of strings 109 | a subset of new_labels containing only the samples wanted in the 110 | final dataset 111 | index : list of strings 112 | the sample indexes 113 | feature_names : list of strings 114 | the feature set 115 | samples_on : string in ['col', 'cols', 'row', 'rows'] 116 | wether the samples are on columns or rows 117 | 118 | Returns 119 | ----------- 120 | data : sklearn.datasets.base.Bunch 121 | An instance of the sklearn.datasets.base.Bunch class, the meaningful 122 | attributes are .data, the data matrix, and .target, the label vector. 123 | """ 124 | mapped_y = pd.DataFrame(data=label_mapper(labels, selected_labels), 125 | index=index, columns=['Phenotype']) 126 | y = mapped_y[mapped_y['Phenotype'].isin(selected_labels)] 127 | X = pd.DataFrame(data, index=index, columns=feature_names).loc[y.index] 128 | return datasets.base.Bunch(data=X.values, feature_names=X.columns, 129 | target=y.values.ravel(), index=X.index.tolist()) 130 | 131 | def id2gs(data, gse): 132 | """Convert IDs into GENE_SYMBOL. 
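    Each probe ID in `data.feature_names` is mapped to the `GENE_SYMBOL`
    column of the GSE platform table; probes without a matching symbol keep
    their original ID with a `__NO-MATCH` suffix.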
133 | 134 | Parameters 135 | ----------- 136 | data : sklearn.datasets.base.Bunch 137 | the dataset bunch 138 | gse : GEOparse.GEOTypes.GSE 139 | the GEOparse object 140 | 141 | Returns 142 | ----------- 143 | data : sklearn.datasets.base.Bunch 144 | where feature_names has the gene symbols 145 | """ 146 | # Get the platform name 147 | platform = gse.gpls.keys()[0] 148 | 149 | # Create the lookup table 150 | lookup_table = pd.DataFrame(data=gse.gpls[platform].table['GENE_SYMBOL'].values, 151 | index=gse.gpls[platform].table['ID'].values, 152 | columns=['GENE_SYMBOL']) 153 | # Correct NaN failures 154 | for i, lt_value in enumerate(lookup_table.values.ravel()): 155 | if pd.isnull(lt_value): 156 | lookup_table.values[i] = str(lookup_table.index[i])+'__NO-MATCH' 157 | gene_symbol = [lookup_table['GENE_SYMBOL'].loc[_id] for _id in data.feature_names] 158 | 159 | # Make bunch and return 160 | return datasets.base.Bunch(data=data.data, feature_names=gene_symbol, 161 | target=data.target, index=data.index) 162 | 163 | 164 | def restrict_to_signature(data, signature): 165 | """Restrict the data to the genes in the signature. 166 | 167 | Parameters 168 | ----------- 169 | data : sklearn.datasets.base.Bunch 170 | the dataset bunch 171 | signature : list 172 | list of signature genes 173 | 174 | Returns 175 | ----------- 176 | data : sklearn.datasets.base.Bunch 177 | where feature_names has the gene symbols restricted to signature 178 | """ 179 | df = pd.DataFrame(data=data.data, index=data.index, 180 | columns=data.feature_names) 181 | # Filter out signatures gene not in the gene set 182 | signature = list(filter(lambda x: x in data.feature_names, signature)) 183 | df = df[signature] 184 | # Make bunch and return 185 | return datasets.base.Bunch(data=df.values, feature_names=df.columns, 186 | target=data.target, index=data.index) 187 | -------------------------------------------------------------------------------- /adenine/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 3 | # 4 | # FreeBSD License 5 | ###################################################################### 6 | -------------------------------------------------------------------------------- /adenine/utils/data_source.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """This module is mainly a wrapper for some sklearn.datasets functions.""" 5 | 6 | ###################################################################### 7 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 8 | # 9 | # FreeBSD License 10 | ###################################################################### 11 | import sys 12 | import os 13 | import logging 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn import datasets 17 | from sklearn.preprocessing import Binarizer 18 | 19 | # Legacy import 20 | try: 21 | from sklearn.model_selection import StratifiedShuffleSplit 22 | except ImportError: 23 | from sklearn.cross_validation import StratifiedShuffleSplit 24 | 25 | 26 | def generate_gauss(mu=None, std=None, n_sample=None): 27 | """Create a Gaussian dataset. 28 | 29 | Generates a dataset with n_sample * n_class examples and n_dim dimensions. 
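    For instance, with `mu` of shape (3, 4), `std` of length 3 and
    `n_sample=100`, the returned `X` has shape (300, 4) and `y` holds the
    class labels 0, 1 and 2.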
30 | 31 | Parameters 32 | ----------- 33 | mu : array of float, shape : n_class x n_dim 34 | The mean of each class. 35 | 36 | std : array of float, shape : n_class 37 | The standard deviation of each Gaussian distribution. 38 | 39 | n_sample : int 40 | Number of point per class. 41 | """ 42 | n_class, n_var = mu.shape 43 | 44 | X = np.zeros((n_sample * n_class, n_var)) 45 | y = np.zeros(n_sample * n_class, dtype=int) 46 | 47 | start = 0 48 | for i, s, m in zip(range(n_class), std, mu): 49 | end = start + n_sample 50 | X[start:end, :] = s * np.random.randn(n_sample, n_var) + m 51 | y[start:end] = i 52 | start = end 53 | 54 | return X, y 55 | 56 | 57 | def load_custom(x_filename, y_filename, samples_on='rows', **kwargs): 58 | """Load a custom dataset. 59 | 60 | This function loads the data matrix and the label vector returning a 61 | unique sklearn-like object dataSetObj. 62 | 63 | Parameters 64 | ----------- 65 | x_filename : string 66 | The data matrix file name. 67 | 68 | y_filename : string 69 | The label vector file name. 70 | 71 | samples_on : string 72 | This can be either in ['row', 'rows'] if the samples lie on the row of 73 | the input data matrix, or viceversa in ['col', 'cols'] the other way 74 | around. 75 | 76 | kwargs : dict 77 | Arguments of pandas.read_csv function. 78 | 79 | Returns 80 | ----------- 81 | data : sklearn.datasets.base.Bunch 82 | An instance of the sklearn.datasets.base.Bunch class, the meaningful 83 | attributes are .data, the data matrix, and .target, the label vector. 84 | """ 85 | if x_filename is None: 86 | raise IOError("Filename for X must be specified with mode 'custom'.") 87 | 88 | if x_filename.endswith('.npy'): # it an .npy file is provided 89 | try: # labels are not mandatory 90 | y = np.load(y_filename) 91 | except IOError as e: 92 | y = None 93 | e.strerror = "No labels file provided" 94 | logging.error("I/O error({0}): {1}".format(e.errno, e.strerror)) 95 | X = np.load(x_filename) 96 | if samples_on not in ['row', 'rows']: 97 | # data matrix must be n_samples x n_features 98 | X = X.T 99 | return datasets.base.Bunch(data=X, target=y, 100 | index=np.arange(X.shape[0])) 101 | 102 | elif x_filename.endswith('.csv') or x_filename.endswith('.txt'): 103 | y = None 104 | kwargs.setdefault('header', 0) # header on first row 105 | kwargs.setdefault('index_col', 0) # indexes on first 106 | try: 107 | dfx = pd.read_csv(x_filename, **kwargs) 108 | if samples_on not in ['row', 'rows']: 109 | # data matrix must be n_samples x n_features 110 | dfx = dfx.transpose() 111 | if y_filename is not None: 112 | # Before loading labels, remove parameters that were likely 113 | # specified for data only. 114 | kwargs.pop('usecols', None) 115 | y = pd.read_csv(y_filename, **kwargs).as_matrix().ravel() 116 | 117 | except IOError as e: 118 | e.strerror = "Can't open {} or {}".format(x_filename, y_filename) 119 | logging.error("I/O error({0}): {1}".format(e.errno, e.strerror)) 120 | sys.exit(-1) 121 | 122 | return datasets.base.Bunch(data=dfx.as_matrix(), feature_names=dfx.columns.tolist(), 123 | target=y, index=dfx.index.tolist()) 124 | 125 | 126 | def load(opt='custom', x_filename=None, y_filename=None, n_samples=0, 127 | samples_on='rows', **kwargs): 128 | """Load a specified dataset. 129 | 130 | This function can be used either to load one of the standard scikit-learn 131 | datasets or a different dataset saved as X.npy Y.npy in the working 132 | directory. 
133 | 134 | Parameters 135 | ----------- 136 | opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons', 137 | 'custom', 'GSEXXXXX'}, default: 'custom' 138 | Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes' 139 | 'boston', 'circles' and 'moons' refer to the correspondent 140 | `scikit-learn` datasets. 'custom' can be used to load a custom dataset 141 | which name is specified in `x_filename` and `y_filename` (optional). 142 | 143 | x_filename : string, default : None 144 | The data matrix file name. 145 | 146 | y_filename : string, default : None 147 | The label vector file name. 148 | 149 | n_samples : int 150 | The number of samples to be loaded. This comes handy when dealing with 151 | large datasets. When n_samples is less than the actual size of the 152 | dataset this function performs a random subsampling that is stratified 153 | w.r.t. the labels (if provided). 154 | 155 | samples_on : string 156 | This can be either in ['row', 'rows'] if the samples lie on the row of 157 | the input data matrix, or viceversa in ['col', 'cols'] the other way 158 | around. 159 | 160 | data_sep : string 161 | The data separator. For instance comma, tab, blank space, etc. 162 | 163 | Returns 164 | ----------- 165 | X : array of float, shape : n_samples x n_features 166 | The input data matrix. 167 | 168 | y : array of float, shape : n_samples 169 | The label vector; np.nan if missing. 170 | 171 | feature_names : array of integers (or strings), shape : n_features 172 | The feature names; a range of number if missing. 173 | 174 | index : list of integers (or strings) 175 | This is the samples identifier, if provided as first column (or row) of 176 | of the input file. Otherwise it is just an incremental range of size 177 | n_samples. 
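    Examples
    -----------
    A minimal usage sketch (shapes refer to the scikit-learn iris dataset):

    >>> X, y, feat_names, index = load('iris')
    >>> X.shape
    (150, 4)
    >>> len(y), len(index)
    (150, 150)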
178 | """ 179 | data = None 180 | try: 181 | if opt.lower() == 'iris': 182 | data = datasets.load_iris() 183 | elif opt.lower() == 'digits': 184 | data = datasets.load_digits() 185 | elif opt.lower() == 'diabetes': 186 | data = datasets.load_diabetes() 187 | b = Binarizer(threshold=np.mean(data.target)) 188 | data.target = b.fit_transform(data.data) 189 | elif opt.lower() == 'boston': 190 | data = datasets.load_boston() 191 | b = Binarizer(threshold=np.mean(data.target)) 192 | data.target = b.fit_transform(data.data) 193 | elif opt.lower() == 'gauss': 194 | means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]]) 195 | sigmas = np.array([0.33, 0.33, 0.33]) 196 | if n_samples <= 1: 197 | n_samples = 333 198 | xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples) 199 | data = datasets.base.Bunch(data=xx, target=yy) 200 | elif opt.lower() == 'circles': 201 | if n_samples == 0: 202 | n_samples = 400 203 | xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3, 204 | noise=.05) 205 | data = datasets.base.Bunch(data=xx, target=yy) 206 | elif opt.lower() == 'moons': 207 | if n_samples == 0: 208 | n_samples = 400 209 | xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01) 210 | data = datasets.base.Bunch(data=xx, target=yy) 211 | elif opt.lower() == 'custom': 212 | data = load_custom(x_filename, y_filename, samples_on, **kwargs) 213 | elif opt.lower().startswith('gse'): 214 | raise Exception("Use ade_GEO2csv.py to convert GEO DataSets" 215 | "into csv files.") 216 | except IOError as e: 217 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 218 | 219 | X, y = data.data, data.target 220 | if n_samples > 0 and X.shape[0] > n_samples: 221 | if y is not None: 222 | try: # Legacy for sklearn 223 | sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1) 224 | # idx = np.random.permutation(X.shape[0])[:n_samples] 225 | except TypeError: 226 | sss = StratifiedShuffleSplit(test_size=n_samples) \ 227 | .split(X, y) 228 | _, idx = list(sss)[0] 229 | else: 230 | idx = np.arange(X.shape[0]) 231 | np.random.shuffle(idx) 232 | idx = idx[:n_samples] 233 | 234 | X, y = X[idx, :], y[idx] 235 | else: 236 | # The length of index must be consistent with the number of samples 237 | idx = np.arange(X.shape[0]) 238 | 239 | feat_names = data.feature_names if hasattr(data, 'feature_names') \ 240 | else np.arange(X.shape[1]) 241 | index = np.array(data.index)[idx] if hasattr(data, 'index') \ 242 | else np.arange(X.shape[0]) 243 | 244 | return X, y, feat_names, index 245 | -------------------------------------------------------------------------------- /adenine/utils/extra.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | import os 11 | import time 12 | import matplotlib; matplotlib.use('Agg') 13 | import seaborn as sns 14 | 15 | from datetime import datetime 16 | from itertools import product 17 | 18 | 19 | class Palette(): 20 | """Wrapper for seaborn palette.""" 21 | 22 | def __init__(self, name='Set1', n_colors=6): 23 | self.name = name 24 | self.palette = sns.color_palette(name, n_colors) 25 | 26 | def get(self, i=0): 27 | return self.palette[i] 28 | 29 | def next(self): 30 | self.palette.append(self.palette.pop(0)) 31 | return 
self.palette[-1] 32 | 33 | def reset(self, n_colors=6): 34 | self.palette = sns.color_palette(self.name, n_colors) 35 | 36 | 37 | # ensure_list = lambda x: x if type(x) == list else [x] 38 | def ensure_list(x): 39 | return x if type(x) == list else [x] 40 | 41 | 42 | def values_iterator(dictionary): 43 | """Add support for python2 or 3 dictionary iterators.""" 44 | try: 45 | v = dictionary.itervalues() # python 2 46 | except: 47 | v = dictionary.values() # python 3 48 | return v 49 | 50 | 51 | def items_iterator(dictionary): 52 | """Add support for python2 or 3 dictionary iterators.""" 53 | try: 54 | gen = dictionary.iteritems() # python 2 55 | except: 56 | gen = dictionary.items() # python 3 57 | return gen 58 | 59 | 60 | def modified_cartesian(*args, **kwargs): 61 | """Modified Cartesian product. 62 | 63 | This takes two (or more) lists and returns their Cartesian product. 64 | If one of two list is empty this function returns the non-empty one. 65 | 66 | Parameters 67 | ----------- 68 | *args : lists, length : two or more 69 | The group of input lists. 70 | 71 | Returns 72 | ----------- 73 | cp : list 74 | The Cartesian Product of the two (or more) nonempty input lists. 75 | """ 76 | # Get the non-empty input lists 77 | if kwargs.get('pipes_mode', False): 78 | nonempty = [ensure_list(arg) for arg in args if len(ensure_list(arg)) > 0] 79 | else: 80 | nonempty = [ensure_list(arg) if len(ensure_list(arg)) > 0 else [None] for arg in args] 81 | 82 | # Cartesian product 83 | return [list(c) for c in product(*nonempty)] 84 | 85 | 86 | def make_time_flag(): 87 | """Generate a time flag. 88 | 89 | This function simply generates a time flag using the current time. 90 | 91 | Returns 92 | ----------- 93 | timeFlag : string 94 | A unique time flag. 95 | """ 96 | y = str(time.localtime().tm_year) 97 | mo = str(time.localtime().tm_mon) 98 | d = str(time.localtime().tm_mday) 99 | h = str(time.localtime().tm_hour) 100 | mi = str(time.localtime().tm_min) 101 | s = str(time.localtime().tm_sec) 102 | return h + ':' + mi + ':' + s + '_' + d + '-' + mo + '-' + y 103 | 104 | 105 | def sec_to_time(seconds): 106 | """Transform seconds into a formatted time string. 107 | 108 | Parameters 109 | ----------- 110 | seconds : int 111 | Seconds to be transformed. 112 | 113 | Returns 114 | ----------- 115 | time : string 116 | A well formatted time string. 117 | """ 118 | m, s = divmod(seconds, 60) 119 | h, m = divmod(m, 60) 120 | return "%02d:%02d:%02d" % (h, m, s) 121 | 122 | 123 | def get_time(): 124 | """Get time of now, in string.""" 125 | return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H.%M.%S') 126 | 127 | 128 | def title_from_filename(root, step_sep="$\mapsto$"): 129 | # Define the plot title. List is smth like ['results', 'ade_debug_', 'Standardize', 'PCA'] 130 | i = [i for i, s in enumerate(root.split(os.sep)) if 'ade_' in s][0] 131 | 132 | # lambda function below does: ('a_b_c') -> 'c b a' 133 | return step_sep.join(map(lambda x: ' '.join(x.split('_')[::-1]), root.split(os.sep)[i+1:])) 134 | 135 | 136 | def ensure_symmetry(X): 137 | """Ensure matrix symmetry. 138 | 139 | Parameters 140 | ----------- 141 | X : numpy.ndarray 142 | Input matrix of precomputed pairwise distances. 143 | 144 | Returns 145 | ----------- 146 | new_X : numpy.ndarray 147 | Symmetric distance matrix. Values are averaged. 148 | """ 149 | if not (X.T == X).all(): 150 | return (X.T + X) / 2. 
151 | else: 152 | return X 153 | 154 | 155 | def timed(function): 156 | """Decorator that measures wall time of the decored function.""" 157 | def timed_function(*args, **kwargs): 158 | t0 = time.time() 159 | result = function(*args, **kwargs) 160 | print("\nAdenine {} - Elapsed time : {} s\n" 161 | .format(function.__name__, sec_to_time(time.time() - t0))) 162 | return result 163 | return timed_function 164 | 165 | 166 | def set_module_defaults(module, dictionary): 167 | """Set default variables of a module, given a dictionary. 168 | 169 | Used after the loading of the configuration file to set some defaults. 170 | """ 171 | for k, v in items_iterator(dictionary): 172 | try: 173 | getattr(module, k) 174 | except AttributeError: 175 | setattr(module, k, v) 176 | -------------------------------------------------------------------------------- /adenine/utils/scores.py: -------------------------------------------------------------------------------- 1 | """Validation utils for clustering algorithms. 2 | 3 | Notes 4 | ----- 5 | Precision, recall and F score 6 | In multiclass classification / clustering, a confusion matrix can be 7 | obtained. To validate the result, one can use precision, recall and 8 | f score. These are obtained using TP, FP, FN, TN. 9 | In particular, for each class (true label) x, in a confusion matrix cm: 10 | - true positive: diagonal position, cm(x, x). 11 | - false positive: sum of column x (without main diagonal), 12 | sum(cm(:, x)) - cm(x, x). 13 | - false negative: sum of row x (without main diagonal), 14 | sum(cm(x, :), 2) - cm(x, x). 15 | - true negative: sum of all the matrix without tp, fp, fn. 16 | 17 | Averaging over all classes (with or without weighting) gives values for the 18 | entire model. 19 | 20 | Author: Federico Tomasi 21 | Copyright (c) 2016, Federico Tomasi. 22 | Licensed under the FreeBSD license (see LICENSE.txt). 
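Example
-------
With the confusion matrix cm = [[5, 1], [2, 4]] (illustrative values) and
true label x = 0 (first row): tp = cm(0, 0) = 5, fp = sum(cm(:, 0)) - tp = 2,
fn = sum(cm(0, :)) - tp = 1, tn = 12 - tp - fp - fn = 4.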
23 | """ 24 | import matplotlib; matplotlib.use('Agg') 25 | import numpy as np 26 | import pandas as pd 27 | import seaborn as sns 28 | 29 | 30 | def get_clones_real_estimated(filename): 31 | """Get true and estimated labels from a partis-generated dataset.""" 32 | df = pd.read_csv(filename, dialect='excel-tab', header=0, 33 | usecols=('SEQUENCE_ID', 'CLONE')) 34 | df['CLONE_ID'] = df['SEQUENCE_ID'].str.split('_').apply(lambda x: x[3]) 35 | 36 | clone_ids = np.array(df['CLONE_ID'], dtype=str) 37 | found_ids = np.array(df['CLONE'], dtype=str) 38 | return clone_ids, found_ids 39 | 40 | 41 | def order_cm(cm): 42 | """Reorder a multiclass confusion matrix.""" 43 | # reorder rows 44 | idx_rows = np.max(cm, axis=1).argsort()[::-1] 45 | b = cm[idx_rows, :] 46 | 47 | # reorder cols 48 | max_idxs = np.ones(b.shape[1], dtype=bool) 49 | final_idxs = [] 50 | for i, row in enumerate(b.copy()): 51 | if i == b.shape[0] or not max_idxs.any(): 52 | break 53 | row[~max_idxs] = np.min(cm) - 1 54 | max_idx = np.argmax(row) 55 | final_idxs.append(max_idx) 56 | max_idxs[max_idx] = False 57 | 58 | idx_cols = np.append(np.array(final_idxs, dtype=int), 59 | np.argwhere(max_idxs).T[0]) # residuals 60 | 61 | # needs also this one 62 | b = b[:, idx_cols] 63 | bb = b.copy() 64 | max_idxs = np.ones(b.shape[0], dtype=bool) 65 | final_idxs = [] 66 | for i in range(b.shape[1]): 67 | # for each column 68 | if i == b.shape[1] or not max_idxs.any(): 69 | break 70 | col = bb[:, i] 71 | col[~max_idxs] = -1 72 | max_idx = np.argmax(col) 73 | final_idxs.append(max_idx) 74 | max_idxs[max_idx] = False 75 | 76 | idx_rows2 = np.append(np.array(final_idxs, dtype=int), 77 | np.argwhere(max_idxs).T[0]) # residuals 78 | 79 | idx = np.argsort(idx_rows) 80 | return b[idx_rows2, :], idx_rows2[idx], idx_cols 81 | 82 | 83 | def confusion_matrix(true_labels, estimated_labels, ordered=True): 84 | """Return a confusion matrix in a multiclass / multilabel problem.""" 85 | true_labels = np.array(true_labels, dtype=str) 86 | estimated_labels = np.array(estimated_labels, dtype=str) 87 | if true_labels.shape[0] != estimated_labels.shape[0]: 88 | raise ValueError("Inputs must have the same dimensions.") 89 | rows = np.unique(true_labels) 90 | cols = np.unique(estimated_labels) 91 | 92 | # padding only on columns 93 | cm = np.zeros((rows.shape[0], max(cols.shape[0], rows.shape[0]))) 94 | from collections import Counter 95 | for i, row in enumerate(rows): 96 | idx_rows = true_labels == row 97 | counter = Counter(estimated_labels[idx_rows]) 98 | for g in counter: 99 | idx_col = np.where(cols == g)[0][0] 100 | cm[i, idx_col] += counter[g] 101 | 102 | cols = np.append(cols, ['pad'] * (cm.shape[1] - cols.shape[0])) 103 | if ordered: 104 | cm, rr, cc = order_cm(cm) 105 | rows, cols = rows[rr], cols[cc] 106 | return cm, rows, cols 107 | 108 | 109 | def precision_recall_fscore(a, method='micro', beta=1.): 110 | """Return a precision / recall value for multiclass confuison matrix cm. 
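    With method='micro' the per-class TP, FP and FN counts are summed over
    all classes before computing the ratios, while with method='macro' the
    per-class precision and recall values are averaged (classes with an empty
    denominator are skipped). The F score is then
    (1 + beta^2) * precision * recall / (beta^2 * precision + recall).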
111 | 112 | See 113 | http://stats.stackexchange.com/questions/44261/how-to-determine-the-quality-of-a-multiclass-classifier 114 | """ 115 | def _single_measures(a, i): 116 | tp = a[i, i] 117 | fp = np.sum(a[:, i]) - tp 118 | fn = np.sum(a[i, :]) - tp 119 | tn = a.sum() - tp - fp - fn 120 | return tp, fp, fn, tn 121 | 122 | singles = zip(*[_single_measures(a, i) for i in range(min(a.shape))]) 123 | tps, fps, fns, tns = map(lambda x: np.array(list(x), dtype=float), singles) 124 | 125 | if method == 'micro': 126 | precision = float(tps.sum()) / (tps + fps).sum() 127 | recall = float(tps.sum()) / (tps + fns).sum() 128 | elif method == 'macro': 129 | sum_ = tps + fps 130 | idx = np.where(sum_) 131 | precision = (tps[idx] / sum_[idx]).mean() 132 | 133 | sum_ = tps + fns 134 | idx = np.where(sum_) 135 | recall = (tps[idx] / sum_[idx]).mean() 136 | fscore = (1 + beta * beta) * precision * recall / \ 137 | (beta * beta * precision + recall) 138 | return precision, recall, fscore 139 | 140 | 141 | def show_heatmap(filename): 142 | """Show confusion matrix given of a partis-generated tab-delimited db.""" 143 | true_labels, estimated_labels = get_clones_real_estimated(filename) 144 | cm, rows, cols = confusion_matrix(true_labels, estimated_labels) 145 | df = pd.DataFrame(cm, index=rows, columns=cols) 146 | sns.heatmap(df) 147 | sns.plt.show() 148 | -------------------------------------------------------------------------------- /adenine/utils/templates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | def new_fun(arg1 = 'Default', arg2 = 'Default'): 11 | """Short explanation. 12 | 13 | This is the very long explanation 14 | 15 | Parameters 16 | ----------- 17 | arg1 : type, default : 'Default' 18 | What is arg1. 19 | 20 | arg2 : {'Default', 'Different', 'Another'} 21 | What is arg2. 22 | 23 | Returns 24 | ----------- 25 | out : type 26 | What is out. 27 | """ 28 | -------------------------------------------------------------------------------- /doc/GiHubProjectPage.txt: -------------------------------------------------------------------------------- 1 | ### Welcome to ADENINE. 2 | ADENINE is a machine learning and data mining framework that helps you answering the tedious question: are my data relevant for the problem I'm dealing with? 3 | 4 | ### Implementation 5 | With ADENINE you can build different unsupervised data analysis pipelines made of the following steps: 6 | 7 | 1. missing values imputing 8 | 2. preprocessing 9 | 3. dimensionality reduction 10 | 4. clustering 11 | 12 | a list of the most common state-of-the-art methods is available for each step. 13 | 14 | ### Dependencies 15 | ADENINE is developed using Python 2.7 and inherits its main functionalities from: 16 | * numpy 17 | * scipy 18 | * scikit-learn 19 | * matplotlib 20 | * seaborn 21 | 22 | ### Authors and Contributors 23 | Current developers: Samuele Fiorini (@samuelefiorini) and Federico Tomasi (@fdtomasi). 24 | 25 | ### Support or Contact 26 | Having trouble with ADENINE? 
Check out our [documentation](http://www.slipguru.unige.it/Software/adenine/) or contact us: 27 | * samuele [dot] fiorini [at] dibris [dot] unige [dot] it 28 | * federico [dot] tomasi [at] dibris [dot] unige [dot] it 29 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/adenine.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/adenine.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/adenine" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/adenine" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 
157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/devPlan/plan.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/doc/devPlan/plan.pdf -------------------------------------------------------------------------------- /doc/devPlan/plan.tex: -------------------------------------------------------------------------------- 1 | %---------------------------------------------------------------------------------------- 2 | % PACKAGES AND OTHER DOCUMENT CONFIGURATIONS 3 | %---------------------------------------------------------------------------------------- 4 | 5 | \documentclass[paper=a4, fontsize=10pt]{scrartcl} % A4 paper and 10pt font size 6 | 7 | \usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs 8 | \usepackage[english]{babel} % English language/hyphenation 9 | \usepackage{amsmath,amsfonts,amsthm} % Math packages 10 | 11 | \usepackage[margin=1in]{geometry} 12 | 13 | \usepackage{xspace} % space after new commands 14 | \usepackage{hyperref} 15 | \usepackage{enumitem} 16 | 17 | 18 | \usepackage{sectsty} % Allows customizing section commands 19 | \allsectionsfont{\centering \normalfont\scshape} % Make all sections centered, the default font and small caps 20 | 21 | \usepackage{fancyhdr} % Custom headers and footers 22 | \pagestyle{fancyplain} % Makes all pages in the document conform to the custom headers and footers 23 | \fancyhead{} % No page header - if you want one, create it in the same way as the footers below 24 | \fancyfoot[L]{} % Empty left footer 25 | \fancyfoot[C]{} % Empty center footer 26 | \fancyfoot[R]{\thepage} % Page numbering for right footer 27 | \renewcommand{\headrulewidth}{0pt} % Remove header underlines 28 | \renewcommand{\footrulewidth}{0pt} % Remove footer underlines 29 | \setlength{\headheight}{11pt} % Customize the height of the header 30 | 31 | \numberwithin{equation}{section} % Number equations within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 32 | \numberwithin{figure}{section} % Number figures within sections (i.e. 1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 33 | \numberwithin{table}{section} % Number tables within sections (i.e. 
1.1, 1.2, 2.1, 2.2 instead of 1, 2, 3, 4) 34 | 35 | \setlength\parindent{0pt} % Removes all indentation from paragraphs - comment this line for an assignment with lots of text 36 | 37 | %---------------------------------------------------------------------------------------- 38 | % TITLE SECTION 39 | %---------------------------------------------------------------------------------------- 40 | 41 | \newcommand{\horrule}[1]{\rule{\linewidth}{#1}} % Create horizontal rule command with 1 argument of height 42 | \newcommand{\adenine}{{\tt adenine}\xspace} 43 | 44 | \title{ 45 | \normalfont \normalsize 46 | \huge{\tt ADENINE}: A Data ExploratioN pipelINE \\ 47 | \horrule{2pt} \\[0.5cm] % Thick bottom horizontal rule 48 | development plan \\ % The assignment title 49 | } 50 | 51 | \author{Samuele Fiorini} % Your name 52 | 53 | \date{\normalsize\today} % Today's date or a custom date 54 | 55 | \begin{document} 56 | 57 | \maketitle % Print the title 58 | 59 | %---------------------------------------------------------------------------------------- 60 | % PROBLEM 1 61 | %---------------------------------------------------------------------------------------- 62 | 63 | \section{Introduction and Motivation} 64 | 65 | A question that arises at the beginning of almost every new data analysis is 66 | the following: {\sl are my data relevant for the problem I'm dealing with}? \\ 67 | 68 | The final goal of this project (named \adenine) is to help its user to have a glimpse of the answer of 69 | this tedious question. \\ 70 | 71 | In order to reach this goal, \adenine will take advantage of machine learning and 72 | data mining techniques. The final pipeline will essentially consist of three steps: 73 | 74 | \begin{enumerate} 75 | 76 | \item {\bf Preprocessing}: have you ever wondered what would have 77 | changed if only your data have been preprocessed in a different way? Or if 78 | data preprocessing is a good idea at all? \adenine will offer several 79 | preprocessing procedures, such as: data centering, Min-Max scaling, 80 | standardization or normalization and allows you to compare the results of the 81 | analysis conducted with different starting point. 82 | 83 | \item {\bf Dimensionality Reduction} (DR): in the context of data 84 | exploration, this phase becomes particularly helpful for high dimensional data (e.g. 85 | -omics scenario). This step, generically named DR, may actually include some 86 | manifold learning (such as Isomap, Multidimensional Scaling, etc), supervised 87 | (Linear Discriminant Analysis) and unsupervised (Principal Component Analysis, 88 | kernel PCA) techniques. 89 | 90 | \item {\bf Clustering}: this section aims at grouping data into clusters without taking 91 | into account the class labels. Several techniques such as K-Means, Spectral or Hierarchical 92 | clustering will work on both original and dimensionality reduced data. 93 | 94 | \end{enumerate} 95 | 96 | The final output of \adenine will be an as compact as possible visual and textual representation of 97 | the results obtained from the pipelines made with each possible combination of the algorithms 98 | implemented at each step. 
As an example, referring to a pipeline built as: 99 | 100 | \begin{center} 101 | {\sl Data normalization $\rightarrow$ PCA $\rightarrow$ K-Means} 102 | \end{center} 103 | 104 | the output would be something like: 105 | 106 | \begin{itemize} 107 | 108 | \item an output file containing the norm of the original variables (which has 109 | been used to coerce all the features into $[0,1]$), 110 | 111 | \item a 2-D or 3-D scatter plot of the data projected along the principal 112 | components and the percentage of explained variance associated with each 113 | one of them, 114 | 115 | \item a pictorial representation of the data clustering results 116 | obtained with the optimal number of clusters (learned from the data). 117 | 118 | \end{itemize} 119 | 120 | \subsection{Material for PhD progress} 121 | 122 | The study behind the implementation of \adenine builds on 123 | four PhD courses of my first-year work plan: 124 | 125 | \begin{enumerate} 126 | 127 | \item {\sl A Machine Learning Crash Course} [DIBRIS] (Odone, Rosasco): \adenine will cover 128 | a fair number of (mainly unsupervised) machine learning techniques. Hence, this course 129 | has been fundamental for acquiring the statistical learning background needed to understand 130 | the underlying mechanisms of the algorithms. 131 | 132 | \item {\sl Programming Concepts in Python} [DIBRIS] (Tacchella): I plan to implement \adenine in 133 | {\tt Python}. Hence, most of the implementation choices will be made on the basis of the material 134 | covered in the course. 135 | 136 | \item {\sl Programming Complex Heterogeneous Parallel Systems} [IMATI] 137 | (Clematis, D'Agostino, Danovaro, Galizia) and the {\sl 24th Summer School on 138 | Parallel Computing} [CINECA] (Erbacci): \adenine will present several {\sl embarrassingly 139 | parallel} workloads as well as several {\sl isolated GPU-acceleratable} computations. 140 | The former PhD course and the latter school will allow me to develop the parallel computing 141 | skills I need to implement \adenine in a way that is as optimized as possible. 142 | 143 | \end{enumerate} 144 | 145 | 146 | \section{Implemented Algorithms} 147 | 148 | The implementation of nearly all the algorithms of \adenine will refer to the 149 | \href{http://scikit-learn.org/stable/index.html}{\tt scikit-learn} Python 150 | library. See the following \href{http://scikit-learn.org/stable/unsupervised_learning.html}{\tt link} for a 151 | comprehensive list. 152 | 153 | \subsection{Preprocessing} 154 | 155 | At this step the data will be fed to the following preprocessing procedures: 156 | \begin{enumerate}[start = 0] 157 | \item no preprocessing: the analysis will be conducted on raw data; 158 | 159 | \item na\"ive recentering: remove the mean; 160 | 161 | \item standardization: remove the mean and scale each feature by 162 | its standard deviation, yielding zero-mean, unit-variance features; 163 | 164 | \item normalization: scale all the samples to have unit norm. 165 | 166 | \end{enumerate} 167 | 168 | In its first version \adenine will allow the user to impute the missing values by means of the 169 | median, the mean or the most frequent value (future works are in Section~\ref{sec:future}). 170 | See the {\tt sklearn} \href{http://scikit-learn.org/stable/modules/preprocessing.html}{docs} 171 | on data preprocessing for further details. 172 | 173 | \subsection{Dimensionality reduction} 174 | 175 | The following is a work-in-progress list of the techniques I plan to 176 | make available in \adenine. 
The list includes algorithms that come 177 | from very different standpoints, but that have a common outcome: 178 | the estimation of a low-dimensional embedding (manifold) onto which the data can 179 | be projected for visualization or further purposes. 180 | 181 | \begin{enumerate}[label=(\alph*)] 182 | 183 | \item Principal Component Analysis (PCA), in its Incremental or Randomized variants 184 | in the case of big data; 185 | 186 | \item Kernel PCA, which may come with different kernels (Gaussian, 187 | polynomial, and so on); 188 | 189 | \item Isomap; 190 | 191 | \item Locally Linear Embedding (LLE), in its Modified (MLLE) or Hessian 192 | (HLLE) regularized variants; 193 | 194 | \item Spectral Embedding (SE); 195 | 196 | \item Local Tangent Space Alignment (LTSA); 197 | 198 | \item Multidimensional Scaling (MDS), in its metric and non-metric versions; 199 | 200 | \item t-distributed Stochastic Neighbor Embedding (t-SNE). 201 | 202 | \end{enumerate} 203 | 204 | \subsection{Clustering} 205 | 206 | Along the same lines, this section presents a list of the clustering techniques I 207 | plan to include in \adenine. 208 | 209 | \begin{enumerate} 210 | 211 | \item [($\alpha$)] K-Means, in its Mini-Batch variant for big data; 212 | 213 | \item [($\beta$)] Affinity Propagation; 214 | 215 | \item [($\gamma$)] Mean Shift; 216 | 217 | \item [($\delta$)] Spectral Clustering; 218 | 219 | \item [($\epsilon$)] Hierarchical Agglomerative Clustering, exploring 220 | different linkage types (i.e., Ward, complete, average) as well as different 221 | metrics (e.g., Euclidean, Manhattan, Minkowski); 222 | 223 | \item [($\zeta$)] DBSCAN; 224 | 225 | \item [($\eta$)] Birch. 226 | 227 | \end{enumerate} 228 | 229 | Several indices to analyze the clustering performance will be included: some 230 | of them require ground-truth labels (such as the Adjusted Rand Index (ARI), the 231 | Adjusted Mutual Information (AMI), the homogeneity, completeness or V-measure 232 | scores), while others evaluate the cluster compactness or the separation 233 | between clusters (such as the silhouette score). 234 | 235 | \section{Future Works} \label{sec:future} 236 | 237 | \adenine is not meant to be an all-inclusive tool. This section, which 238 | will always be a work in progress, lists the features that 239 | are not going to be implemented in the first version of \adenine, but that may 240 | be implemented later on. 241 | 242 | \begin{itemize} 243 | 244 | \item How can we handle missing values? \adenine may offer some statistically robust 245 | imputation tools (such as low-rank matrix completion or collaborative filtering) in 246 | future versions; 247 | 248 | \item Kernel K-Means; 249 | 250 | \item Dictionary Learning; 251 | 252 | \item Factor Analysis; 253 | 254 | \item Non-negative Matrix Factorization; 255 | 256 | \item Outlier Detection.
257 | 258 | \end{itemize} 259 | %---------------------------------------------------------------------------------------- 260 | 261 | \end{document} 262 | -------------------------------------------------------------------------------- /doc/source/adenine_logo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/doc/source/adenine_logo.pdf -------------------------------------------------------------------------------- /doc/source/adenine_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/doc/source/adenine_logo.png -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # adenine documentation build configuration file, created by 4 | # sphinx-quickstart on Fri May 22 12:31:54 2015. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('.')) 22 | sys.path.insert(0, os.path.abspath('sphinxext')) 23 | 24 | from adenine import __version__ as VERSION 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.doctest', 37 | 'sphinx.ext.todo', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.mathjax', 40 | 'sphinx.ext.viewcode', 41 | 'sphinx.ext.autosummary', 42 | 'sphinx.ext.intersphinx', 43 | 'numpydoc', 44 | 'sphinxcontrib.programoutput', 45 | ] 46 | 47 | # Extension configurations 48 | autoclass_content = 'init' 49 | autodoc_member_order = 'bysource' 50 | numpydoc_show_class_members = False 51 | 52 | # Add any paths that contain templates here, relative to this directory. 53 | templates_path = ['_templates'] 54 | 55 | # The suffix of source filenames. 56 | source_suffix = '.rst' 57 | 58 | # The encoding of source files. 59 | #source_encoding = 'utf-8-sig' 60 | 61 | # The master toctree document. 62 | master_doc = 'index' 63 | 64 | # General information about the project. 65 | project = u'ADENINE' 66 | copyright = u'2016, Samuele Fiorini - Federico Tomasi - Annalisa Barla' 67 | #modindex_common_prefix = ['adenine.'] 68 | 69 | # The version info for the project you're documenting, acts as replacement for 70 | # |version| and |release|, also used in various other places throughout the 71 | # built documents. 72 | # 73 | # The short X.Y version. 74 | version = VERSION 75 | # The full version, including alpha/beta/rc tags. 
76 | release = version 77 | 78 | # The language for content autogenerated by Sphinx. Refer to documentation 79 | # for a list of supported languages. 80 | #language = None 81 | 82 | # There are two options for replacing |today|: either, you set today to some 83 | # non-false value, then it is used: 84 | #today = '' 85 | # Else, today_fmt is used as the format for a strftime call. 86 | #today_fmt = '%B %d, %Y' 87 | 88 | # List of patterns, relative to source directory, that match files and 89 | # directories to ignore when looking for source files. 90 | exclude_patterns = [] 91 | 92 | # The reST default role (used for this markup: `text`) to use for all 93 | # documents. 94 | #default_role = None 95 | 96 | # If true, '()' will be appended to :func: etc. cross-reference text. 97 | #add_function_parentheses = True 98 | 99 | # If true, the current module name will be prepended to all description 100 | # unit titles (such as .. function::). 101 | #add_module_names = True 102 | 103 | # If true, sectionauthor and moduleauthor directives will be shown in the 104 | # output. They are ignored by default. 105 | #show_authors = False 106 | 107 | # The name of the Pygments (syntax highlighting) style to use. 108 | pygments_style = 'sphinx' 109 | 110 | # A list of ignored prefixes for module index sorting. 111 | #modindex_common_prefix = [] 112 | 113 | # If true, keep warnings as "system message" paragraphs in the built documents. 114 | #keep_warnings = False 115 | 116 | 117 | # -- Options for HTML output ---------------------------------------------- 118 | 119 | # The theme to use for HTML and HTML Help pages. See the documentation for 120 | # a list of builtin themes. 121 | # html_theme = 'default' 122 | # html_theme = "nature" 123 | html_theme = 'slipGURUTheme' 124 | 125 | 126 | # Theme options are theme-specific and customize the look and feel of a theme 127 | # further. For a list of options available for each theme, see the 128 | # documentation. 129 | #html_theme_options = {} 130 | 131 | # Add any paths that contain custom themes here, relative to this directory. 132 | html_theme_path = ['.'] 133 | 134 | # The name for this set of Sphinx documents. If None, it defaults to 135 | # " v documentation". 136 | #html_title = None 137 | 138 | # A shorter title for the navigation bar. Default is the same as html_title. 139 | #html_short_title = None 140 | 141 | # The name of an image file (relative to this directory) to place at the top 142 | # of the sidebar. 143 | html_logo = 'adenine_logo.png' 144 | 145 | # The name of an image file (within the static path) to use as favicon of the 146 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 147 | # pixels large. 148 | #html_favicon = None 149 | 150 | # Add any paths that contain custom static files (such as style sheets) here, 151 | # relative to this directory. They are copied after the builtin static files, 152 | # so a file named "default.css" will overwrite the builtin "default.css". 153 | html_static_path = ['_static'] 154 | 155 | # Add any extra paths that contain custom files (such as robots.txt or 156 | # .htaccess) here, relative to this directory. These files are copied 157 | # directly to the root of the documentation. 158 | #html_extra_path = [] 159 | 160 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 161 | # using the given strftime format. 
162 | #html_last_updated_fmt = '%b %d, %Y' 163 | 164 | # If true, SmartyPants will be used to convert quotes and dashes to 165 | # typographically correct entities. 166 | #html_use_smartypants = True 167 | 168 | # Custom sidebar templates, maps document names to template names. 169 | #html_sidebars = {} 170 | 171 | # Additional templates that should be rendered to pages, maps page names to 172 | # template names. 173 | #html_additional_pages = {} 174 | 175 | # If false, no module index is generated. 176 | #html_domain_indices = True 177 | 178 | # If false, no index is generated. 179 | #html_use_index = True 180 | 181 | # If true, the index is split into individual pages for each letter. 182 | #html_split_index = False 183 | 184 | # If true, links to the reST sources are added to the pages. 185 | #html_show_sourcelink = True 186 | 187 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 188 | #html_show_sphinx = True 189 | 190 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 191 | #html_show_copyright = True 192 | 193 | # If true, an OpenSearch description file will be output, and all pages will 194 | # contain a tag referring to it. The value of this option must be the 195 | # base URL from which the finished HTML is served. 196 | #html_use_opensearch = '' 197 | 198 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 199 | #html_file_suffix = None 200 | 201 | # Output file base name for HTML help builder. 202 | htmlhelp_basename = 'adeninedoc' 203 | 204 | 205 | # -- Options for LaTeX output --------------------------------------------- 206 | 207 | latex_elements = { 208 | # The paper size ('letterpaper' or 'a4paper'). 209 | #'papersize': 'letterpaper', 210 | 211 | # The font size ('10pt', '11pt' or '12pt'). 212 | #'pointsize': '10pt', 213 | 214 | # Additional stuff for the LaTeX preamble. 215 | #'preamble': '', 216 | } 217 | 218 | # Grouping the document tree into LaTeX files. List of tuples 219 | # (source start file, target name, title, 220 | # author, documentclass [howto, manual, or own class]). 221 | latex_documents = [ 222 | ('index', 'adenine.tex', u'adenine Documentation', 223 | u'Samuele Fiorini - Federico Tomasi - Annalisa Barla', 'manual'), 224 | ] 225 | 226 | # The name of an image file (relative to this directory) to place at the top of 227 | # the title page. 228 | latex_logo = 'adenine_logo.png' 229 | 230 | # For "manual" documents, if this is true, then toplevel headings are parts, 231 | # not chapters. 232 | #latex_use_parts = False 233 | 234 | # If true, show page references after internal links. 235 | #latex_show_pagerefs = False 236 | 237 | # If true, show URL addresses after external links. 238 | #latex_show_urls = False 239 | 240 | # Documents to append as an appendix to all manuals. 241 | #latex_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | #latex_domain_indices = True 245 | 246 | 247 | # -- Options for manual page output --------------------------------------- 248 | 249 | # One entry per manual page. List of tuples 250 | # (source start file, name, description, authors, manual section). 251 | man_pages = [ 252 | ('index', 'adenine', u'adenine Documentation', 253 | [u'Samuele Fiorini - Federico Tomasi - Annalisa Barla'], 1) 254 | ] 255 | 256 | # If true, show URL addresses after external links. 
257 | #man_show_urls = False 258 | 259 | 260 | # -- Options for Texinfo output ------------------------------------------- 261 | 262 | # Grouping the document tree into Texinfo files. List of tuples 263 | # (source start file, target name, title, author, 264 | # dir menu entry, description, category) 265 | texinfo_documents = [ 266 | ('index', 'adenine', u'adenine Documentation', 267 | u'Samuele Fiorini - Federico Tomasi - Annalisa Barla', 'adenine', 'One line description of project.', 268 | 'Miscellaneous'), 269 | ] 270 | 271 | # Documents to append as an appendix to all manuals. 272 | #texinfo_appendices = [] 273 | 274 | # If false, no module index is generated. 275 | #texinfo_domain_indices = True 276 | 277 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 278 | #texinfo_show_urls = 'footnote' 279 | 280 | # If true, do not generate a @detailmenu in the "Top" node's menu. 281 | #texinfo_no_detailmenu = False 282 | -------------------------------------------------------------------------------- /doc/source/dependencies.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | matplotlib 3 | seaborn 4 | pydot 5 | scikit-learn 6 | -------------------------------------------------------------------------------- /doc/source/drawing.svg: -------------------------------------------------------------------------------- (vector drawing; the SVG markup is not preserved in this dump) -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. adenine documentation master file, created by 2 | sphinx-quickstart on Fri May 22 12:31:54 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ===================================== 7 | ADENINE (A Data ExploratioN pIpeliNE) 8 | ===================================== 9 | 10 | **ADENINE** is a machine learning and data mining Python pipeline that helps you answer this tedious question: are my data relevant to the problem I'm dealing with? 11 | 12 | The main structure of adenine can be summarized in the following 4 steps. 13 | 14 | 1. **Imputing:** Does your dataset have missing entries? In the first step you can fill the missing values choosing among different strategies: feature-wise median, mean or most frequent value, or a more robust k-NN imputation (see the sketch below). 15 | 16 | 2. **Preprocessing:** Have you ever wondered what would have changed if only your data had been preprocessed in a different way? Or whether data preprocessing is a good idea at all? ADENINE offers several preprocessing procedures, such as data recentering, Min-Max scaling, standardization or normalization, and allows you to compare the results of analyses made with different preprocessing steps as starting points. 17 | 18 | 3. **Dimensionality Reduction:** In the context of data exploration, this phase becomes particularly helpful for high-dimensional data. This step includes some manifold learning (such as isomap, multidimensional scaling, etc.) and unsupervised dimensionality reduction (principal component analysis, kernel PCA) techniques. 19 | 20 | 4. **Clustering:** This step aims at grouping data into clusters in an unsupervised manner. Several techniques such as k-means, spectral or hierarchical clustering are offered.
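The following is a purely illustrative sketch of the imputing step using plain scikit-learn (version >= 0.20); it is **not** the adenine API, and the toy matrix and parameter values are placeholders::

    # Illustrative only: compare simple imputing strategies on a toy matrix.
    import numpy as np
    from sklearn.impute import SimpleImputer   # scikit-learn >= 0.20

    X = np.array([[1.0, 2.0],
                  [np.nan, 4.0],
                  [5.0, np.nan],
                  [7.0, 8.0]])

    for strategy in ('median', 'mean', 'most_frequent'):
        imputer = SimpleImputer(strategy=strategy)
        print(strategy)
        print(imputer.fit_transform(X))

    # A k-NN based imputer is also available in recent scikit-learn
    # releases as sklearn.impute.KNNImputer (>= 0.22).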
21 | 22 | The final output of adenine is a compact and textual representation of the results obtained from the pipelines made with each possible combination of the algorithms implemented at each step. 23 | 24 | User documentation 25 | ================== 26 | .. toctree:: 27 | :maxdepth: 2 28 | 29 | tutorial.rst 30 | 31 | .. _api: 32 | 33 | *********************** 34 | API 35 | *********************** 36 | 37 | .. toctree:: 38 | :maxdepth: 1 39 | 40 | 41 | Pipeline utilities 42 | ----------------------------- 43 | 44 | .. automodule:: adenine.core.define_pipeline 45 | :members: 46 | 47 | .. automodule:: adenine.core.pipelines 48 | :members: 49 | 50 | .. automodule:: adenine.core.analyze_results 51 | :members: 52 | 53 | Input Data 54 | ----------------------------- 55 | 56 | .. automodule:: adenine.utils.data_source 57 | :members: 58 | 59 | 60 | Plotting functions 61 | ----------------------------- 62 | 63 | .. automodule:: adenine.core.plotting 64 | :members: 65 | 66 | 67 | Extra tools 68 | ----------------------------- 69 | 70 | .. automodule:: adenine.utils.extra 71 | :members: 72 | 73 | 74 | .. Indices and tables 75 | .. ================== 76 | 77 | .. * :ref:`genindex` 78 | .. * :ref:`modindex` 79 | .. * :ref:`search` 80 | 81 | -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- 1 | . 2 | = 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | adenine 8 | setup 9 | -------------------------------------------------------------------------------- /doc/source/slipGURUTheme/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "basic/layout.html" %} 2 | 3 | {% block sidebarsearch %} 4 | {{ super() }} 5 | 8 | {% endblock %} 9 | 10 | {% block extrahead %} 11 | 14 | 15 | SlipGURU 17 | Dipartimento di Informatica e Scienze dell'Informazione 20 | Università Degli Studi di Genova 23 | 24 | {% endblock %} 25 | 26 | {% block sidebarrel %} 27 | {% if prev %} 28 | {{ super() }} 29 | {% else %} 30 | {% endif %} 31 | {% endblock %} 32 | 33 | {% block sidebartoc %} 34 | {% if prev %} 35 | {{ super() }} 36 | {% else %} 37 |

Download

38 |

Current version: {{ release }}

39 |

Get {{ project }} from the 40 | Python Package Index, 41 | or install it with: 42 |

43 |
pip install --upgrade {{ project }}
44 |

or clone it from our GitHub repository:

45 |
git clone https://github.com/slipguru/{{ project }}
46 | 47 | 48 | 53 |

54 | {% endif %} 55 | {% endblock %} 56 | -------------------------------------------------------------------------------- /doc/source/slipGURUTheme/static/logos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/doc/source/slipGURUTheme/static/logos.png -------------------------------------------------------------------------------- /doc/source/slipGURUTheme/static/slipGuru.css: -------------------------------------------------------------------------------- 1 | @import "default.css"; 2 | 3 | /** 4 | * Spacing fixes 5 | */ 6 | 7 | div.body p, div.body dd, div.body li { 8 | line-height: 125%; 9 | } 10 | 11 | ul.simple { 12 | margin-top: 0; 13 | margin-bottom: 0; 14 | padding-top: 0; 15 | padding-bottom: 0; 16 | } 17 | 18 | /* spacing around blockquoted fields in parameters/attributes/returns */ 19 | td.field-body > blockquote { 20 | margin-top: 0.1em; 21 | margin-bottom: 0.5em; 22 | } 23 | 24 | /* spacing around example code */ 25 | div.highlight > pre { 26 | padding: 2px 5px 2px 5px; 27 | } 28 | 29 | /* spacing in see also definition lists */ 30 | dl.last > dd { 31 | margin-top: 1px; 32 | margin-bottom: 5px; 33 | margin-left: 30px; 34 | } 35 | 36 | /* hide overflowing content in the sidebar */ 37 | div.sphinxsidebarwrapper p.topless { 38 | overflow: hidden; 39 | } 40 | 41 | /** 42 | * Hide dummy toctrees 43 | */ 44 | 45 | ul { 46 | padding-top: 0; 47 | padding-bottom: 0; 48 | margin-top: 0; 49 | margin-bottom: 0; 50 | } 51 | ul li { 52 | padding-top: 0; 53 | padding-bottom: 0; 54 | margin-top: 0; 55 | margin-bottom: 0; 56 | } 57 | ul li a.reference { 58 | padding-top: 0; 59 | padding-bottom: 0; 60 | margin-top: 0; 61 | margin-bottom: 0; 62 | } 63 | 64 | /** 65 | * Make high-level subsections easier to distinguish from top-level ones 66 | */ 67 | div.body h3 { 68 | background-color: transparent; 69 | } 70 | 71 | div.body h4 { 72 | border: none; 73 | background-color: transparent; 74 | } 75 | 76 | /** 77 | * Scipy colors 78 | */ 79 | 80 | body { 81 | background-color: rgb(100,135,220); 82 | } 83 | 84 | div.document { 85 | background-color: rgb(230,230,230); 86 | } 87 | 88 | div.sphinxsidebar { 89 | background-color: rgb(230,230,230); 90 | } 91 | 92 | div.related { 93 | background-color: rgb(100,135,220); 94 | } 95 | 96 | div.sphinxsidebar h3 { 97 | color: rgb(0,102,204); 98 | } 99 | 100 | div.sphinxsidebar h3 a { 101 | color: rgb(0,102,204); 102 | } 103 | 104 | div.sphinxsidebar h4 { 105 | color: rgb(0,82,194); 106 | } 107 | 108 | div.sphinxsidebar p { 109 | color: black; 110 | } 111 | 112 | div.sphinxsidebar a { 113 | color: #355f7c; 114 | } 115 | 116 | div.sphinxsidebar ul.want-points { 117 | list-style: disc; 118 | } 119 | 120 | .field-list th { 121 | color: rgb(0,102,204); 122 | white-space: nowrap; 123 | } 124 | 125 | /** 126 | * Extra admonitions 127 | */ 128 | 129 | div.tip { 130 | background-color: #ffffe4; 131 | border: 1px solid #ee6; 132 | } 133 | 134 | div.plot-output { 135 | clear-after: both; 136 | } 137 | 138 | div.plot-output .figure { 139 | float: left; 140 | text-align: center; 141 | margin-bottom: 0; 142 | padding-bottom: 0; 143 | } 144 | 145 | div.plot-output .caption { 146 | margin-top: 2; 147 | padding-top: 0; 148 | } 149 | 150 | div.plot-output p.admonition-title { 151 | display: none; 152 | } 153 | 154 | div.plot-output:after { 155 | content: ""; 156 | display: block; 157 | height: 0; 158 | clear: both; 159 | } 160 | 161 | 162 | /* 163 | 
div.admonition-example { 164 | background-color: #e4ffe4; 165 | border: 1px solid #ccc; 166 | }*/ 167 | 168 | 169 | /** 170 | * Styling for field lists 171 | */ 172 | 173 | table.field-list th { 174 | border-left: 1px solid #aaa !important; 175 | padding-left: 5px; 176 | } 177 | 178 | table.field-list { 179 | border-collapse: separate; 180 | border-spacing: 10px; 181 | } 182 | 183 | /** 184 | * Styling for footnotes 185 | */ 186 | 187 | table.footnote td, table.footnote th { 188 | border: none; 189 | } 190 | -------------------------------------------------------------------------------- /doc/source/slipGURUTheme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = default 3 | stylesheet = slipGuru.css 4 | pygments_style = sphinx 5 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/LICENSE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | The files 3 | - numpydoc.py 4 | - autosummary.py 5 | - autosummary_generate.py 6 | - docscrape.py 7 | - docscrape_sphinx.py 8 | - phantom_import.py 9 | have the following license: 10 | 11 | Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are 15 | met: 16 | 17 | 1. Redistributions of source code must retain the above copyright 18 | notice, this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in 21 | the documentation and/or other materials provided with the 22 | distribution. 23 | 24 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 25 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 26 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 27 | DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, 28 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 29 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 30 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 32 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 33 | IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | POSSIBILITY OF SUCH DAMAGE. 35 | 36 | ------------------------------------------------------------------------------- 37 | The files 38 | - compiler_unparse.py 39 | - comment_eater.py 40 | - traitsdoc.py 41 | have the following license: 42 | 43 | This software is OSI Certified Open Source Software. 44 | OSI Certified is a certification mark of the Open Source Initiative. 45 | 46 | Copyright (c) 2006, Enthought, Inc. 47 | All rights reserved. 48 | 49 | Redistribution and use in source and binary forms, with or without 50 | modification, are permitted provided that the following conditions are met: 51 | 52 | * Redistributions of source code must retain the above copyright notice, this 53 | list of conditions and the following disclaimer. 
54 | * Redistributions in binary form must reproduce the above copyright notice, 55 | this list of conditions and the following disclaimer in the documentation 56 | and/or other materials provided with the distribution. 57 | * Neither the name of Enthought, Inc. nor the names of its contributors may 58 | be used to endorse or promote products derived from this software without 59 | specific prior written permission. 60 | 61 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 62 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 63 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 64 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 65 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 66 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 67 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 68 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 69 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 70 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 71 | 72 | 73 | ------------------------------------------------------------------------------- 74 | The files 75 | - only_directives.py 76 | - plot_directive.py 77 | originate from Matplotlib (http://matplotlib.sf.net/) which has 78 | the following license: 79 | 80 | Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. 81 | 82 | 1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. 83 | 84 | 2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. 85 | 86 | 3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. 87 | 88 | 4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. 89 | 90 | 5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 91 | 92 | 6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. 93 | 94 | 7. 
Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. 95 | 96 | 8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. 97 | 98 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py 2 | include *.txt 3 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: numpydoc 3 | Version: 0.4 4 | Summary: Sphinx extension to support docstrings in Numpy format 5 | Home-page: http://github.com/numpy/numpy/tree/master/doc/sphinxext 6 | Author: Pauli Virtanen and others 7 | Author-email: pav@iki.fi 8 | License: BSD 9 | Description: UNKNOWN 10 | Keywords: sphinx numpy 11 | Platform: UNKNOWN 12 | Classifier: Development Status :: 3 - Alpha 13 | Classifier: Environment :: Plugins 14 | Classifier: License :: OSI Approved :: BSD License 15 | Classifier: Topic :: Documentation 16 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np:function``, ``np-c:function``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directive``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | 22 | numpydoc 23 | ======== 24 | 25 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 26 | following the Numpy/Scipy format to a form palatable to Sphinx. 27 | 28 | Options 29 | ------- 30 | 31 | The following options can be set in conf.py: 32 | 33 | - numpydoc_use_plots: bool 34 | 35 | Whether to produce ``plot::`` directives for Examples sections that 36 | contain ``import matplotlib``. 37 | 38 | - numpydoc_show_class_members: bool 39 | 40 | Whether to show all members of a class in the Methods and Attributes 41 | sections automatically. 42 | 43 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 44 | 45 | Whether to insert an edit link after docstrings. 
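For instance, a project's ``conf.py`` could enable the extension and set
these options as follows (illustrative values only)::

    # conf.py (excerpt) -- illustrative values only
    extensions = [
        'sphinx.ext.autodoc',
        'numpydoc',
    ]

    numpydoc_use_plots = False           # no plot:: directives for Examples
    numpydoc_show_class_members = False  # do not auto-list Methods/Attributes
    # numpydoc_edit_link is deprecated; edit your HTML template instead.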
46 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/__init__.py: -------------------------------------------------------------------------------- 1 | from numpydoc import setup 2 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/comment_eater.py: -------------------------------------------------------------------------------- 1 | from cStringIO import StringIO 2 | import compiler 3 | import inspect 4 | import textwrap 5 | import tokenize 6 | 7 | from compiler_unparse import unparse 8 | 9 | 10 | class Comment(object): 11 | """ A comment block. 12 | """ 13 | is_comment = True 14 | def __init__(self, start_lineno, end_lineno, text): 15 | # int : The first line number in the block. 1-indexed. 16 | self.start_lineno = start_lineno 17 | # int : The last line number. Inclusive! 18 | self.end_lineno = end_lineno 19 | # str : The text block including '#' character but not any leading spaces. 20 | self.text = text 21 | 22 | def add(self, string, start, end, line): 23 | """ Add a new comment line. 24 | """ 25 | self.start_lineno = min(self.start_lineno, start[0]) 26 | self.end_lineno = max(self.end_lineno, end[0]) 27 | self.text += string 28 | 29 | def __repr__(self): 30 | return '%s(%r, %r, %r)' % (self.__class__.__name__, self.start_lineno, 31 | self.end_lineno, self.text) 32 | 33 | 34 | class NonComment(object): 35 | """ A non-comment block of code. 36 | """ 37 | is_comment = False 38 | def __init__(self, start_lineno, end_lineno): 39 | self.start_lineno = start_lineno 40 | self.end_lineno = end_lineno 41 | 42 | def add(self, string, start, end, line): 43 | """ Add lines to the block. 44 | """ 45 | if string.strip(): 46 | # Only add if not entirely whitespace. 47 | self.start_lineno = min(self.start_lineno, start[0]) 48 | self.end_lineno = max(self.end_lineno, end[0]) 49 | 50 | def __repr__(self): 51 | return '%s(%r, %r)' % (self.__class__.__name__, self.start_lineno, 52 | self.end_lineno) 53 | 54 | 55 | class CommentBlocker(object): 56 | """ Pull out contiguous comment blocks. 57 | """ 58 | def __init__(self): 59 | # Start with a dummy. 60 | self.current_block = NonComment(0, 0) 61 | 62 | # All of the blocks seen so far. 63 | self.blocks = [] 64 | 65 | # The index mapping lines of code to their associated comment blocks. 66 | self.index = {} 67 | 68 | def process_file(self, file): 69 | """ Process a file object. 70 | """ 71 | for token in tokenize.generate_tokens(file.next): 72 | self.process_token(*token) 73 | self.make_index() 74 | 75 | def process_token(self, kind, string, start, end, line): 76 | """ Process a single token. 77 | """ 78 | if self.current_block.is_comment: 79 | if kind == tokenize.COMMENT: 80 | self.current_block.add(string, start, end, line) 81 | else: 82 | self.new_noncomment(start[0], end[0]) 83 | else: 84 | if kind == tokenize.COMMENT: 85 | self.new_comment(string, start, end, line) 86 | else: 87 | self.current_block.add(string, start, end, line) 88 | 89 | def new_noncomment(self, start_lineno, end_lineno): 90 | """ We are transitioning from a noncomment to a comment. 91 | """ 92 | block = NonComment(start_lineno, end_lineno) 93 | self.blocks.append(block) 94 | self.current_block = block 95 | 96 | def new_comment(self, string, start, end, line): 97 | """ Possibly add a new comment. 98 | 99 | Only adds a new comment if this comment is the only thing on the line. 100 | Otherwise, it extends the noncomment block. 
101 | """ 102 | prefix = line[:start[1]] 103 | if prefix.strip(): 104 | # Oops! Trailing comment, not a comment block. 105 | self.current_block.add(string, start, end, line) 106 | else: 107 | # A comment block. 108 | block = Comment(start[0], end[0], string) 109 | self.blocks.append(block) 110 | self.current_block = block 111 | 112 | def make_index(self): 113 | """ Make the index mapping lines of actual code to their associated 114 | prefix comments. 115 | """ 116 | for prev, block in zip(self.blocks[:-1], self.blocks[1:]): 117 | if not block.is_comment: 118 | self.index[block.start_lineno] = prev 119 | 120 | def search_for_comment(self, lineno, default=None): 121 | """ Find the comment block just before the given line number. 122 | 123 | Returns None (or the specified default) if there is no such block. 124 | """ 125 | if not self.index: 126 | self.make_index() 127 | block = self.index.get(lineno, None) 128 | text = getattr(block, 'text', default) 129 | return text 130 | 131 | 132 | def strip_comment_marker(text): 133 | """ Strip # markers at the front of a block of comment text. 134 | """ 135 | lines = [] 136 | for line in text.splitlines(): 137 | lines.append(line.lstrip('#')) 138 | text = textwrap.dedent('\n'.join(lines)) 139 | return text 140 | 141 | 142 | def get_class_traits(klass): 143 | """ Yield all of the documentation for trait definitions on a class object. 144 | """ 145 | # FIXME: gracefully handle errors here or in the caller? 146 | source = inspect.getsource(klass) 147 | cb = CommentBlocker() 148 | cb.process_file(StringIO(source)) 149 | mod_ast = compiler.parse(source) 150 | class_ast = mod_ast.node.nodes[0] 151 | for node in class_ast.code.nodes: 152 | # FIXME: handle other kinds of assignments? 153 | if isinstance(node, compiler.ast.Assign): 154 | name = node.nodes[0].name 155 | rhs = unparse(node.expr).strip() 156 | doc = strip_comment_marker(cb.search_for_comment(node.lineno, default='')) 157 | yield name, rhs, doc 158 | 159 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/docscrape_sphinx.py: -------------------------------------------------------------------------------- 1 | import re, inspect, textwrap, pydoc 2 | import sphinx 3 | from docscrape import NumpyDocString, FunctionDoc, ClassDoc 4 | 5 | class SphinxDocString(NumpyDocString): 6 | def __init__(self, docstring, config={}): 7 | self.use_plots = config.get('use_plots', False) 8 | NumpyDocString.__init__(self, docstring, config=config) 9 | 10 | # string conversion routines 11 | def _str_header(self, name, symbol='`'): 12 | return ['.. 
rubric:: ' + name, ''] 13 | 14 | def _str_field_list(self, name): 15 | return [':' + name + ':'] 16 | 17 | def _str_indent(self, doc, indent=4): 18 | out = [] 19 | for line in doc: 20 | out += [' '*indent + line] 21 | return out 22 | 23 | def _str_signature(self): 24 | return [''] 25 | if self['Signature']: 26 | return ['``%s``' % self['Signature']] + [''] 27 | else: 28 | return [''] 29 | 30 | def _str_summary(self): 31 | return self['Summary'] + [''] 32 | 33 | def _str_extended_summary(self): 34 | return self['Extended Summary'] + [''] 35 | 36 | def _str_param_list(self, name): 37 | out = [] 38 | if self[name]: 39 | out += self._str_field_list(name) 40 | out += [''] 41 | for param,param_type,desc in self[name]: 42 | out += self._str_indent(['**%s** : %s' % (param.strip(), 43 | param_type)]) 44 | out += [''] 45 | out += self._str_indent(desc,8) 46 | out += [''] 47 | return out 48 | 49 | @property 50 | def _obj(self): 51 | if hasattr(self, '_cls'): 52 | return self._cls 53 | elif hasattr(self, '_f'): 54 | return self._f 55 | return None 56 | 57 | def _str_member_list(self, name): 58 | """ 59 | Generate a member listing, autosummary:: table where possible, 60 | and a table where not. 61 | 62 | """ 63 | out = [] 64 | if self[name]: 65 | out += ['.. rubric:: %s' % name, ''] 66 | prefix = getattr(self, '_name', '') 67 | 68 | if prefix: 69 | prefix = '~%s.' % prefix 70 | 71 | autosum = [] 72 | others = [] 73 | for param, param_type, desc in self[name]: 74 | param = param.strip() 75 | if not self._obj or hasattr(self._obj, param): 76 | autosum += [" %s%s" % (prefix, param)] 77 | else: 78 | others.append((param, param_type, desc)) 79 | 80 | if autosum: 81 | out += ['.. autosummary::', ' :toctree:', ''] 82 | out += autosum 83 | 84 | if others: 85 | maxlen_0 = max([len(x[0]) for x in others]) 86 | maxlen_1 = max([len(x[1]) for x in others]) 87 | hdr = "="*maxlen_0 + " " + "="*maxlen_1 + " " + "="*10 88 | fmt = '%%%ds %%%ds ' % (maxlen_0, maxlen_1) 89 | n_indent = maxlen_0 + maxlen_1 + 4 90 | out += [hdr] 91 | for param, param_type, desc in others: 92 | out += [fmt % (param.strip(), param_type)] 93 | out += self._str_indent(desc, n_indent) 94 | out += [hdr] 95 | out += [''] 96 | return out 97 | 98 | def _str_section(self, name): 99 | out = [] 100 | if self[name]: 101 | out += self._str_header(name) 102 | out += [''] 103 | content = textwrap.dedent("\n".join(self[name])).split("\n") 104 | out += content 105 | out += [''] 106 | return out 107 | 108 | def _str_see_also(self, func_role): 109 | out = [] 110 | if self['See Also']: 111 | see_also = super(SphinxDocString, self)._str_see_also(func_role) 112 | out = ['.. seealso::', ''] 113 | out += self._str_indent(see_also[2:]) 114 | return out 115 | 116 | def _str_warnings(self): 117 | out = [] 118 | if self['Warnings']: 119 | out = ['.. warning::', ''] 120 | out += self._str_indent(self['Warnings']) 121 | return out 122 | 123 | def _str_index(self): 124 | idx = self['index'] 125 | out = [] 126 | if len(idx) == 0: 127 | return out 128 | 129 | out += ['.. 
index:: %s' % idx.get('default','')] 130 | for section, references in idx.iteritems(): 131 | if section == 'default': 132 | continue 133 | elif section == 'refguide': 134 | out += [' single: %s' % (', '.join(references))] 135 | else: 136 | out += [' %s: %s' % (section, ','.join(references))] 137 | return out 138 | 139 | def _str_references(self): 140 | out = [] 141 | if self['References']: 142 | out += self._str_header('References') 143 | if isinstance(self['References'], str): 144 | self['References'] = [self['References']] 145 | out.extend(self['References']) 146 | out += [''] 147 | # Latex collects all references to a separate bibliography, 148 | # so we need to insert links to it 149 | if sphinx.__version__ >= "0.6": 150 | out += ['.. only:: latex',''] 151 | else: 152 | out += ['.. latexonly::',''] 153 | items = [] 154 | for line in self['References']: 155 | m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) 156 | if m: 157 | items.append(m.group(1)) 158 | out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] 159 | return out 160 | 161 | def _str_examples(self): 162 | examples_str = "\n".join(self['Examples']) 163 | 164 | if (self.use_plots and 'import matplotlib' in examples_str 165 | and 'plot::' not in examples_str): 166 | out = [] 167 | out += self._str_header('Examples') 168 | out += ['.. plot::', ''] 169 | out += self._str_indent(self['Examples']) 170 | out += [''] 171 | return out 172 | else: 173 | return self._str_section('Examples') 174 | 175 | def __str__(self, indent=0, func_role="obj"): 176 | out = [] 177 | out += self._str_signature() 178 | out += self._str_index() + [''] 179 | out += self._str_summary() 180 | out += self._str_extended_summary() 181 | for param_list in ('Parameters', 'Returns', 'Other Parameters', 182 | 'Raises', 'Warns'): 183 | out += self._str_param_list(param_list) 184 | out += self._str_warnings() 185 | out += self._str_see_also(func_role) 186 | out += self._str_section('Notes') 187 | out += self._str_references() 188 | out += self._str_examples() 189 | for param_list in ('Attributes', 'Methods'): 190 | out += self._str_member_list(param_list) 191 | out = self._str_indent(out,indent) 192 | return '\n'.join(out) 193 | 194 | class SphinxFunctionDoc(SphinxDocString, FunctionDoc): 195 | def __init__(self, obj, doc=None, config={}): 196 | self.use_plots = config.get('use_plots', False) 197 | FunctionDoc.__init__(self, obj, doc=doc, config=config) 198 | 199 | class SphinxClassDoc(SphinxDocString, ClassDoc): 200 | def __init__(self, obj, doc=None, func_doc=None, config={}): 201 | self.use_plots = config.get('use_plots', False) 202 | ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) 203 | 204 | class SphinxObjDoc(SphinxDocString): 205 | def __init__(self, obj, doc=None, config={}): 206 | self._f = obj 207 | SphinxDocString.__init__(self, doc, config=config) 208 | 209 | def get_doc_object(obj, what=None, doc=None, config={}): 210 | if what is None: 211 | if inspect.isclass(obj): 212 | what = 'class' 213 | elif inspect.ismodule(obj): 214 | what = 'module' 215 | elif callable(obj): 216 | what = 'function' 217 | else: 218 | what = 'object' 219 | if what == 'class': 220 | return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, 221 | config=config) 222 | elif what in ('function', 'method'): 223 | return SphinxFunctionDoc(obj, doc=doc, config=config) 224 | else: 225 | if doc is None: 226 | doc = pydoc.getdoc(obj) 227 | return SphinxObjDoc(obj, doc, config=config) 228 | 
-------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/numpydoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ======== 3 | numpydoc 4 | ======== 5 | 6 | Sphinx extension that handles docstrings in the Numpy standard format. [1] 7 | 8 | It will: 9 | 10 | - Convert Parameters etc. sections to field lists. 11 | - Convert See Also section to a See also entry. 12 | - Renumber references. 13 | - Extract the signature from the docstring, if it can't be determined otherwise. 14 | 15 | .. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard 16 | 17 | """ 18 | 19 | import os, re, pydoc 20 | from docscrape_sphinx import get_doc_object, SphinxDocString 21 | from sphinx.util.compat import Directive 22 | import inspect 23 | 24 | def mangle_docstrings(app, what, name, obj, options, lines, 25 | reference_offset=[0]): 26 | 27 | cfg = dict(use_plots=app.config.numpydoc_use_plots, 28 | show_class_members=app.config.numpydoc_show_class_members) 29 | 30 | if what == 'module': 31 | # Strip top title 32 | title_re = re.compile(ur'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*', 33 | re.I|re.S) 34 | lines[:] = title_re.sub(u'', u"\n".join(lines)).split(u"\n") 35 | else: 36 | doc = get_doc_object(obj, what, u"\n".join(lines), config=cfg) 37 | lines[:] = unicode(doc).split(u"\n") 38 | 39 | if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ 40 | obj.__name__: 41 | if hasattr(obj, '__module__'): 42 | v = dict(full_name=u"%s.%s" % (obj.__module__, obj.__name__)) 43 | else: 44 | v = dict(full_name=obj.__name__) 45 | lines += [u'', u'.. htmlonly::', ''] 46 | lines += [u' %s' % x for x in 47 | (app.config.numpydoc_edit_link % v).split("\n")] 48 | 49 | # replace reference numbers so that there are no duplicates 50 | references = [] 51 | for line in lines: 52 | line = line.strip() 53 | m = re.match(ur'^.. \[([a-z0-9_.-])\]', line, re.I) 54 | if m: 55 | references.append(m.group(1)) 56 | 57 | # start renaming from the longest string, to avoid overwriting parts 58 | references.sort(key=lambda x: -len(x)) 59 | if references: 60 | for i, line in enumerate(lines): 61 | for r in references: 62 | if re.match(ur'^\d+$', r): 63 | new_r = u"R%d" % (reference_offset[0] + int(r)) 64 | else: 65 | new_r = u"%s%d" % (r, reference_offset[0]) 66 | lines[i] = lines[i].replace(u'[%s]_' % r, 67 | u'[%s]_' % new_r) 68 | lines[i] = lines[i].replace(u'.. [%s]' % r, 69 | u'.. 
[%s]' % new_r) 70 | 71 | reference_offset[0] += len(references) 72 | 73 | def mangle_signature(app, what, name, obj, options, sig, retann): 74 | # Do not try to inspect classes that don't define `__init__` 75 | if (inspect.isclass(obj) and 76 | (not hasattr(obj, '__init__') or 77 | 'initializes x; see ' in pydoc.getdoc(obj.__init__))): 78 | return '', '' 79 | 80 | if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')): return 81 | if not hasattr(obj, '__doc__'): return 82 | 83 | doc = SphinxDocString(pydoc.getdoc(obj)) 84 | if doc['Signature']: 85 | sig = re.sub(u"^[^(]*", u"", doc['Signature']) 86 | return sig, u'' 87 | 88 | def setup(app, get_doc_object_=get_doc_object): 89 | global get_doc_object 90 | get_doc_object = get_doc_object_ 91 | 92 | app.connect('autodoc-process-docstring', mangle_docstrings) 93 | app.connect('autodoc-process-signature', mangle_signature) 94 | app.add_config_value('numpydoc_edit_link', None, False) 95 | app.add_config_value('numpydoc_use_plots', None, False) 96 | app.add_config_value('numpydoc_show_class_members', True, True) 97 | 98 | # Extra mangling domains 99 | app.add_domain(NumpyPythonDomain) 100 | app.add_domain(NumpyCDomain) 101 | 102 | #------------------------------------------------------------------------------ 103 | # Docstring-mangling domains 104 | #------------------------------------------------------------------------------ 105 | 106 | from docutils.statemachine import ViewList 107 | from sphinx.domains.c import CDomain 108 | from sphinx.domains.python import PythonDomain 109 | 110 | class ManglingDomainBase(object): 111 | directive_mangling_map = {} 112 | 113 | def __init__(self, *a, **kw): 114 | super(ManglingDomainBase, self).__init__(*a, **kw) 115 | self.wrap_mangling_directives() 116 | 117 | def wrap_mangling_directives(self): 118 | for name, objtype in self.directive_mangling_map.items(): 119 | self.directives[name] = wrap_mangling_directive( 120 | self.directives[name], objtype) 121 | 122 | class NumpyPythonDomain(ManglingDomainBase, PythonDomain): 123 | name = 'np' 124 | directive_mangling_map = { 125 | 'function': 'function', 126 | 'class': 'class', 127 | 'exception': 'class', 128 | 'method': 'function', 129 | 'classmethod': 'function', 130 | 'staticmethod': 'function', 131 | 'attribute': 'attribute', 132 | } 133 | 134 | class NumpyCDomain(ManglingDomainBase, CDomain): 135 | name = 'np-c' 136 | directive_mangling_map = { 137 | 'function': 'function', 138 | 'member': 'attribute', 139 | 'macro': 'function', 140 | 'type': 'class', 141 | 'var': 'object', 142 | } 143 | 144 | def wrap_mangling_directive(base_directive, objtype): 145 | class directive(base_directive): 146 | def run(self): 147 | env = self.state.document.settings.env 148 | 149 | name = None 150 | if self.arguments: 151 | m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) 152 | name = m.group(2).strip() 153 | 154 | if not name: 155 | name = self.arguments[0] 156 | 157 | lines = list(self.content) 158 | mangle_docstrings(env.app, objtype, name, None, None, lines) 159 | self.content = ViewList(lines, self.content.parent) 160 | 161 | return base_directive.run(self) 162 | 163 | return directive 164 | 165 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/phantom_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | phantom_import 4 | ============== 5 | 6 | Sphinx extension to make directives from ``sphinx.ext.autodoc`` and similar 7 | 
extensions to use docstrings loaded from an XML file. 8 | 9 | This extension loads an XML file in the Pydocweb format [1] and 10 | creates a dummy module that contains the specified docstrings. This 11 | can be used to get the current docstrings from a Pydocweb instance 12 | without needing to rebuild the documented module. 13 | 14 | .. [1] http://code.google.com/p/pydocweb 15 | 16 | """ 17 | import imp, sys, compiler, types, os, inspect, re 18 | 19 | def setup(app): 20 | app.connect('builder-inited', initialize) 21 | app.add_config_value('phantom_import_file', None, True) 22 | 23 | def initialize(app): 24 | fn = app.config.phantom_import_file 25 | if (fn and os.path.isfile(fn)): 26 | print "[numpydoc] Phantom importing modules from", fn, "..." 27 | import_phantom_module(fn) 28 | 29 | #------------------------------------------------------------------------------ 30 | # Creating 'phantom' modules from an XML description 31 | #------------------------------------------------------------------------------ 32 | def import_phantom_module(xml_file): 33 | """ 34 | Insert a fake Python module to sys.modules, based on a XML file. 35 | 36 | The XML file is expected to conform to Pydocweb DTD. The fake 37 | module will contain dummy objects, which guarantee the following: 38 | 39 | - Docstrings are correct. 40 | - Class inheritance relationships are correct (if present in XML). 41 | - Function argspec is *NOT* correct (even if present in XML). 42 | Instead, the function signature is prepended to the function docstring. 43 | - Class attributes are *NOT* correct; instead, they are dummy objects. 44 | 45 | Parameters 46 | ---------- 47 | xml_file : str 48 | Name of an XML file to read 49 | 50 | """ 51 | import lxml.etree as etree 52 | 53 | object_cache = {} 54 | 55 | tree = etree.parse(xml_file) 56 | root = tree.getroot() 57 | 58 | # Sort items so that 59 | # - Base classes come before classes inherited from them 60 | # - Modules come before their contents 61 | all_nodes = dict([(n.attrib['id'], n) for n in root]) 62 | 63 | def _get_bases(node, recurse=False): 64 | bases = [x.attrib['ref'] for x in node.findall('base')] 65 | if recurse: 66 | j = 0 67 | while True: 68 | try: 69 | b = bases[j] 70 | except IndexError: break 71 | if b in all_nodes: 72 | bases.extend(_get_bases(all_nodes[b])) 73 | j += 1 74 | return bases 75 | 76 | type_index = ['module', 'class', 'callable', 'object'] 77 | 78 | def base_cmp(a, b): 79 | x = cmp(type_index.index(a.tag), type_index.index(b.tag)) 80 | if x != 0: return x 81 | 82 | if a.tag == 'class' and b.tag == 'class': 83 | a_bases = _get_bases(a, recurse=True) 84 | b_bases = _get_bases(b, recurse=True) 85 | x = cmp(len(a_bases), len(b_bases)) 86 | if x != 0: return x 87 | if a.attrib['id'] in b_bases: return -1 88 | if b.attrib['id'] in a_bases: return 1 89 | 90 | return cmp(a.attrib['id'].count('.'), b.attrib['id'].count('.')) 91 | 92 | nodes = root.getchildren() 93 | nodes.sort(base_cmp) 94 | 95 | # Create phantom items 96 | for node in nodes: 97 | name = node.attrib['id'] 98 | doc = (node.text or '').decode('string-escape') + "\n" 99 | if doc == "\n": doc = "" 100 | 101 | # create parent, if missing 102 | parent = name 103 | while True: 104 | parent = '.'.join(parent.split('.')[:-1]) 105 | if not parent: break 106 | if parent in object_cache: break 107 | obj = imp.new_module(parent) 108 | object_cache[parent] = obj 109 | sys.modules[parent] = obj 110 | 111 | # create object 112 | if node.tag == 'module': 113 | obj = imp.new_module(name) 114 | obj.__doc__ = doc 115 | 
sys.modules[name] = obj 116 | elif node.tag == 'class': 117 | bases = [object_cache[b] for b in _get_bases(node) 118 | if b in object_cache] 119 | bases.append(object) 120 | init = lambda self: None 121 | init.__doc__ = doc 122 | obj = type(name, tuple(bases), {'__doc__': doc, '__init__': init}) 123 | obj.__name__ = name.split('.')[-1] 124 | elif node.tag == 'callable': 125 | funcname = node.attrib['id'].split('.')[-1] 126 | argspec = node.attrib.get('argspec') 127 | if argspec: 128 | argspec = re.sub('^[^(]*', '', argspec) 129 | doc = "%s%s\n\n%s" % (funcname, argspec, doc) 130 | obj = lambda: 0 131 | obj.__argspec_is_invalid_ = True 132 | obj.func_name = funcname 133 | obj.__name__ = name 134 | obj.__doc__ = doc 135 | if inspect.isclass(object_cache[parent]): 136 | obj.__objclass__ = object_cache[parent] 137 | else: 138 | class Dummy(object): pass 139 | obj = Dummy() 140 | obj.__name__ = name 141 | obj.__doc__ = doc 142 | if inspect.isclass(object_cache[parent]): 143 | obj.__get__ = lambda: None 144 | object_cache[name] = obj 145 | 146 | if parent: 147 | if inspect.ismodule(object_cache[parent]): 148 | obj.__module__ = parent 149 | setattr(object_cache[parent], name.split('.')[-1], obj) 150 | 151 | # Populate items 152 | for node in root: 153 | obj = object_cache.get(node.attrib['id']) 154 | if obj is None: continue 155 | for ref in node.findall('ref'): 156 | if node.tag == 'class': 157 | if ref.attrib['ref'].startswith(node.attrib['id'] + '.'): 158 | setattr(obj, ref.attrib['name'], 159 | object_cache.get(ref.attrib['ref'])) 160 | else: 161 | setattr(obj, ref.attrib['name'], 162 | object_cache.get(ref.attrib['ref'])) 163 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | tag_svn_revision = 0 5 | 6 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | import setuptools 3 | import sys, os 4 | 5 | version = "0.4" 6 | 7 | setup( 8 | name="numpydoc", 9 | packages=["numpydoc"], 10 | package_dir={"numpydoc": ""}, 11 | version=version, 12 | description="Sphinx extension to support docstrings in Numpy format", 13 | # classifiers from http://pypi.python.org/pypi?%3Aaction=list_classifiers 14 | classifiers=["Development Status :: 3 - Alpha", 15 | "Environment :: Plugins", 16 | "License :: OSI Approved :: BSD License", 17 | "Topic :: Documentation"], 18 | keywords="sphinx numpy", 19 | author="Pauli Virtanen and others", 20 | author_email="pav@iki.fi", 21 | url="http://github.com/numpy/numpy/tree/master/doc/sphinxext", 22 | license="BSD", 23 | zip_safe=False, 24 | install_requires=["Sphinx >= 1.0.1"], 25 | package_data={'numpydoc': 'tests', '': ''}, 26 | entry_points={ 27 | "console_scripts": [ 28 | "autosummary_generate = numpydoc.autosummary_generate:main", 29 | ], 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /doc/source/sphinxext/numpydoc/traitsdoc.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========= 3 | traitsdoc 4 | ========= 5 | 6 | Sphinx extension that handles docstrings in the Numpy standard format, [1] 7 | and support Traits [2]. 
8 | 9 | This extension can be used as a replacement for ``numpydoc`` when support 10 | for Traits is required. 11 | 12 | .. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard 13 | .. [2] http://code.enthought.com/projects/traits/ 14 | 15 | """ 16 | 17 | import inspect 18 | import os 19 | import pydoc 20 | 21 | import docscrape 22 | import docscrape_sphinx 23 | from docscrape_sphinx import SphinxClassDoc, SphinxFunctionDoc, SphinxDocString 24 | 25 | import numpydoc 26 | 27 | import comment_eater 28 | 29 | class SphinxTraitsDoc(SphinxClassDoc): 30 | def __init__(self, cls, modulename='', func_doc=SphinxFunctionDoc): 31 | if not inspect.isclass(cls): 32 | raise ValueError("Initialise using a class. Got %r" % cls) 33 | self._cls = cls 34 | 35 | if modulename and not modulename.endswith('.'): 36 | modulename += '.' 37 | self._mod = modulename 38 | self._name = cls.__name__ 39 | self._func_doc = func_doc 40 | 41 | docstring = pydoc.getdoc(cls) 42 | docstring = docstring.split('\n') 43 | 44 | # De-indent paragraph 45 | try: 46 | indent = min(len(s) - len(s.lstrip()) for s in docstring 47 | if s.strip()) 48 | except ValueError: 49 | indent = 0 50 | 51 | for n,line in enumerate(docstring): 52 | docstring[n] = docstring[n][indent:] 53 | 54 | self._doc = docscrape.Reader(docstring) 55 | self._parsed_data = { 56 | 'Signature': '', 57 | 'Summary': '', 58 | 'Description': [], 59 | 'Extended Summary': [], 60 | 'Parameters': [], 61 | 'Returns': [], 62 | 'Raises': [], 63 | 'Warns': [], 64 | 'Other Parameters': [], 65 | 'Traits': [], 66 | 'Methods': [], 67 | 'See Also': [], 68 | 'Notes': [], 69 | 'References': '', 70 | 'Example': '', 71 | 'Examples': '', 72 | 'index': {} 73 | } 74 | 75 | self._parse() 76 | 77 | def _str_summary(self): 78 | return self['Summary'] + [''] 79 | 80 | def _str_extended_summary(self): 81 | return self['Description'] + self['Extended Summary'] + [''] 82 | 83 | def __str__(self, indent=0, func_role="func"): 84 | out = [] 85 | out += self._str_signature() 86 | out += self._str_index() + [''] 87 | out += self._str_summary() 88 | out += self._str_extended_summary() 89 | for param_list in ('Parameters', 'Traits', 'Methods', 90 | 'Returns','Raises'): 91 | out += self._str_param_list(param_list) 92 | out += self._str_see_also("obj") 93 | out += self._str_section('Notes') 94 | out += self._str_references() 95 | out += self._str_section('Example') 96 | out += self._str_section('Examples') 97 | out = self._str_indent(out,indent) 98 | return '\n'.join(out) 99 | 100 | def looks_like_issubclass(obj, classname): 101 | """ Return True if the object has a class or superclass with the given class 102 | name. 103 | 104 | Ignores old-style classes. 105 | """ 106 | t = obj 107 | if t.__name__ == classname: 108 | return True 109 | for klass in t.__mro__: 110 | if klass.__name__ == classname: 111 | return True 112 | return False 113 | 114 | def get_doc_object(obj, what=None, config=None): 115 | if what is None: 116 | if inspect.isclass(obj): 117 | what = 'class' 118 | elif inspect.ismodule(obj): 119 | what = 'module' 120 | elif callable(obj): 121 | what = 'function' 122 | else: 123 | what = 'object' 124 | if what == 'class': 125 | doc = SphinxTraitsDoc(obj, '', func_doc=SphinxFunctionDoc, config=config) 126 | if looks_like_issubclass(obj, 'HasTraits'): 127 | for name, trait, comment in comment_eater.get_class_traits(obj): 128 | # Exclude private traits. 
129 | if not name.startswith('_'): 130 | doc['Traits'].append((name, trait, comment.splitlines())) 131 | return doc 132 | elif what in ('function', 'method'): 133 | return SphinxFunctionDoc(obj, '', config=config) 134 | else: 135 | return SphinxDocString(pydoc.getdoc(obj), config=config) 136 | 137 | def setup(app): 138 | # init numpydoc 139 | numpydoc.setup(app, get_doc_object) 140 | 141 | -------------------------------------------------------------------------------- /doc/source/sphinxext/sphinxcontrib/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | sphinxcontrib 4 | ~~~~~~~~~~~~~ 5 | 6 | This package is a namespace package that contains all extensions 7 | distributed in the ``sphinx-contrib`` distribution. 8 | 9 | :copyright: Copyright 2007-2009 by the Sphinx team, see AUTHORS. 10 | :license: BSD, see LICENSE for details. 11 | """ 12 | 13 | __import__('pkg_resources').declare_namespace(__name__) 14 | 15 | -------------------------------------------------------------------------------- /doc/source/sphinxext/sphinxcontrib/programoutput.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) 2010, 2011, Sebastian Wiesner 3 | # All rights reserved. 4 | 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | 8 | # 1. Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 2. Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | 14 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | # POSSIBILITY OF SUCH DAMAGE. 25 | 26 | 27 | """ 28 | sphinxcontrib.programoutput 29 | =========================== 30 | 31 | This extension provides a directive to include the output of commands as 32 | literal block while building the docs. 33 | 34 | .. 
moduleauthor:: Sebastian Wiesner 35 | """ 36 | 37 | from __future__ import (print_function, division, unicode_literals, 38 | absolute_import) 39 | 40 | import sys 41 | import shlex 42 | from subprocess import Popen, PIPE, STDOUT 43 | from collections import defaultdict, namedtuple 44 | 45 | from docutils import nodes 46 | from docutils.parsers import rst 47 | from docutils.parsers.rst.directives import flag, unchanged, nonnegative_int 48 | 49 | 50 | __version__ = '0.5' 51 | 52 | 53 | class program_output(nodes.Element): 54 | pass 55 | 56 | 57 | def _slice(value): 58 | parts = [int(v.strip()) for v in value.split(',')] 59 | if len(parts) > 2: 60 | raise ValueError('too many slice parts') 61 | return tuple((parts + [None]*2)[:2]) 62 | 63 | 64 | class ProgramOutputDirective(rst.Directive): 65 | has_content = False 66 | final_argument_whitespace = True 67 | required_arguments = 1 68 | 69 | option_spec = dict(shell=flag, prompt=flag, nostderr=flag, 70 | ellipsis=_slice, extraargs=unchanged, 71 | returncode=nonnegative_int) 72 | 73 | def run(self): 74 | node = program_output() 75 | node.line = self.lineno 76 | node['command'] = self.arguments[0] 77 | 78 | if self.name == 'command-output': 79 | node['show_prompt'] = True 80 | else: 81 | node['show_prompt'] = 'prompt' in self.options 82 | 83 | node['hide_standard_error'] = 'nostderr' in self.options 84 | node['extraargs'] = self.options.get('extraargs', '') 85 | node['use_shell'] = 'shell' in self.options 86 | node['returncode'] = self.options.get('returncode', 0) 87 | if 'ellipsis' in self.options: 88 | node['strip_lines'] = self.options['ellipsis'] 89 | return [node] 90 | 91 | 92 | _Command = namedtuple('Command', 'command shell hide_standard_error') 93 | 94 | 95 | class Command(_Command): #pylint: disable=W0232 96 | """ 97 | A command to be executed. 98 | """ 99 | 100 | def __new__(cls, command, shell=False, hide_standard_error=False): 101 | if isinstance(command, list): 102 | command = tuple(command) 103 | return _Command.__new__(cls, command, shell, hide_standard_error) 104 | 105 | @classmethod 106 | def from_program_output_node(cls, node): 107 | """ 108 | Create a command from a :class:`program_output` node. 109 | """ 110 | extraargs = node.get('extraargs', '') 111 | command = (node['command'] + ' ' + extraargs).strip() 112 | return cls(command, node['use_shell'], node['hide_standard_error']) 113 | 114 | def execute(self): 115 | """ 116 | Execute this command. 117 | 118 | Return the :class:`~subprocess.Popen` object representing the running 119 | command. 120 | """ 121 | # pylint: disable=E1101 122 | if isinstance(self.command, unicode): 123 | command = self.command.encode(sys.getfilesystemencoding()) 124 | else: 125 | command = self.command 126 | if isinstance(command, basestring) and not self.shell: 127 | command = shlex.split(command) 128 | return Popen(command, shell=self.shell, stdout=PIPE, 129 | stderr=PIPE if self.hide_standard_error else STDOUT) 130 | 131 | def get_output(self): 132 | """ 133 | Get the output of this command. 134 | 135 | Return a tuple ``(returncode, output)``. ``returncode`` is the 136 | integral return code of the process, ``output`` is the output as 137 | unicode string, with final trailing spaces and new lines stripped. 
138 | """ 139 | process = self.execute() 140 | output = process.communicate()[0].decode( 141 | sys.getfilesystemencoding()).rstrip() 142 | return process.returncode, output 143 | 144 | def __str__(self): 145 | # pylint: disable=E1101 146 | if isinstance(self.command, tuple): 147 | return repr(list(self.command)) 148 | return repr(self.command) 149 | 150 | 151 | class ProgramOutputCache(defaultdict): # pylint: disable=W0232 152 | """ 153 | Execute command and cache their output. 154 | 155 | This class is a mapping. Its keys are :class:`Command` objects represeting 156 | command invocations. Its values are tuples of the form ``(returncode, 157 | output)``, where ``returncode`` is the integral return code of the command, 158 | and ``output`` is the output as unicode string. 159 | 160 | The first time, a key is retrieved from this object, the command is 161 | invoked, and its result is cached. Subsequent access to the same key 162 | returns the cached value. 163 | """ 164 | 165 | def __missing__(self, command): 166 | """ 167 | Called, if a command was not found in the cache. 168 | 169 | ``command`` is an instance of :class:`Command`. 170 | """ 171 | result = command.get_output() 172 | self[command] = result 173 | return result 174 | 175 | 176 | def run_programs(app, doctree): 177 | """ 178 | Execute all programs represented by ``program_output`` nodes in 179 | ``doctree``. Each ``program_output`` node in ``doctree`` is then 180 | replaced with a node, that represents the output of this program. 181 | 182 | The program output is retrieved from the cache in 183 | ``app.env.programoutput_cache``. 184 | """ 185 | if app.config.programoutput_use_ansi: 186 | # enable ANSI support, if requested by config 187 | from sphinxcontrib.ansi import ansi_literal_block 188 | node_class = ansi_literal_block 189 | else: 190 | node_class = nodes.literal_block 191 | 192 | cache = app.env.programoutput_cache 193 | 194 | for node in doctree.traverse(program_output): 195 | command = Command.from_program_output_node(node) 196 | try: 197 | returncode, output = cache[command] 198 | except EnvironmentError as error: 199 | error_message = 'Command {0} failed: {1}'.format(command, error) 200 | error_node = doctree.reporter.error(error_message, base_node=node) 201 | node.replace_self(error_node) 202 | else: 203 | if returncode != node['returncode']: 204 | app.warn('Unexpected return code {0} from command {1}'.format( 205 | returncode, command)) 206 | 207 | # replace lines with ..., if ellipsis is specified 208 | if 'strip_lines' in node: 209 | lines = output.splitlines() 210 | start, stop = node['strip_lines'] 211 | lines[start:stop] = ['...'] 212 | output = '\n'.join(lines) 213 | 214 | if node['show_prompt']: 215 | tmpl = app.config.programoutput_prompt_template 216 | output = tmpl.format(command=node['command'], output=output, 217 | returncode=returncode) 218 | 219 | new_node = node_class(output, output) 220 | new_node['language'] = 'text' 221 | node.replace_self(new_node) 222 | 223 | 224 | def init_cache(app): 225 | """ 226 | Initialize the cache for program output at 227 | ``app.env.programoutput_cache``, if not already present (e.g. being 228 | loaded from a pickled environment). 229 | 230 | The cache is of type :class:`ProgramOutputCache`. 
231 | """ 232 | if not hasattr(app.env, 'programoutput_cache'): 233 | app.env.programoutput_cache = ProgramOutputCache() 234 | 235 | 236 | def setup(app): 237 | app.add_config_value('programoutput_use_ansi', False, 'env') 238 | app.add_config_value('programoutput_prompt_template', 239 | '$ {command}\n{output}', 'env') 240 | app.add_directive('program-output', ProgramOutputDirective) 241 | app.add_directive('command-output', ProgramOutputDirective) 242 | app.connect(b'builder-inited', init_cache) 243 | app.connect(b'doctree-read', run_programs) 244 | -------------------------------------------------------------------------------- /doc/source/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _tutorial: 2 | 3 | Quick start tutorial 4 | ==================== 5 | ADENINE may be installed using standard Python tools (with 6 | administrative or sudo permissions on GNU-Linux platforms):: 7 | 8 | $ pip install adenine 9 | 10 | or 11 | 12 | $ easy_install adenine 13 | 14 | Installation from sources 15 | ------------------------- 16 | If you prefer to install ADENINE manually, download the .zip or .tar.gz archive 17 | from ``_. Then extract it and move into the root directory:: 18 | 19 | $ unzip slipguru-adenine-|release|.zip 20 | $ cd adenine-|release|/ 21 | 22 | or:: 23 | 24 | $ tar xvf slipguru-adenine-|release|.tar.gz 25 | $ cd adenine-|release|/ 26 | 27 | Otherwise you can clone our `GitHub repository <https://github.com/slipguru/adenine>`_:: 28 | 29 | $ git clone https://github.com/slipguru/adenine.git 30 | 31 | From here, you can follow the standard Python installation step:: 32 | 33 | $ python setup.py install 34 | 35 | After ADENINE installation, you should have access to two scripts, 36 | named with a common ``ade_`` prefix:: 37 | 38 | $ ade_ 39 | ade_analysis.py ade_run.py 40 | 41 | This tutorial assumes that you downloaded and extracted the ADENINE 42 | source package, which contains an ``examples/data`` directory with some data files (``.npy`` or ``.csv``) that will be used to show ADENINE functionalities. 43 | 44 | ADENINE needs only 3 ingredients: 45 | 46 | * ``n_samples x n_variables`` input matrix 47 | * ``n_samples x 1`` output vector (optional) 48 | * ``configuration`` file 49 | 50 | 51 | Input data format 52 | ----------------- 53 | Input data are assumed to be: 54 | 55 | * ``numpy`` array stored in ``.npy`` files organized with a row for each sample and a column for each feature, 56 | * tabular data stored in comma separated ``.csv`` files presenting the variables header on the first row and the sample indexes on the first column, 57 | * toy examples available from the ``adenine.utils.data_source`` function. 58 | 59 | .. _configuration: 60 | 61 | Configuration File 62 | ------------------ 63 | The ADENINE configuration file is a standard Python script. It is 64 | imported as a module, then all of its code is executed. In this file the user can define all the options needed to read the data and to create the pipelines. 65 | 66 | .. literalinclude:: ../../adenine/ade_config.py 67 | :language: python 68 | 69 | .. _experiment: 70 | 71 | Experiment runner 72 | ----------------- 73 | The ``ade_run.py`` script executes the full ADENINE framework. The prototype is the following:: 74 | 75 | $ ade_run.py ade_config.py 76 | 77 | When launched, the script reads the data, then it creates and runs each pipeline, saving the results in a tree-like structure that has the current folder as root. 78 | 79 | ..
_analysis: 80 | 81 | Results analysis 82 | ---------------- 83 | The ``ade_analysis.py`` script provides useful summaries and graphs from the results of the experiment. This script accepts as its only parameter a result directory 84 | that has already been created:: 85 | 86 | $ ade_analysis.py result-dir 87 | 88 | The script produces a set of textual and graphical results. An output example obtained by one of the implemented pipelines is shown below. 89 | 90 | .. image:: pca.png 91 | :scale: 80 % 92 | :alt: broken link 93 | 94 | .. image:: kpca.png 95 | :scale: 80 % 96 | :alt: broken link 97 | 98 | You can reproduce the example above by specifying ``data_source.load('circles')`` in the configuration file. 99 | 100 | Example dataset 101 | ---------------- 102 | An example dataset can be downloaded :download:`here `. The dataset is a random extraction of 801 samples (with dimension 20531) measuring RNA-Seq gene expression of patients affected by 5 different types of tumor: breast invasive carcinoma (BRCA), kidney renal clear cell carcinoma (KIRC), colon adenocarcinoma (COAD), lung adenocarcinoma (LUAD) and prostate adenocarcinoma (PRAD). The full dataset is maintained by The Cancer Genome Atlas Pan-Cancer Project [1] and we refer to the `original repository `_ for further details. 103 | 104 | Reference 105 | ---------------- 106 | [1] Weinstein, John N., et al. "The cancer genome atlas pan-cancer analysis project." Nature Genetics 45.10 (2013): 1113-1120. 107 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slipguru/adenine/cd0f65512cc4f66007a057e35619d124f6474389/icon.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | functools32==3.2.3.post2 3 | matplotlib==2.0.0 4 | numpy==1.12.0 5 | pandas==0.19.2 6 | pydot==p 7 | pyparsing==2.1.4 8 | python-dateutil==2.6.0 9 | pytz==2016.10 10 | scikit-learn==0.18.1 11 | scipy==0.18.1 12 | seaborn==0.7.1 13 | six==1.10.0 14 | subprocess32==3.2.7 15 | GEOparse==0.1.10 16 | fastcluster==1.1.20 17 | -------------------------------------------------------------------------------- /scripts/ade_GEO2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | import argparse 11 | import pandas as pd 12 | 13 | from adenine.utils import GEO2csv 14 | from adenine import __version__ 15 | 16 | 17 | def main(): 18 | """Adenine GEO2csv main script.""" 19 | parser = argparse.ArgumentParser(description='Adenine script for ' 20 | 'GEO2csv conversion.') 21 | parser.add_argument('--version', action='version', 22 | version='%(prog)s v' + __version__) 23 | parser.add_argument('accession_number', help='GEO DataSets Accession number') 24 | parser.add_argument('--label_field', dest='pheno_name', 25 | default='title', help='The field in which ' 26 | 'phenotype information is stored.') 27 | parser.add_argument('--phenotypes', '--pheno', dest='pheno', 28 | action='store', default=None, 29 | help='Select samples by their phenotypes (' 30 | 'comma separated) e.g.:
Severe,Mild,Control,...') 31 | parser.add_argument('--gene_symbol', action='store_true', dest='gs', 32 | help='Use this option to convert the platform IDs ' 33 | 'to gene symbols') 34 | parser.add_argument('--signature', dest='signature', 35 | default=None, help='Generate a data matrix comprising ' 36 | 'only the genes in the signature.') 37 | args = parser.parse_args() 38 | 39 | # Get the data 40 | try: 41 | if args.gs or (args.signature is not None): 42 | data, gse = GEO2csv.get_GEO(args.accession_number, args.pheno_name, True) 43 | else: 44 | data = GEO2csv.get_GEO(args.accession_number, args.pheno_name)[0] 45 | print('* GEO dataset {} loaded'.format(args.accession_number)) 46 | 47 | # Filter samples per phenotype 48 | if args.pheno is not None: 49 | data = GEO2csv.GEO_select_samples( 50 | data.data, data.target, selected_labels=args.pheno.split(','), 51 | index=data.index, feature_names=data.feature_names) 52 | print('* Phenotypes {}'.format(args.pheno)) 53 | 54 | if args.gs or (args.signature is not None): 55 | data = GEO2csv.id2gs(data, gse) 56 | print('* Probe ID converted to gene symbols') 57 | 58 | if args.signature is not None: 59 | data = GEO2csv.restrict_to_signature(data, args.signature.split(',')) 60 | print('* Dataset restricted to {}'.format(data.feature_names)) 61 | 62 | # Save dataset 63 | pd.DataFrame(data=data.data, columns=data.feature_names, 64 | index=data.index).to_csv('{}_data.csv'.format(args.accession_number)) 65 | print('* {}_data.csv created: {} samples x {} features'.format(args.accession_number, 66 | *data.data.shape)) 67 | pd.DataFrame(data=data.target, columns=['Phenotype'], 68 | index=data.index).to_csv('{}_labels.csv'.format(args.accession_number)) 69 | print('* {}_labels.csv created: {} samples'.format(args.accession_number, 70 | len(data.target))) 71 | 72 | except Exception as e: 73 | print('Raised {}'.format(e)) 74 | raise ValueError('Cannot parse {}. 
Check ' 75 | 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={}' 76 | ' for more info on the GEO series'.format(args.accession_number, 77 | args.accession_number)) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /scripts/ade_analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Adenine analysis script.""" 3 | ###################################################################### 4 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 5 | # 6 | # FreeBSD License 7 | ###################################################################### 8 | 9 | from __future__ import print_function 10 | 11 | import imp 12 | import sys 13 | import os 14 | import time 15 | import logging 16 | import argparse 17 | import gzip 18 | import numpy as np 19 | try: 20 | import cPickle as pkl 21 | except: 22 | import pickle as pkl 23 | 24 | from adenine.core import analyze_results 25 | from adenine.utils import extra 26 | 27 | 28 | def init_main(): 29 | """Init analysis main.""" 30 | from adenine import __version__ 31 | parser = argparse.ArgumentParser(description='Adenine script for ' 32 | 'analysing pipelines.') 33 | parser.add_argument('--version', action='version', 34 | version='%(prog)s v' + __version__) 35 | parser.add_argument("result_folder", help="specify results directory") 36 | args = parser.parse_args() 37 | 38 | root_folder = args.result_folder 39 | filename = [f for f in os.listdir(root_folder) 40 | if os.path.isfile(os.path.join(root_folder, f)) and 41 | '.pkl' in f and f != "__data.pkl"] 42 | if not filename: 43 | sys.stderr.write("No .pkl file found in {}. Aborting...\n" 44 | .format(root_folder)) 45 | sys.exit(-1) 46 | 47 | # Run analysis 48 | # print("Starting the analysis of {}".format(filename)) 49 | main(os.path.join(os.path.abspath(root_folder), filename[0])) 50 | 51 | 52 | def main(dumpfile): 53 | """Analyze the pipelines.""" 54 | # Load the configuration file 55 | config_path = os.path.dirname(dumpfile) 56 | config_path = os.path.join(os.path.abspath(config_path), 'ade_config.py') 57 | config = imp.load_source('ade_config', config_path) 58 | extra.set_module_defaults(config, {'file_format': 'pdf', 59 | 'plotting_context': 'paper', 60 | 'verbose': False}) 61 | if hasattr(config, 'use_compression'): 62 | use_compression = config.use_compression 63 | else: 64 | use_compression = False 65 | 66 | # Load the results used with ade_run.py 67 | try: 68 | if use_compression: 69 | with gzip.open(os.path.join(os.path.dirname(dumpfile), 70 | '__data.pkl.tz'), 'r') as fdata: 71 | data_X_y_index = pkl.load(fdata) 72 | data = data_X_y_index['X'] 73 | labels = data_X_y_index['y'] 74 | index = data_X_y_index['index'] 75 | else: 76 | with open(os.path.join(os.path.dirname(dumpfile), 77 | '__data.pkl'), 'r') as fdata: 78 | data_X_y_index = pkl.load(fdata) 79 | data = data_X_y_index['X'] 80 | labels = data_X_y_index['y'] 81 | index = data_X_y_index['index'] 82 | except IOError: 83 | if use_compression: 84 | data_filename = '__data.pkl.tz' 85 | else: 86 | data_filename = '__data.pkl' 87 | 88 | sys.stderr.write("Cannot load {} Reloading data from " 89 | "config file ...".format(data_filename)) 90 | data = config.X 91 | labels = config.y 92 | index = config.index if hasattr(config, 'index') \ 93 | else np.arange(data.shape[0]) 94 | 95 | # Read the feature names from the config file 96 | feat_names = config.feat_names if 
hasattr(config, 'feat_names') \ 97 | else np.arange(data.shape[1]) 98 | # Initialize the log file 99 | filename = 'results_' + os.path.basename(dumpfile)[0:-7] 100 | logfile = os.path.join(os.path.dirname(dumpfile), filename + '.log') 101 | logging.basicConfig(filename=logfile, level=logging.INFO, filemode='w', 102 | format='%(levelname)s (%(name)s): %(message)s') 103 | root_logger = logging.getLogger() 104 | lsh = logging.StreamHandler() 105 | lsh.setLevel(20 if config.verbose else logging.ERROR) 106 | lsh.setFormatter( 107 | logging.Formatter('%(levelname)s (%(name)s): %(message)s')) 108 | root_logger.addHandler(lsh) 109 | 110 | tic = time.time() 111 | print("\nUnpickling output ...", end=' ') 112 | # Load the results 113 | if use_compression: 114 | with gzip.open(dumpfile, 'r') as fres: 115 | res = pkl.load(fres) 116 | else: 117 | with open(dumpfile, 'r') as fres: 118 | res = pkl.load(fres) 119 | 120 | print("done: {} s".format(extra.sec_to_time(time.time() - tic))) 121 | 122 | # Analyze the pipelines 123 | analyze_results.analyze(input_dict=res, root=os.path.dirname(dumpfile), 124 | y=labels, feat_names=feat_names, index=index, 125 | plotting_context=config.plotting_context, 126 | file_format=config.file_format) 127 | 128 | root_logger.handlers[0].close() 129 | 130 | 131 | if __name__ == '__main__': 132 | init_main() 133 | -------------------------------------------------------------------------------- /scripts/ade_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | ###################################################################### 5 | # Copyright (C) 2016 Samuele Fiorini, Federico Tomasi, Annalisa Barla 6 | # 7 | # FreeBSD License 8 | ###################################################################### 9 | 10 | import os 11 | import shutil 12 | import argparse 13 | 14 | from adenine import main 15 | 16 | 17 | def init_main(): 18 | """Initialize main for ade_run.py.""" 19 | from adenine import __version__ 20 | parser = argparse.ArgumentParser(description='Adenine script for ' 21 | 'pipeline generation.') 22 | parser.add_argument('--version', action='version', 23 | version='%(prog)s v' + __version__) 24 | parser.add_argument("-c", "--create", dest="create", action="store_true", 25 | help="create config file", default=False) 26 | parser.add_argument("configuration_file", help="specify config file", 27 | default='ade_config.py') 28 | args = parser.parse_args() 29 | 30 | if args.create: 31 | import adenine as ade 32 | std_config_path = os.path.join(ade.__path__[0], 'ade_config.py') 33 | # Check for .pyc 34 | if std_config_path.endswith('.pyc'): 35 | std_config_path = std_config_path[:-1] 36 | # Check if the file already exists 37 | if os.path.exists(args.configuration_file): 38 | parser.error("adenine configuration file already exists") 39 | # Copy the config file 40 | shutil.copy(std_config_path, args.configuration_file) 41 | else: 42 | main(args.configuration_file) 43 | 44 | 45 | if __name__ == '__main__': 46 | init_main() 47 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """adenine setup script.""" 3 | 4 | from 
setuptools import setup 5 | 6 | # Package Version 7 | from adenine import __version__ as version 8 | 9 | setup( 10 | name='adenine', 11 | version=version, 12 | 13 | description=('A Data ExploratioN pIpeliNE'), 14 | long_description=open('README.md').read(), 15 | author='Samuele Fiorini, Federico Tomasi', 16 | author_email='{samuele.fiorini, federico.tomasi}@dibris.unige.it', 17 | maintainer='Samuele Fiorini, Federico Tomasi', 18 | maintainer_email='{samuele.fiorini, federico.tomasi}@dibris.unige.it', 19 | url='https://github.com/slipguru/adenine', 20 | download_url='https://github.com/slipguru/adenine/tarball/'+version, 21 | classifiers=[ 22 | 'Development Status :: 4 - Beta', 23 | 'Environment :: Console', 24 | 'Intended Audience :: Science/Research', 25 | 'Intended Audience :: Developers', 26 | 'Programming Language :: Python', 27 | 'License :: OSI Approved :: BSD License', 28 | 'Topic :: Software Development', 29 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 30 | 'Operating System :: POSIX', 31 | 'Operating System :: Unix', 32 | 'Operating System :: MacOS' 33 | ], 34 | license='FreeBSD', 35 | 36 | packages=['adenine', 'adenine.core', 'adenine.utils', 'adenine.externals'], 37 | install_requires=['numpy (>=1.10.1)', 38 | 'scipy (>=0.16.1)', 39 | 'scikit-learn (>=0.18)', 40 | 'matplotlib (>=1.5.1)', 41 | 'seaborn (>=0.7.0)', 42 | # 'joblib', 43 | 'fastcluster (>=1.1.20)', 44 | 'GEOparse (>=0.1.10)', 45 | 'pydot (>=1.2.3)'], 46 | scripts=['scripts/ade_run.py', 'scripts/ade_analysis.py', 47 | 'scripts/ade_GEO2csv.py'], 48 | ) 49 | --------------------------------------------------------------------------------
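A minimal sketch of how the three console scripts installed by this setup.py fit together, following the quick-start tutorial above; the configuration file name, the results folder name, and the GEO accession number below are placeholders for illustration only, not files shipped with the package::

    $ ade_run.py -c my_config.py              # copy the default ade_config.py template to a new config file
    $ ade_run.py my_config.py                  # create and run the pipelines defined in the config
    $ ade_analysis.py result-dir               # produce summaries and plots from the results folder created by the run
    $ ade_GEO2csv.py GSEnnnn --gene_symbol     # optionally fetch a GEO series as GSEnnnn_data.csv / GSEnnnn_labels.csv

The flags shown are the ones defined in the argparse setup of each script above (``-c/--create``, the positional ``configuration_file``, ``result_folder`` and ``accession_number`` arguments, and ``--gene_symbol``); everything else is hypothetical.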