├── .gitignore
├── .test.sh
├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── diffacto
│   ├── __init__.py
│   ├── __main__.py
│   └── diffacto.py
├── environment.yml
├── example
│   ├── HBY20Mix.peptides.csv
│   ├── HBY20Mix.samples.lst
│   ├── UP000002311_559292.fasta
│   ├── iPRG.novo.pep.csv
│   ├── iPRG.samples.lst
│   └── readme.md
├── run_diffacto.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | cd example
3 | diffacto -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt \
4 |   -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
5 | protfile_size=$(wc -l < iPRG.denovo.protein.txt)
6 | #fdrfile_size=$(wc -l < iPRG.denovo.protein.FDR)
7 | #if [ $protfile_size -ge '200' ] && [ $fdrfile_size -ge '200' ]; then
8 | if [ $protfile_size -ge '200' ]; then
9 |     # All OK
10 |     exit 0
11 | else
12 |     # Something is wrong
13 |     exit 1
14 | fi
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | #  - "3.5"
4 |   - "3.6"
5 | # command to install dependencies
6 | install:
7 |   - python setup.py install
8 | # command to run tests
9 | script: ./.test.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2017 Bo Zhang and Lukas Käll
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE.txt
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Diffacto: Differential Factor Analysis for Comparative Shotgun Proteomics
2 | ==========================================================================
3 |
4 | Requirements
5 | --------------
6 |
7 | Anaconda_ Python 3.5+
8 |
9 | Packages needed:
10 |
11 | - numpy 1.10+
12 | - scipy 0.17+
13 | - pandas 0.18+
14 | - networkx 1.10+
15 | - scikit-learn 0.17+
16 | - pyteomics_ 3.3+
17 |
18 | .. _Anaconda: https://www.anaconda.com/
19 | .. _pyteomics: https://pyteomics.readthedocs.io/
20 |
21 | Installation via ``pip``
22 | *************************
23 |
24 | ::
25 |
26 |    pip install diffacto
27 |
28 |
29 | Usage
30 | -----
31 |
32 | ::
33 |
34 |    diffacto [-h] -i I [-db [DB]] [-samples [SAMPLES]] [-log2 LOG2]
35 |             [-normalize {average,median,GMM,None}]
36 |             [-farms_mu FARMS_MU] [-farms_alpha FARMS_ALPHA]
37 |             [-reference REFERENCE] [-min_samples MIN_SAMPLES]
38 |             [-use_unique USE_UNIQUE]
39 |             [-impute_threshold IMPUTE_THRESHOLD]
40 |             [-cutoff_weight CUTOFF_WEIGHT] [-fast FAST] [-out OUT]
41 |             [-mc_out MC_OUT]
42 |    optional arguments:
43 |      -h, --help            show this help message and exit
44 |      -i I                  Peptide abundances in CSV format. The first row
45 |                            should contain names for all samples. The first column
46 |                            should contain unique peptide sequences. Missing
47 |                            values should be empty instead of zeros. (default:
48 |                            None)
49 |      -db [DB]              Protein database in FASTA format. If None, the peptide
50 |                            file must have protein ID(s) in the second column.
51 |                            (default: None)
52 |      -samples [SAMPLES]    File of the sample list. One run and its sample group
53 |                            per line, separated by tab. If None, read from peptide
54 |                            file headings, then each run will be summarized as a
55 |                            group. (default: None)
56 |      -log2 LOG2            Input abundances are in log scale (True) or linear
57 |                            scale (False) (default: False)
58 |      -normalize {average,median,GMM,None}
59 |                            Method for sample-wise normalization. (default: None)
60 |      -farms_mu FARMS_MU    Hyperparameter mu (default: 0.1)
61 |      -farms_alpha FARMS_ALPHA
62 |                            Hyperparameter weight of prior probability (default:
63 |                            0.1)
64 |      -reference REFERENCE  Names of reference sample groups (separated by
65 |                            semicolon) (default: average)
66 |      -min_samples MIN_SAMPLES
67 |                            Minimum number of samples a peptide must be
68 |                            quantified in (default: 1)
69 |      -use_unique USE_UNIQUE
70 |                            Use unique peptides only (default: False)
71 |      -impute_threshold IMPUTE_THRESHOLD
72 |                            Minimum fraction of missing values within a sample
73 |                            group. Missing values are imputed only when the
74 |                            missing fraction in the group is larger than this threshold.
(default: 0.99)
75 |      -cutoff_weight CUTOFF_WEIGHT
76 |                            Peptides weighted lower than the cutoff will be
77 |                            excluded (default: 0.5)
78 |      -fast FAST            Allow early termination in EM calculation when noise
79 |                            is sufficiently small. (default: False)
80 |      -out OUT              Path to output file (writing in TSV format).
81 |      -mc_out MC_OUT        Path to MCFDR output (writing in TSV format).
82 |                            (default: None)
83 |
84 |
85 | Example
86 | -------
87 |
88 | Examples are given in the example_ directory.
89 |
90 | .. _example: ./example
91 |
--------------------------------------------------------------------------------
/diffacto/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/statisticalbiotechnology/diffacto/6389c9410f4c01aed6f33fd787a9fce0cf304a9e/diffacto/__init__.py
--------------------------------------------------------------------------------
/diffacto/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """diffacto.__main__: executed when the package directory is called as a script."""
5 |
6 |
7 | from .diffacto import main
8 | main()
--------------------------------------------------------------------------------
/diffacto/diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: UTF-8 -*-
3 | from __future__ import division, print_function
4 |
5 | """diffacto.diffacto: provides entry point main()."""
6 |
7 | __version__ = "1.0.7"
8 |
9 | import csv
10 | import re
11 | import warnings
12 | from collections import defaultdict
13 | from multiprocessing import Pool
14 |
15 | from scipy import optimize, stats
16 | import networkx as nx
17 | import numpy as np
18 | import pandas
19 | from numpy import array, isfinite, nanmean, nansum
20 | from pyteomics import fasta
21 |
22 | # from numba import jit  # Enable just-in-time compilation for speed-up
23 | # @jit
24 | def fast_farms(
25 |     probes: np.array,
26 |     weight: float = 0.5,
27 |     mu: float = 0,
28 |     max_iter: int = 1000,
29 |     force_iter: bool = False,
30 |     min_noise: float = 1e-4,
31 |     fill_nan: float = 0.0,
32 | ):
33 |     """Bayesian Factor Analysis for Proteomics Summarization
34 |     A Python translation of the function "generateExprVal.method.farms" from
35 |     Bioconductor FARMS.
36 |     [http://www.bioconductor.org/packages/release/bioc/html/farms.html]
37 |     [http://www.bioinf.jku.at/publications/papers/farms/supplementary.ps]
38 |
39 |     Reference:
40 |         Hochreiter S, Clevert D and Obermayer K (2006). A new summarization
41 |         method for Affymetrix probe level data. Bioinformatics, 22(8),
42 |         http://bioinformatics.oxfordjournals.org/cgi/content/abstract/22/8/943.
43 |
44 |     Inputs:
45 |         probes: Peptide abundance array (N peptides, M samples) in log scale.
46 |         weight: Hyperparameter (backscale factor) value in the range of [0,1]
47 |             which determines the influence of the prior.
48 |         mu: Hyperparameter value which allows one to quantify different aspects
49 |             of potential prior knowledge. A value near zero assumes that
50 |             most genes do not contain a signal, and introduces a bias for
51 |             loading matrix elements near zero.
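        max_iter: Maximum number of EM iterations.
        force_iter: If True, disable the relative-change early-stopping
            criterion and iterate until max_iter (or until the estimates
            stop changing exactly).
        min_noise: Floor applied to small standard deviations, singular
            values and variance terms, for numerical stability.
        fill_nan: Value substituted for missing readouts; when left at 0.0,
            missing values are simply zero-filled before centering.
    Output:
        weights: Per-peptide loadings rescaled to the range of [0,1].
        noise: Estimated noise level, i.e. the fraction of variance left
            unexplained by the common factor.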
""" 52 | 53 | readouts = np.array(probes) 54 | if fill_nan != 0: 55 | readouts[np.isnan(readouts)] = fill_nan 56 | 57 | # normalize and transform X 58 | X = np.nan_to_num(readouts).T 59 | X = X - np.nanmean(X, axis=0) 60 | xsd = np.nanstd(X, axis=0) 61 | xsd[xsd < min_noise] = 1.0 62 | X /= xsd 63 | X[~isfinite(X)] = 0 64 | 65 | n_samples, n_features = X.shape 66 | C = np.cov(X.T, ddof=0) 67 | 68 | # positive definite 69 | C = 0.5 * (C + C.T) 70 | C[np.where(C < 0)] = 0 71 | 72 | # robustness 73 | U, s, V = np.linalg.svd(C) 74 | s[s < min_noise] = min_noise 75 | C = U.dot(np.diag(s)).dot(V) 76 | 77 | # initiation 78 | λ = np.sqrt(np.diag(C) * 0.75) 79 | ψ = np.diag(C) - λ ** 2 80 | old_psi = ψ 81 | old_lambda = λ 82 | alpha = weight * n_features 83 | E = 1.0 84 | min_noise_square = min_noise ** 2 85 | C_diag = np.diag(C) 86 | 87 | for i in range(max_iter): 88 | # E step 89 | φ = λ / ψ 90 | a = 1 + np.matmul(λ.reshape(1, -1), φ.reshape(-1, 1)) 91 | η = φ / a 92 | ζ = C.dot(η.T) 93 | E = 1 - η.dot(λ) + η.dot(ζ) 94 | 95 | # M step 96 | λ = ζ.T / (E + ψ * alpha) 97 | λ = np.asarray(λ)[0] 98 | ψ = C_diag - np.asarray(ζ)[0] * λ + ψ * alpha * λ * (mu - λ) 99 | ψ = np.maximum(ψ, min_noise_square) 100 | if ( 101 | ψ[-1] == old_psi[-1] 102 | and ψ[0] == old_psi[0] 103 | and np.array_equal(ψ, old_psi) 104 | and np.array_equal(λ, old_lambda) 105 | ): 106 | break 107 | 108 | if not force_iter: 109 | if abs(ψ - old_psi).max() / old_psi.max() < min_noise / 10: 110 | break 111 | 112 | old_psi = ψ 113 | old_lambda = λ 114 | 115 | loading = np.sqrt(E[0, 0]) * λ 116 | φ = loading / ψ 117 | weights = loading / loading.max() # rescale loadings to the range of [0,1] 118 | noise = 1 / (1 + np.matmul(loading.reshape(1, -1), φ.reshape(-1, 1))) 119 | noise = noise[0, 0] 120 | return weights, noise 121 | 122 | 123 | # @jit(nogil=True) 124 | def fast_gmean_nomissing(weights, pep_abd, group_ix): 125 | """ 126 | Calculate geometric means based on non-missing peptide readouts. 
127 | """ 128 | abd_w = pep_abd * weights[..., None] 129 | one_w = abd_w / abd_w * weights[..., None] 130 | a_sums = np.nansum(abd_w, axis=0) 131 | w_sums = np.nansum(one_w, axis=0) 132 | expr = a_sums[group_ix].sum(axis=1) / w_sums[group_ix].sum(axis=1) 133 | return expr 134 | 135 | 136 | # @jit(nogil=True) 137 | def sum_squares(pep_abd, group_ix, estimates): 138 | """ 139 | Calculate sum of squared residuals 140 | """ 141 | global nGroups 142 | residual = 0.0 143 | for i in range(nGroups): 144 | res = pep_abd[:, group_ix[i]] - estimates[i] 145 | residual += np.nansum(res * res) 146 | return residual 147 | 148 | 149 | # @jit(nogil=True) 150 | def f_ANOVA(pep_abd, group_ix, estimates, null_ave, dof_loss=0): 151 | """ 152 | Perform ANOVA 153 | Inputs: 154 | pep_abd: Peptide abundance matrix 155 | group_ix: Index of sample groups 156 | estimates: Estimated abundances of sample groups 157 | null_ave: Global average 158 | dof_loss: Loss of dof due to averaging 159 | Return: 160 | f: Value of F-statistic 161 | dof1: Degree of freedom of model 1 162 | dof2: Degree of freedom of model 2 163 | """ 164 | global nGroups 165 | ss_total = sum_squares(pep_abd, group_ix, null_ave) 166 | ss_resid = sum_squares(pep_abd, group_ix, estimates) 167 | dof1 = nGroups - 1 168 | dof2 = isfinite(pep_abd).sum() - nGroups - dof_loss 169 | if dof2 <= 0: 170 | return np.nan, dof1, dof2 171 | f = ((ss_total - ss_resid) / dof1) / (ss_resid / dof2) 172 | return f, dof1, dof2 173 | 174 | 175 | def mv_impute(pep_abd, group_ix, least_missing=0.99, impute_as=0.001): 176 | """ Impute missing values when having a large proportion in a sample group. 177 | Inputs: 178 | pep_abd: n peptides, m samples, in linear scale 179 | group_ix: grouping index for each of the m samples 180 | least_missing: set the minimum threshold of missing rate to trigger the imputation (Default: 99%). 
181 | impute_as: set missing values in the sample to this value 182 | Return: 183 | numpy array after replacing missing values with imputed values 184 | """ 185 | aT = np.array(pep_abd).T 186 | for ix in group_ix: 187 | if np.isnan(aT[ix]).sum() > least_missing * len(aT[ix].flatten()): 188 | val = aT[ix] 189 | val[np.where(np.isnan(val))] = impute_as 190 | aT[ix] = val 191 | return aT.T 192 | 193 | 194 | # @jit(nogil=True) 195 | def weighted_average(weights, pep_abd, group_ix): 196 | """ 197 | Calculate weighted geometric means for sample groups 198 | Inputs: 199 | weights: Weights of peptides after filtering by loading threshold 200 | pep_abd: Peptide abundances after filtering by loading threshold 201 | group_ix: Array indexes of sample groups 202 | Return: 203 | expr: Estimated expression levels 204 | """ 205 | global nGroups 206 | abd_w = pep_abd * weights[..., None] 207 | count_peptides = np.sum(~np.isnan(pep_abd), axis = 0) 208 | one_w = abd_w / abd_w * weights[..., None] 209 | a_sums = np.nansum(abd_w, axis=0) 210 | w_sums = np.nansum(one_w, axis=0) 211 | expr = np.empty(nGroups) 212 | for i in range(expr.shape[0]): 213 | if count_peptides[i] > 0: 214 | expr[i] = a_sums[group_ix[i]].sum() / w_sums[group_ix[i]].sum() 215 | else: 216 | expr[i] = np.nan 217 | return expr 218 | 219 | def _init_pool(the_dict): 220 | global prot_dict 221 | prot_dict = the_dict 222 | 223 | def _load_fasta(db, id_regex): 224 | prot_dict = dict() 225 | for header, seq in fasta.read(db): 226 | seq = seq.replace("I", "L").upper() # convert DB sequence I -> L 227 | prot_id = header.split()[0] 228 | if id_regex is not None: 229 | find_id = re.findall(id_regex, header) 230 | if len(find_id) > 0: 231 | prot_id = find_id[0] 232 | prot_dict[prot_id] = seq 233 | 234 | return prot_dict 235 | 236 | 237 | def _map_seq(p): 238 | global prot_dict 239 | pairs = [] 240 | for prot_id, seq in prot_dict.items(): 241 | if p in seq: 242 | pairs.append([p, prot_id]) 243 | return pairs 244 | 245 | def peptide_db_graph(peps, db, id_regex=None): 246 | """ search a set of peptides against a FASTA database """ 247 | g = nx.Graph() 248 | protdict = _load_fasta(db, id_regex) 249 | 250 | with Pool(initializer = _init_pool, initargs=(protdict,)) as pool: 251 | mapped_ppps = pool.map(_map_seq, peps) 252 | 253 | for ppps in mapped_ppps: 254 | if len(ppps): 255 | g.add_edges_from(ppps) 256 | return g 257 | 258 | 259 | def parsimony_grouping(g, peps): 260 | """ Group peptides to proteins using the rule of parsimony 261 | Inputs: 262 | g: an undirected graph with peptide <-> protein as edges 263 | peps: the set of peptide sequences, nodes not listed in the peptide set are protein IDs. 
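    Note:
        Protein groups are reported greedily: each round picks the protein
        explaining the most not-yet-reported peptides; proteins with an
        identical peptide set are merged into one ";"-separated group, and
        proteins whose peptides form a subset of the chosen set are suppressed.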
264 | Return: 265 | prot_groups: a dictionary with mappings between proteins (keys) to peptides (values) 266 | """ 267 | not_peps = set(g.nodes()) - set(peps) 268 | prot_groups = dict() 269 | for cc in (g.subgraph(c).copy() for c in nx.connected_components(g)): 270 | in_group_peptides = set(cc.nodes()) - not_peps 271 | in_group_proteins = not_peps.intersection(cc.nodes()) 272 | 273 | if len(in_group_proteins) == 1: 274 | prot_groups[in_group_proteins.pop()] = in_group_peptides 275 | elif len(in_group_proteins) > 1: 276 | reported = set() 277 | while len(in_group_proteins - reported) > 0: 278 | candidate_proteins = sorted( 279 | in_group_proteins - reported, 280 | key=lambda p: (len(set(cc[p].keys()) - reported), p), 281 | reverse=True, 282 | ) 283 | p = candidate_proteins[0] 284 | current_peps = set(cc[p].keys()) 285 | plabel = [p] 286 | for i in range(1, len(candidate_proteins)): 287 | _p = candidate_proteins[i] 288 | _peps = set(cc[_p].keys()) 289 | if _peps == current_peps: 290 | plabel.append(_p) 291 | if len(_peps - current_peps) == 0: 292 | reported.add(_p) 293 | 294 | plabel = ";".join(sorted(plabel)) 295 | if len(current_peps - reported) > 0: 296 | prot_groups[plabel] = current_peps 297 | reported = reported.union(current_peps) 298 | reported.add(p) 299 | return prot_groups 300 | 301 | 302 | def protein_grouping(df, proteinDb): 303 | """ 304 | Grouping peptide sequences in the given dataframe (df) 305 | by mapping to a protein database (FASTA); 306 | or by the first column of dataframe when the database is absent 307 | """ 308 | peptides = sorted(set(df.index)) 309 | if not proteinDb: 310 | g = nx.Graph() 311 | for i, x in df.iterrows(): 312 | for prot in x.values.astype("str")[0].split(";"): 313 | if len(prot) > 0: 314 | g.add_edge(i, prot) 315 | else: 316 | g = peptide_db_graph(peptides, proteinDb) 317 | pg = parsimony_grouping(g, peptides) 318 | return pg 319 | 320 | 321 | def zero_center_normalize(df, samples, logInput=False, method="median"): 322 | """ 323 | Transforming input peptide abundance table into log2-scale and centralize to zero. 324 | Inputs: 325 | df : dataframe of peptide abundances 326 | samples: column names of selected samples 327 | logInput: input abundances are already in log scale 328 | method: method for estimating zero point 329 | Return: 330 | df: the dataframe of peptide abundances after normalization 331 | """ 332 | assert method in ( 333 | "median", 334 | "average", 335 | "GMM", 336 | ), "Zero centering method has to be among median, average or GMM!" 
337 | 338 | if not logInput: 339 | # convert abundances to log2 scale 340 | df[samples] = df[samples].apply(np.log2) 341 | if method == "average": 342 | norm_scale = np.nanmean(df[samples], axis=0) 343 | elif method == "median": 344 | norm_scale = np.nanmedian(df[samples], axis=0) 345 | elif method == "GMM": 346 | """ two-component Gaussian mixture model """ 347 | from sklearn.mixture import GaussianMixture as GMM 348 | 349 | gmm = GMM(2) 350 | 351 | norm_scale = [] 352 | for sp in samples: 353 | v = df[sp].values 354 | v = v[np.logical_not(np.isnan(v))] 355 | v = v[np.logical_not(np.isinf(v))] 356 | try: 357 | gmm.fit(np.matrix(v).T) 358 | vmean = gmm.means_[np.argmin(gmm.covariances_)][0] 359 | norm_scale.append(vmean) 360 | except: 361 | norm_scale.append(np.nanmean(v)) 362 | norm_scale = np.array(norm_scale) 363 | 364 | print( 365 | "Caution!!", 366 | "Two-component Gaussian mixture model is used to center peptide abundances!", 367 | "Centring factors are:", 368 | *[ 369 | "Sample:{}\tGMM:{:.3f}\t Median:{:.3f}".format(s, g, m) 370 | for s, g, m in zip( 371 | samples, norm_scale, np.nanmedian(df[samples], axis=0) 372 | ) 373 | ], 374 | "Check if GMM estimated values deviate greatly from median values.", 375 | "If in doubt, use other metrics (e.g. median) to centre the abundances!!\n", 376 | sep="\n" 377 | ) 378 | df[samples] = df[samples] - norm_scale 379 | return df 380 | 381 | 382 | def pqpq(peptide_abundances, metric="correlation", method="complete", t=0.4): 383 | """ The essential PQPQ2 process from @yafeng 384 | [https://github.com/yafeng/pqpq_python/blob/master/pqpq2.py] 385 | """ 386 | from scipy import cluster 387 | 388 | d = cluster.hierarchy.distance.pdist(peptide_abundances, metric) 389 | if metric == "correlation": 390 | D = np.clip(d, 0, 2) 391 | else: 392 | D = d 393 | L = cluster.hierarchy.linkage(D, method, metric) 394 | ind = cluster.hierarchy.fcluster(L, t, "distance") 395 | return ind 396 | 397 | 398 | # ===================== 399 | # Monte Carlo permutation tests 400 | def monte_carlo_permutation(samp_index, n): 401 | """ 402 | Generating a batch of random permutations of sample indexes 403 | Inputs: 404 | samp_index: array indexes of sample groups 405 | n: size of the batch of permutations 406 | """ 407 | flat = np.hstack(samp_index) 408 | ix = [0] 409 | [ix.append(ix[-1] + len(i)) for i in samp_index] 410 | for i in range(n): 411 | permute = np.random.permutation(flat) 412 | new_ix = [permute[ix[i - 1] : ix[i]] for i in range(1, len(ix))] 413 | yield np.array(new_ix) 414 | 415 | 416 | def calc_q(pvals): 417 | """ 418 | Calculate q-values based on a list of p-values, with a conservative estimate 419 | of the proportion of true null hypotheses (pi0_hat) based on the given p-values. 
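    pi0_hat is estimated as min(1, 2 * mean(p)); q-values then follow a
    Benjamini-Hochberg style step-up, q = p * pi0_hat * m / rank, made
    monotone over the ranking. Entries with non-finite p-values get q = 1.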
420 | """ 421 | pv = np.array(pvals) 422 | pv = pv[isfinite(pv)] 423 | pi0_hat = min(1, np.sum(pv) * 2 / len(pv)) 424 | ranking = pv.argsort().argsort() + 1 425 | qlist = pv * pi0_hat * len(pv) / ranking 426 | for i, rank in enumerate(ranking): 427 | qlist[i] = min(qlist[ranking >= rank]) 428 | qlist = list(qlist) 429 | qvals = np.ones_like(pvals).tolist() 430 | for i, e in enumerate(pvals): 431 | if isfinite(e): 432 | qvals[i] = qlist.pop(0) 433 | return np.array(qvals) 434 | 435 | 436 | def perform_mcfdr( 437 | diffacto_res, 438 | sampIx, 439 | max_mc=1e5, 440 | batch_size=100, 441 | terminate_t=50, 442 | target_fdr=0.05, 443 | sn_threshold=-20, 444 | ): 445 | """ 446 | Sequential Monte Carlo permutation test 447 | Inputs: 448 | diffacto_res: a dictionary of Diffacto statistics for each protein 449 | sampIx: array indexes of sample groups 450 | max_mc: maximun number of random permutations 451 | batch_size: number of permutations for every iteration 452 | terminate_t: target number of permutation tests with better statistics to terminate the simulation for one protein 453 | target_fdr: target level of FDR to stop simulation for the remaining proteins. 454 | sn_threshold: signal-to-noise threshold for exclusion of non-informative proteins. 455 | """ 456 | proteins = sorted(diffacto_res.keys()) 457 | preTermination = set() 458 | for batch in range(1, int(max_mc / batch_size) + 2): 459 | mc_pvals = [] 460 | for prot in proteins: 461 | grand_ave, weight, abd_qc, sn, f, T, N = diffacto_res[prot] 462 | if sn <= sn_threshold: 463 | mc_pvals.append(np.nan) 464 | preTermination.add(prot) 465 | continue 466 | if prot in preTermination: 467 | mc_pvals.append(T / N) 468 | continue 469 | for ix in monte_carlo_permutation(sampIx, batch_size): 470 | N += 1 471 | try: 472 | yhat = weighted_average(weight, abd_qc, ix) 473 | f_mc, _, _ = f_ANOVA(abd_qc, ix, yhat, grand_ave) 474 | except: 475 | f_mc = f 476 | if f_mc >= f: 477 | T += 1 478 | diffacto_res[prot][-1] = N # 1 + Total MC simulations performed 479 | diffacto_res[prot][-2] = T # 1 + MC simulations with better stats 480 | mc_pvals.append(T / N) 481 | if T >= terminate_t: 482 | preTermination.add(prot) 483 | 484 | mc_fdr = calc_q(mc_pvals) 485 | curr_prot = [proteins.index(p) for p in proteins if p not in preTermination] 486 | 487 | if ( 488 | len(curr_prot) == 0 489 | or max(mc_fdr[curr_prot]) < target_fdr 490 | or batch * batch_size >= max_mc 491 | ): 492 | print("Monte Carlo permutation test finished.") 493 | return zip(proteins, mc_pvals, mc_fdr) 494 | else: 495 | print( 496 | "%d times simulation, %d proteins remaining (FDR %.3f)" 497 | % (batch * batch_size, len(curr_prot), max(mc_fdr[curr_prot])) 498 | ) 499 | 500 | 501 | # ================================================= 502 | # Main 503 | # ================================================= 504 | def main(): 505 | import argparse 506 | import sys 507 | 508 | DEBUG = False 509 | SUMMARIZE_EACH_RUN = False 510 | TOPN = 3 511 | T_PQPQ = 0.4 512 | EXAMPLE = "HUMAN" 513 | 514 | MC_SIMULATION = True 515 | MC_MAX_N = 200000 516 | MC_BATCH_SIZE = 100 517 | MC_MAX_HIT = MC_MAX_N / 1000 518 | 519 | apars = argparse.ArgumentParser( 520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 521 | ) 522 | 523 | apars.add_argument( 524 | "-i", 525 | required=True, 526 | nargs=1, 527 | help="""Peptides abundances in CSV format. 528 | The first row should contain names for all samples. 529 | The first column should contain unique peptide sequences. 530 | Missing values should be empty instead of zeros. 
531 | """, 532 | ) 533 | # The first column contains unique peptide sequences 534 | # Missing values should be empty instead of zeros 535 | 536 | apars.add_argument( 537 | "-db", 538 | nargs="?", 539 | help="""Protein database in FASTA format. 540 | If None, the peptide file must have protein ID(s) in the second column. 541 | """, 542 | ) 543 | 544 | apars.add_argument( 545 | "-samples", 546 | nargs="?", 547 | help="""File of the sample list. 548 | One run and its sample group per line, separated by tab. 549 | If None, read from peptide file headings, 550 | then each run will be summarized as a group. 551 | """, 552 | ) 553 | 554 | apars.add_argument( 555 | "-log2", 556 | default="False", 557 | help="Input abundances are in log scale (True) or linear scale (False)", 558 | ) 559 | 560 | apars.add_argument( 561 | "-normalize", 562 | choices=["average", "median", "GMM", "None"], 563 | default="None", 564 | help="Method for sample-wise normalization.", 565 | ) 566 | # Normalize input abundances (per sample) to zero-centered in log-scale 567 | # Valid methods include: 'average', 'median' or 'GMM' (two-component 568 | # Gaussian mixture model). If None (default), do not normalize. 569 | 570 | apars.add_argument("-farms_mu", type=float, default=0.1, help="Hyperparameter mu") 571 | # Hyperparameter mu of the FARMS algorithm: prior knowledge of the 572 | # expected loading. 573 | 574 | apars.add_argument( 575 | "-farms_alpha", 576 | type=float, 577 | default=0.1, 578 | help="Hyperparameter weight of prior probability", 579 | ) 580 | # Hyperparameter weight of the FARMS algorithm: weight of prior 581 | # probability in EM calculation. 582 | 583 | apars.add_argument( 584 | "-reference", 585 | default="average", 586 | help="Names of reference sample groups (separated by semicolon)", 587 | ) 588 | # If average (default) calculate average abundance as the reference. 589 | # Otherwise, keep peptide abundance values as is. 590 | 591 | apars.add_argument( 592 | "-min_samples", 593 | type=int, 594 | default=1, 595 | help="Minimum number of samples peptides needed to be quantified in", 596 | ) 597 | # Peptides quantified in less than the minimum number will be discarded 598 | 599 | apars.add_argument("-use_unique", default="False", help="Use unique peptides only") 600 | 601 | apars.add_argument( 602 | "-impute_threshold", 603 | type=float, 604 | default=0.99, 605 | help=( 606 | "Minimum fraction of missing values in the group. " 607 | "Impute missing values if missing fraction is larger than the threshold. " 608 | ), 609 | ) 610 | 611 | apars.add_argument( 612 | "-cutoff_weight", 613 | type=float, 614 | default=0.5, 615 | help="Peptides weighted lower than the cutoff will be excluded", 616 | ) 617 | 618 | apars.add_argument( 619 | "-fast", 620 | default="False", 621 | help="Allow early termination in EM calculation when noise is sufficiently small.", 622 | ) 623 | 624 | apars.add_argument( 625 | "-out", 626 | type=argparse.FileType("w"), 627 | default=sys.stdout, 628 | help="Path to output file (writing in TSV format).", 629 | ) 630 | 631 | apars.add_argument( 632 | "-mc_out", default=None, help="Path to MCFDR output (writing in TSV format)." 
633 | ) 634 | 635 | apars.add_argument('-loadings_out', default=None, 636 | help='File for peptide loadings (writing in TSV format).') 637 | # ------------------------------------------------ 638 | args = apars.parse_args() 639 | 640 | def boolparam(p): 641 | """ convert a string parameter to boolean value""" 642 | if str(p).lower() in ("yes", "true", "t", "y", "1"): 643 | return True 644 | else: 645 | return False 646 | 647 | args.log2 = boolparam(args.log2) 648 | args.fast = boolparam(args.fast) 649 | args.use_unique = boolparam(args.use_unique) 650 | print(args) 651 | diffacto_res = dict() 652 | df = pandas.read_csv(args.i[0], index_col=0) 653 | df.index = [i.upper().replace("I", "L") for i in df.index] 654 | print("Abundance matrix loaded: %d peptides" % len(df.index)) 655 | 656 | if not args.samples: 657 | # read sample names from header 658 | samples = df.columns.tolist() 659 | if args.db is None: 660 | samples.pop(0) 661 | groups = samples 662 | else: 663 | # read sample labels 664 | samples, groups = ([], []) 665 | with open(args.samples) as fh: 666 | for line in fh.readlines(): 667 | try: 668 | _s, _g = line.rstrip().split("\t") 669 | samples.append(_s) 670 | groups.append(_g) 671 | except ValueError: 672 | pass 673 | 674 | # per sample normalization of peptide abundances 675 | logInput = args.log2 676 | if not args.normalize == "None": 677 | df = zero_center_normalize( 678 | df, samples, logInput=logInput, method=args.normalize 679 | ) 680 | args.log2 = True 681 | 682 | # select reference runs if specified 683 | ref_samples = [] 684 | if args.reference: 685 | for r in args.reference.split(";"): 686 | for i in range(len(groups)): 687 | if groups[i] == r: 688 | ref_samples.append(i) 689 | ref_samples = [samples[i] for i in ref_samples] 690 | 691 | print("Number of runs: %d" % len(samples)) 692 | 693 | # sample grouping 694 | group_names = [ 695 | i 696 | for i in sorted(set(groups), key=lambda k: "{0:0>50}".format(k)) 697 | if i not in args.reference.split(";") 698 | ] 699 | if len(group_names) == len(samples): 700 | group_names = samples 701 | 702 | sampIx = np.array( 703 | [[j for j in range(len(groups)) if groups[j] == i] for i in group_names] 704 | ) 705 | global nGroups 706 | nGroups = len(group_names) 707 | print("Number of sample groups: %d" % nGroups) 708 | print("Reference runs (%d): " % len(ref_samples), *ref_samples, sep="\t") 709 | 710 | # protein grouping 711 | pg = protein_grouping(df, args.db) 712 | print("Number of protein groups: %d" % len(pg.keys())) 713 | 714 | # coverage filtering 715 | df = df[ 716 | [ 717 | np.count_nonzero(np.nan_to_num(v)) >= args.min_samples 718 | for v in df[samples].values 719 | ] 720 | ] 721 | 722 | # reversed mapping (peptide to protein group) for checking peptide uniqueness. 723 | pep2prot = defaultdict(list) 724 | for prot_ids, bseqs in pg.items(): 725 | for s in bseqs: 726 | pep2prot[s] += prot_ids.split() 727 | 728 | # use unique peptides 729 | if args.use_unique: 730 | df = df[[len(pep2prot[p]) == 1 for p in df.index]] 731 | 732 | # Check that we don't have any peptides with a single non-missing value. 733 | # These tend to break diffacto, because in fast_farms we end up with a covariance matrix of less than full rank. Which the algorithm is not set up to handle. 
734 | nonZeroNonMissing = np.vectorize( 735 | lambda x: ~np.isnan(x) and x != 0, otypes=[np.bool_] 736 | ) 737 | if df.shape[0] > 0: 738 | for prot in sorted(pg.keys()): 739 | if prot == "nan": 740 | continue 741 | if DEBUG and EXAMPLE not in prot: 742 | continue 743 | # =====----=====-----=====-----===== 744 | peps = pg[prot] # constituent peptides 745 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe 746 | pep_count = len(dx) # number of peptides 747 | pep_abd = dx[samples].values 748 | counts = np.sum(nonZeroNonMissing(pep_abd), axis=1) 749 | if any(counts < 2): 750 | print( 751 | "Protein {} contained peptides with fewer than two non-missing or non-zero values. Please remove these peptides".format( 752 | prot 753 | ) 754 | ) 755 | return 756 | 757 | if args.loadings_out is not None: 758 | loadings_out_file = open(args.loadings_out, 'w') 759 | # ------------------------------------------------------------------------- 760 | # perform differential analysis 761 | output_header = ["Protein", "N.Pept", "Q.Pept", "S/N", "P(PECA)"] 762 | output_header += group_names 763 | if SUMMARIZE_EACH_RUN: 764 | output_header += ["P(Top-%d)" % TOPN, "P(Median)", "P(PQPQ)"] 765 | output_header += ["Top-%d_%s" % (TOPN, s) for s in samples] 766 | output_header += ["Median_%s" % s for s in samples] 767 | output_header += ["PQPQ_%s" % s for s in samples] 768 | 769 | print(*output_header, sep="\t", file=args.out) 770 | for prot in sorted(pg.keys()): 771 | if prot == "nan": 772 | continue 773 | if DEBUG and EXAMPLE not in prot: 774 | continue 775 | # =====----=====-----=====-----===== 776 | peps = pg[prot] # constituent peptides 777 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe 778 | pep_count = len(dx) # number of peptides 779 | pep_abd = dx[samples].values 780 | 781 | if len(ref_samples): # rescale peptide abundances by reference runs 782 | reference_abundance = ( 783 | dx[ref_samples].mean(axis=1).fillna(np.nanmean(dx[samples])).values 784 | ) 785 | elif args.reference.lower() == "average": # rescale by average values 786 | reference_abundance = dx[samples].mean(axis=1).values 787 | else: 788 | if not args.log2: 789 | reference_abundance = 1.0 790 | else: 791 | reference_abundance = 0 792 | 793 | if not args.log2: 794 | pep_abd = np.log2(pep_abd) 795 | reference_abundance = np.log2(reference_abundance) 796 | 797 | pep_abd = (pep_abd.T - reference_abundance).T 798 | 799 | if pep_count == 1: 800 | # single peptide group 801 | loading = array([1 for _ in dx.index]) 802 | noise = 1.0 803 | continue 804 | # do not report 805 | elif pep_count > 1: 806 | loading, noise = fast_farms( 807 | pep_abd, 808 | mu=args.farms_mu, 809 | weight=args.farms_alpha, 810 | max_iter=1000, 811 | force_iter=not args.fast, 812 | ) 813 | else: 814 | continue 815 | 816 | if noise < 1: 817 | sn = 10 * np.log10((1 - noise) / noise) 818 | else: 819 | # fix log(0) issue 820 | sn = -np.inf 821 | 822 | if args.loadings_out is not None: 823 | for pep, pepLoading in zip(peps, loading): 824 | print(prot, pep, pepLoading, sep="\t", file = loadings_out_file) 825 | 826 | qc = loading > args.cutoff_weight 827 | abd_qc = mv_impute( 828 | pep_abd[qc], 829 | sampIx, 830 | least_missing=args.impute_threshold, 831 | impute_as=np.nanmin(pep_abd) - 1, 832 | ) 833 | protein_summary_group = weighted_average(loading[qc], abd_qc, sampIx) 834 | 835 | if SUMMARIZE_EACH_RUN: 836 | with warnings.catch_warnings(): 837 | warnings.simplefilter("ignore", category=RuntimeWarning) 838 | # Top-N averaging 839 | v = 
dx[samples].values 840 | if logInput: 841 | v = 2 ** v 842 | protein_summary_topn = np.array( 843 | [ 844 | np.mean(np.sort(v[:, i][isfinite(v[:, i])])[-TOPN:]) 845 | for i in range(len(samples)) 846 | ] 847 | ) 848 | p_ave = stats.f_oneway( 849 | *[ 850 | protein_summary_topn[s][isfinite(protein_summary_topn[s])] 851 | for s in sampIx 852 | ] 853 | )[1] 854 | 855 | # Median 856 | v = dx[samples].values 857 | protein_summary_median = np.nanmedian(v, axis=0) 858 | p_med = stats.f_oneway( 859 | *[ 860 | protein_summary_median[s][isfinite(protein_summary_median[s])] 861 | for s in sampIx 862 | ] 863 | )[1] 864 | 865 | # PQPQ clustering and averaging 866 | v = np.nan_to_num(pep_abd) 867 | clusters = pqpq(v, t=T_PQPQ) 868 | major = sorted( 869 | [(len(clusters[clusters == i]), i) for i in set(clusters.tolist())] 870 | )[-1] 871 | if major[0] >= 2: 872 | clusters[clusters != major[1]] = 0 873 | clusters[clusters != 0] = 1 874 | else: 875 | clusters = np.ones(*clusters.shape) 876 | protein_summary_pqpq = np.nanmean( 877 | dx[samples].values[clusters > 0], axis=0 878 | ) 879 | p_pqpq = stats.f_oneway( 880 | *[ 881 | protein_summary_pqpq[s][isfinite(protein_summary_pqpq[s])] 882 | for s in sampIx 883 | ] 884 | )[1] 885 | 886 | # ================================================================ 887 | # PECA: grouping peptide-level p-values based on beta distribution 888 | # https://www.bioconductor.org/packages/release/bioc/html/PECA.html 889 | """ 890 | Calculate Probe-level Expression Change Averages (PECA) 891 | to identify differential expression in Affymetrix gene expression 892 | microarray studies or in proteomic studies using peptide-level 893 | mesurements respectively. 894 | """ 895 | pep_pvals = [] 896 | for pep_v in abd_qc: 897 | with warnings.catch_warnings(): 898 | warnings.simplefilter("ignore", category=RuntimeWarning) 899 | ave_0 = np.broadcast_to(np.nanmean(pep_v), (nGroups, 1)) 900 | ave_1 = np.array([np.nanmean(pep_v[i]) for i in sampIx]) 901 | try: 902 | f, d1, d2 = f_ANOVA(pep_v[None, ...], sampIx, ave_1, ave_0) 903 | pv = stats.f.sf(f, d1, d2) 904 | pep_pvals.append(pv) 905 | except: 906 | pass 907 | 908 | pep_pvals = np.array(pep_pvals) 909 | pep_pvals = pep_pvals[isfinite(pep_pvals)] 910 | beta_ab = len(pep_pvals) / 2 + 0.5 911 | if len(pep_pvals) > 0: 912 | p_peca = stats.beta.cdf(np.median(pep_pvals), beta_ab, beta_ab) 913 | else: 914 | p_peca = np.nan 915 | 916 | if MC_SIMULATION: 917 | grand_ave = np.broadcast_to(np.nanmean(abd_qc), (nGroups, 1)) 918 | f, _, _ = f_ANOVA(abd_qc, sampIx, protein_summary_group, grand_ave) 919 | diffacto_res[prot] = [grand_ave, loading[qc], abd_qc, sn, f, 1, 1] 920 | 921 | # ============================= 922 | if not logInput: 923 | protein_summary_group = 2 ** protein_summary_group 924 | 925 | output_row = [prot, pep_count, sum(qc), sn, p_peca] + list( 926 | protein_summary_group 927 | ) 928 | 929 | if SUMMARIZE_EACH_RUN: 930 | output_row += ( 931 | [p_ave, p_med, p_pqpq] 932 | + list(protein_summary_topn) 933 | + list(protein_summary_median) 934 | + list(protein_summary_pqpq) 935 | ) 936 | 937 | print(*output_row, sep="\t", file=args.out) 938 | 939 | if MC_SIMULATION and args.mc_out: 940 | try: 941 | mc_out = open(args.mc_out, "w") 942 | except: 943 | print("Cannot open file: ", args.mc_out, ". 
Use stdout instead.") 944 | mc_out = sys.stdout 945 | 946 | print("Protein", "P(MC)", "MCFDR", sep="\t", file=mc_out) 947 | mc_result = perform_mcfdr( 948 | diffacto_res, 949 | sampIx, 950 | max_mc=MC_MAX_N, 951 | batch_size=MC_BATCH_SIZE, 952 | terminate_t=MC_MAX_HIT, 953 | target_fdr=0.05, 954 | ) 955 | 956 | for prot, p, q in mc_result: 957 | print(prot, p, q, sep="\t", file=mc_out) 958 | 959 | 960 | if __name__ == "__main__": 961 | main() 962 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: diffacto_35 2 | channels: 3 | - defaults 4 | dependencies: 5 | - ca-certificates=2017.08.26=ha1e5d58_0 6 | - certifi=2017.11.5=py35hd00889a_0 7 | - decorator=4.1.2=py35hf37c5b3_0 8 | - intel-openmp=2018.0.0=h8158457_8 9 | - libcxx=4.0.1=h579ed51_0 10 | - libcxxabi=4.0.1=hebd6815_0 11 | - libedit=3.1=hb4e282d_0 12 | - libffi=3.2.1=h475c297_4 13 | - libgfortran=3.0.1=h93005f0_2 14 | - mkl=2018.0.1=hfbd8650_4 15 | - ncurses=6.0=hd04f020_2 16 | - networkx=2.0=py35hb193ae4_0 17 | - numpy=1.14.0=py35h8a80b8c_0 18 | - openssl=1.0.2n=hdbc3d79_0 19 | - pandas=0.22.0=py35h0a44026_0 20 | - pip=9.0.1=py35h33ce766_4 21 | - python=3.5.4=he720263_23 22 | - python-dateutil=2.6.1=py35h10515e0_1 23 | - pytz=2017.3=py35heeb7564_0 24 | - readline=7.0=hc1231fa_4 25 | - scikit-learn=0.19.1=py35h2b554eb_0 26 | - scipy=1.0.0=py35h8b35106_0 27 | - setuptools=36.5.0=py35h52cde6a_0 28 | - six=1.11.0=py35h39a4c60_1 29 | - sqlite=3.20.1=h7e4c145_2 30 | - tk=8.6.7=h35a86e2_3 31 | - wheel=0.30.0=py35h5c0b906_1 32 | - xz=5.2.3=h0278029_2 33 | - zlib=1.2.11=hf3cbc9b_2 34 | - pip: 35 | - altgraph==0.15 36 | - future==0.16.0 37 | - macholib==1.9 38 | - pefile==2017.11.5 39 | - pyinstaller==3.3.1 40 | - pyteomics==3.4.2 41 | -------------------------------------------------------------------------------- /example/HBY20Mix.samples.lst: -------------------------------------------------------------------------------- 1 | 20160112_P1_SEG_MID P1 2 | 20160112_P1_SEG_LOW P1 3 | 20160112_P1_SEG_HIGH P1 4 | 20160112_P2_SEG_LOW P2 5 | 20160112_P2_SEG_MID P2 6 | 20160112_P2_SEG_HIGH P2 7 | 20160112_P3_SEG_HIGH P3 8 | 20160112_P3_SEG_LOW P3 9 | 20160112_P3_SEG_MID P3 10 | 20160112_P4_SEG_HIGH P4 11 | 20160112_P4_SEG_LOW P4 12 | 20160112_P4_SEG_MID P4 13 | 20160112_P5_SEG_HIGH P5 14 | 20160112_P5_SEG_LOW_160121063813 P5 15 | 20160112_P5_SEG_MID P5 16 | 20160112_P6_SEG_HIGH P6 17 | 20160112_P6_SEG_LOW P6 18 | 20160112_P6_SEG_MID P6 19 | 20160112_P7_SEG_HIGH P7 20 | 20160112_P7_SEG_LOW P7 21 | 20160112_P7_SEG_MID P7 22 | 20160112_P8_SEG_HIGH P8 23 | 20160112_P8_SEG_LOW P8 24 | 20160112_P8_SEG_MID_160121160232 P8 25 | 20160112_P9_SEG_MID P9 26 | 20160112_P9_SEG_HIGH P9 27 | 20160112_P9_SEG_LOW_160121012404 P9 28 | 20160112_P10_SEG_HIGH P10 29 | 20160112_P10_SEG_LOW_160120200540 P10 30 | 20160112_P10_SEG_MID P10 31 | 20160112_P11_SEG_LOW_160203031257 P11 32 | 20160112_P11_SEG_MID_160203050927 P11 33 | 20160112_P11_SEG_HIGH_160203070611 P11 34 | 20160112_P11_SEG_HIGH REF 35 | 20160112_P11_SEG_LOW REF 36 | 20160112_P11_SEG_MID REF 37 | 20160112_P12_SEG_LOW P12 38 | 20160112_P12_SEG_MID P12 39 | 20160112_P12_SEG_HIGH P12 40 | 20160112_P13_SEG_HIGH P13 41 | 20160112_P13_SEG_LOW P13 42 | 20160112_P13_SEG_MID P13 43 | 20160112_P14_SEG_HIGH P14 44 | 20160112_P14_SEG_MID P14 45 | 20160112_P14_SEG_LOW_160120174525 P14 46 | 20160112_P15_SEG_LOW P15 47 | 20160112_P15_SEG_MID P15 48 | 20160112_P15_SEG_HIGH_160120220930 P15 
49 | 20160112_P16_SEG_LOW P16
50 | 20160112_P16_SEG_MID P16
51 | 20160112_P16_SEG_HIGH_160121181003 P16
52 | 20160112_P17_SEG_HIGH P17
53 | 20160112_P17_SEG_LOW P17
54 | 20160112_P17_SEG_MID P17
55 | 20160112_P18_SEG_LOW P18
56 | 20160112_P18_SEG_MID P18
57 | 20160112_P18_SEG_HIGH P18
58 | 20160112_P19_SEG_MID_160121112852 P19
59 | 20160112_P19_SEG_HIGH P19
60 | 20160112_P19_SEG_LOW P19
61 | 20160112_P20_SEG_MID P20
62 | 20160112_P20_SEG_LOW P20
63 | 20160112_P20_SEG_HIGH_160121032454 P20
--------------------------------------------------------------------------------
/example/iPRG.samples.lst:
--------------------------------------------------------------------------------
1 | JD_06232014_sample1-A S1
2 | JD_06232014_sample1_B S1
3 | JD_06232014_sample1_C S1
4 | JD_06232014_sample2_A S2
5 | JD_06232014_sample2_B S2
6 | JD_06232014_sample2_C S2
7 | JD_06232014_sample3_A S3
8 | JD_06232014_sample3_B S3
9 | JD_06232014_sample3_C S3
10 | JD_06232014_sample4-A S4
11 | JD_06232014_sample4_B S4
12 | JD_06232014_sample4_C S4
--------------------------------------------------------------------------------
/example/readme.md:
--------------------------------------------------------------------------------
1 | ## Diffacto: Examples
2 | ----
3 |
4 | To run these examples, clone this git repository and descend to the example directory.
5 | To install dependencies run
6 |     pip install pyteomics numpy pandas networkx scikit-learn scipy
7 |
8 | #### Print usage information
9 |     python ../run_diffacto.py -h
10 |
11 |
12 | ---
13 | #### Example-1:
14 |
15 |
16 |     python ../run_diffacto.py -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt -mc_out iPRG.denovo.protein.FDR -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
17 |
18 |
19 | #
20 |
21 | * input-1, peptide abundances: _iPRG.novo.pep.csv_
22 | * input-2, sample list: _iPRG.samples.lst_
23 | * output-1, protein quantification: _iPRG.denovo.protein.txt_
24 | * output-2, FDR estimation by MC tests: _iPRG.denovo.protein.FDR_
25 | * other parameters:
26 |     -min_samples 4 (peptides must be quantified in at least four runs)
27 |     -impute_threshold 0.9 (threshold for missing value imputation 90%)
28 |     -use_unique True (only use unique peptides for quantification)
29 |     -log2 False (input abundances are not in log scale)
30 |
31 |
32 | ---
33 | #### Example-2:
34 |
35 |
36 |     python ../run_diffacto.py -i HBY20Mix.peptides.csv -samples HBY20Mix.samples.lst -db UP000002311_559292.fasta -out HBY20Mix.protein.txt -min_samples 30 -impute_threshold 0.7 -log2 False -reference REF
37 |
38 |
39 | #
40 |
41 | * input-1, peptide abundances: _HBY20Mix.peptides.csv_
42 | * input-2, sample list: _HBY20Mix.samples.lst_
43 | * input-3, protein database: _UP000002311_559292.fasta_
44 | * output-1, protein quantification: _HBY20Mix.protein.txt_
45 | * other parameters:
46 |     -min_samples 30 (peptides must be quantified in at least 30 runs)
47 |     -impute_threshold 0.7 (threshold for missing value imputation 70%)
48 |     -log2 False (input abundances are not in log scale)
49 |     -reference REF (use the runs labeled 'REF' as the internal reference)
--------------------------------------------------------------------------------
/run_diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """Convenience wrapper for running diffacto directly from the source tree."""
6 |
7 |
8 | from diffacto.diffacto import main
9 |
10 |
11 | if __name__ == '__main__':
12 |     main()
13 |
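For readers who want to use the summarization step as a library instead of through the wrapper script above, a minimal sketch with hypothetical toy data (it assumes only that diffacto is installed, so that diffacto.diffacto is importable):

    import numpy as np
    from diffacto.diffacto import fast_farms

    # Toy data: 4 peptides x 6 samples of log2 abundances sharing one common
    # signal, centered per peptide as diffacto does before summarization.
    np.random.seed(0)
    signal = np.random.normal(size=6)
    pep_abd = signal + np.random.normal(scale=0.1, size=(4, 6))
    pep_abd = pep_abd - pep_abd.mean(axis=1, keepdims=True)

    # weight and mu correspond to the -farms_alpha and -farms_mu CLI options.
    weights, noise = fast_farms(pep_abd, weight=0.1, mu=0.1, max_iter=1000)
    print("peptide weights:", np.round(weights, 3))  # rescaled to [0, 1]
    print("estimated noise:", round(float(noise), 3))

In the full pipeline, main() additionally rescales abundances against the reference, drops peptides whose weight falls below -cutoff_weight, and passes the survivors to weighted_average to obtain per-group protein abundances.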
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | python-tag = py36 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """setup.py: setuptools control for diffacto.""" 2 | 3 | import re 4 | from setuptools import setup, find_packages 5 | 6 | version = re.search( 7 | '^__version__\s*=\s*"(.*)"', 8 | open('diffacto/diffacto.py').read(), 9 | re.M 10 | ).group(1) 11 | 12 | from codecs import open 13 | from os import path 14 | 15 | here = path.abspath(path.dirname(__file__)) 16 | 17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | 21 | setup( 22 | # This is the name of your project. The first time you publish this 23 | # package, this name will be registered for you. It will determine how 24 | # users can install this project, e.g.: 25 | # 26 | # $ pip install diffacto 27 | # 28 | 29 | name='diffacto', # Required 30 | version=version, # Required 31 | packages = ["diffacto"], 32 | entry_points = { 33 | "console_scripts": ['diffacto = diffacto.diffacto:main'] 34 | }, 35 | 36 | description='A protein summarization method for shotgun proteomics experiments', # Required 37 | long_description=long_description, # Optional 38 | url='https://github.com/statisticalbiotechnology/diffacto', # Optional 39 | author='Bo Zhang, Lukas Käll, KTH', # Optional 40 | author_email='lukas.kall@scilifelab.se', # Optional 41 | maintainer='Lukas Käll, KTH', # Optional 42 | maintainer_email='lukas.kall@scilifelab.se', # Optional 43 | license='Apache', 44 | # Classifiers help users find your project by categorizing it. 45 | # 46 | # For a list of valid classifiers, see 47 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 48 | classifiers=[ # Optional 49 | # How mature is this project? Common values are 50 | # 3 - Alpha 51 | # 4 - Beta 52 | # 5 - Production/Stable 53 | 'Development Status :: 3 - Alpha', 54 | 55 | # Indicate who your project is intended for 56 | 'Intended Audience :: Developers', 57 | 'Topic :: Software Development :: Build Tools', 58 | 59 | # Pick your license as you wish 60 | 'License :: OSI Approved :: Apache Software License', 61 | 62 | # Specify the Python versions you support here. In particular, ensure 63 | # that you indicate whether you support Python 2, Python 3 or both. 64 | 'Programming Language :: Python :: 3', 65 | 'Programming Language :: Python :: 3.4', 66 | 'Programming Language :: Python :: 3.5', 67 | 'Programming Language :: Python :: 3.6', 68 | ], 69 | 70 | # This field adds keywords for your project which will appear on the 71 | # project page. What does your project relate to? 72 | # 73 | # Note that this is a string of words separated by whitespace, not a list. 74 | # keywords='sample setuptools development', # Optional 75 | 76 | # You can just specify package directories manually here if your project is 77 | # simple. Or you can use find_packages(). 
78 | # 79 | # Alternatively, if you just want to distribute a single Python file, use 80 | # the `py_modules` argument instead as follows, which will expect a file 81 | # called `my_module.py` to exist: 82 | # 83 | # py_modules=["my_module"], 84 | # 85 | # packages=find_packages(exclude=['contrib', 'docs', 'tests']), # Required 86 | 87 | # This field lists other packages that your project depends on to run. 88 | # Any package you put here will be installed by pip when your project is 89 | # installed, so they must be valid existing projects. 90 | # 91 | # For an analysis of "install_requires" vs pip's requirements files see: 92 | # https://packaging.python.org/en/latest/requirements.html 93 | install_requires=[ 94 | 'numpy>=1.10', 95 | 'scipy>=0.17', 96 | 'pandas>=0.18', 97 | 'networkx>=1.10', 98 | 'scikit-learn>=0.17', 99 | 'pyteomics>=3.3', 100 | 'Cython>=0.26'], 101 | 102 | # List additional groups of dependencies here (e.g. development 103 | # dependencies). Users will be able to install these using the "extras" 104 | # syntax, for example: 105 | # 106 | # $ pip install sampleproject[dev] 107 | # 108 | # Similar to `install_requires` above, these must be valid existing 109 | # projects. 110 | #extras_require={ # Optional 111 | # 'dev': ['check-manifest'], 112 | # 'test': ['coverage'], 113 | #}, 114 | 115 | # If there are data files included in your packages that need to be 116 | # installed, specify them here. 117 | # 118 | # If using Python 2.6 or earlier, then these have to be included in 119 | # MANIFEST.in as well. 120 | #package_data={ # Optional 121 | # 'sample': ['package_data.dat'], 122 | #}, 123 | 124 | # Although 'package_data' is the preferred approach, in some case you may 125 | # need to place data files outside of your packages. See: 126 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files 127 | # 128 | # In this case, 'data_file' will be installed into '/my_data' 129 | #data_files=[('my_data', ['data/data_file'])], # Optional 130 | 131 | ) 132 | --------------------------------------------------------------------------------
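As a companion to the -i and -samples format descriptions in the README, a minimal sketch of generating inputs that diffacto can read (the file and protein names here are hypothetical):

    import numpy as np
    import pandas as pd

    # Peptide table: unique peptide sequences in the first column, protein
    # ID(s) in the second column (needed only when no -db FASTA is given),
    # then one column per run. Missing values are empty cells, not zeros.
    table = pd.DataFrame(
        {
            "protein": ["PROT1", "PROT1", "PROT2"],
            "run1": [1200.0, 880.0, np.nan],
            "run2": [1100.0, 910.0, 430.0],
            "run3": [2400.0, np.nan, 450.0],
        },
        index=["AAGLPTK", "LSDEEK", "VVGLSTLPELYEK"],
    )
    table.to_csv("toy.peptides.csv")  # NaN is written as an empty cell

    # Sample list: one run and its sample group per line, tab-separated.
    with open("toy.samples.lst", "w") as fh:
        for run, group in [("run1", "S1"), ("run2", "S1"), ("run3", "S2")]:
            fh.write("{}\t{}\n".format(run, group))

Afterwards, something like `diffacto -i toy.peptides.csv -samples toy.samples.lst -out toy.protein.txt` runs the analysis; note that every peptide should be quantified in at least two runs, since singly observed peptides break the factor analysis (see the corresponding check in main()).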