├── .gitignore
├── .test.sh
├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── diffacto
│   ├── __init__.py
│   ├── __main__.py
│   └── diffacto.py
├── environment.yml
├── example
│   ├── HBY20Mix.peptides.csv
│   ├── HBY20Mix.samples.lst
│   ├── UP000002311_559292.fasta
│   ├── iPRG.novo.pep.csv
│   ├── iPRG.samples.lst
│   └── readme.md
├── run_diffacto.py
├── setup.cfg
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | cd example
3 | diffacto -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt \
4 | -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
5 | protfile_size=$(wc -l < iPRG.denovo.protein.txt)
6 | #fdrfile_size=$(wc -l < iPRG.denovo.protein.FDR)
7 | #if [ "$protfile_size" -ge 200 ] && [ "$fdrfile_size" -ge 200 ]; then
8 | if [ "$protfile_size" -ge 200 ]; then
9 | # All OK
10 | exit 0
11 | else
12 | # Something is wrong
13 | exit 1
14 | fi
15 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | # - "3.5"
4 | - "3.6"
5 | # command to install dependencies
6 | install:
7 | - python setup.py install
8 | # command to run tests
9 | script: ./.test.sh
10 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2017 Bo Zhang and Lukas Käll
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE.txt
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Diffacto: Differential Factor Analysis for Comparative Shotgun Proteomics
2 | ==========================================================================
3 |
4 | Requirements
5 | --------------
6 |
7 | Anaconda_ Python 3.5+
8 |
9 | Packages needed:
10 |
11 | - numpy 1.10+
12 | - scipy 0.17+
13 | - pandas 0.18+
14 | - networkx 1.10+
15 | - scikit-learn 0.17+
16 | - pyteomics_ 3.3+
17 |
18 | .. _Anaconda: https://www.anaconda.com/
19 | .. _pyteomics: https://pyteomics.readthedocs.io/
20 |
21 | Installation via ``pip``
22 | *************************
23 |
24 | ::
25 |
26 | pip install diffacto
27 |
28 |
29 | Usage
30 | -----
31 |
32 | ::
33 |
34 |     diffacto [-h] -i I [-db [DB]] [-samples [SAMPLES]] [-log2 LOG2]
35 | [-normalize {average,median,GMM,None}]
36 | [-farms_mu FARMS_MU] [-farms_alpha FARMS_ALPHA]
37 | [-reference REFERENCE] [-min_samples MIN_SAMPLES]
38 | [-use_unique USE_UNIQUE]
39 | [-impute_threshold IMPUTE_THRESHOLD]
40 | [-cutoff_weight CUTOFF_WEIGHT] [-fast FAST] [-out OUT]
41 | [-mc_out MC_OUT]
42 | optional arguments:
43 | -h, --help show this help message and exit
44 |   -i I                  Peptide abundances in CSV format. The first row
45 | should contain names for all samples. The first column
46 | should contain unique peptide sequences. Missing
47 | values should be empty instead of zeros. (default:
48 | None)
49 | -db [DB] Protein database in FASTA format. If None, the peptide
50 | file must have protein ID(s) in the second column.
51 | (default: None)
52 | -samples [SAMPLES] File of the sample list. One run and its sample group
53 |                         per line, separated by tab. If None, sample names
54 |                         are read from the peptide file headings and each
55 |                         run is treated as its own group. (default: None)
56 | -log2 LOG2 Input abundances are in log scale (True) or linear
57 | scale (False) (default: False)
58 | -normalize {average,median,GMM,None}
59 | Method for sample-wise normalization. (default: None)
60 | -farms_mu FARMS_MU Hyperparameter mu (default: 0.1)
61 | -farms_alpha FARMS_ALPHA
62 | Hyperparameter weight of prior probability (default:
63 | 0.1)
64 | -reference REFERENCE Names of reference sample groups (separated by
65 | semicolon) (default: average)
66 | -min_samples MIN_SAMPLES
67 |                         Minimum number of samples a peptide must be
68 |                         quantified in (default: 1)
69 | -use_unique USE_UNIQUE
70 | Use unique peptides only (default: False)
71 | -impute_threshold IMPUTE_THRESHOLD
72 | Minimum fraction of missing values in the group.
73 | Impute missing values if missing fraction is larger
74 | than the threshold. (default: 0.99)
75 | -cutoff_weight CUTOFF_WEIGHT
76 | Peptides weighted lower than the cutoff will be
77 | excluded (default: 0.5)
78 | -fast FAST Allow early termination in EM calculation when noise
79 | is sufficiently small. (default: False)
80 | -out OUT Path to output file (writing in TSV format).
81 | -mc_out MC_OUT Path to MCFDR output (writing in TSV format).
82 | (default: None)
83 |
84 |
85 | Example
86 | -------
87 |
88 | Examples are given in the example_ directory.
89 |
90 | .. _example: ./example
91 |
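A minimal sketch of the ``-samples`` file format (hypothetical run names):
one tab-separated run/group pair per line, with run names matching the
column headings of the peptide CSV::

    run1_repA	S1
    run1_repB	S1
    run2_repA	S2
    run2_repB	S2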
--------------------------------------------------------------------------------
/diffacto/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/statisticalbiotechnology/diffacto/6389c9410f4c01aed6f33fd787a9fce0cf304a9e/diffacto/__init__.py
--------------------------------------------------------------------------------
/diffacto/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """diffacto.__main__: executed when bootstrap directory is called as script."""
5 |
6 |
7 | from .diffacto import main
8 | main()
9 |
--------------------------------------------------------------------------------
/diffacto/diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: UTF-8 -*-
3 | from __future__ import division, print_function
4 |
5 | """diffacto.diffacto: provides entry point main()."""
6 |
7 | __version__ = "1.0.7"
8 |
9 | import csv
10 | import re
11 | import warnings
12 | from collections import defaultdict
13 | from multiprocessing import Pool
14 |
15 | from scipy import optimize, stats
16 | import networkx as nx
17 | import numpy as np
18 | import pandas
19 | from numpy import array, isfinite, nanmean, nansum
20 | from pyteomics import fasta
21 |
22 | # from numba import jit # # Enable just-in-time compiler for speeding up
23 | # @jit
24 | def fast_farms(
25 |     probes: np.ndarray,
26 | weight: float = 0.5,
27 | mu: float = 0,
28 | max_iter: int = 1000,
29 | force_iter: bool = False,
30 | min_noise: float = 1e-4,
31 | fill_nan: float = 0.0,
32 | ):
33 | """Bayesian Factor Analysis for Proteomics Summarization
34 | A python translation of function "generateExprVal.method.farms" from
35 | Bioconductor FARMS.
36 | [http://www.bioconductor.org/packages/release/bioc/html/farms.html]
37 | [http://www.bioinf.jku.at/publications/papers/farms/supplementary.ps]
38 |
39 | Reference:
40 | Hochreiter S, Clevert D and Obermayer K (2006). A new summarization
41 | method for affymetrix probe level data. Bioinformatics, 22(8),
42 | http://bioinformatics.oxfordjournals.org/cgi/content/abstract/22/8/943.
43 |
44 | Inputs:
45 | probes: Peptide abundance array (N peptides, M samples) in log scale.
46 | weight: Hyperparameter (backscale factor) value in the range of [0,1]
47 | which determines the influence of the prior.
48 | mu: Hyperparameter value which allows to quantify different aspects
49 | of potential prior knowledge. A value near zero assumes that
50 | most genes do not contain a signal, and introduces a bias for
51 | loading matrix elements near zero. """
52 |
53 | readouts = np.array(probes)
54 | if fill_nan != 0:
55 | readouts[np.isnan(readouts)] = fill_nan
56 |
57 | # normalize and transform X
58 | X = np.nan_to_num(readouts).T
59 | X = X - np.nanmean(X, axis=0)
60 | xsd = np.nanstd(X, axis=0)
61 | xsd[xsd < min_noise] = 1.0
62 | X /= xsd
63 | X[~isfinite(X)] = 0
64 |
65 | n_samples, n_features = X.shape
66 | C = np.cov(X.T, ddof=0)
67 |
68 | # positive definite
69 | C = 0.5 * (C + C.T)
70 | C[np.where(C < 0)] = 0
71 |
72 | # robustness
73 | U, s, V = np.linalg.svd(C)
74 | s[s < min_noise] = min_noise
75 | C = U.dot(np.diag(s)).dot(V)
76 |
77 |     # initialization
78 | λ = np.sqrt(np.diag(C) * 0.75)
79 | ψ = np.diag(C) - λ ** 2
80 | old_psi = ψ
81 | old_lambda = λ
82 | alpha = weight * n_features
83 | E = 1.0
84 | min_noise_square = min_noise ** 2
85 | C_diag = np.diag(C)
86 |
87 | for i in range(max_iter):
88 | # E step
89 | φ = λ / ψ
90 | a = 1 + np.matmul(λ.reshape(1, -1), φ.reshape(-1, 1))
91 | η = φ / a
92 | ζ = C.dot(η.T)
93 | E = 1 - η.dot(λ) + η.dot(ζ)
94 |
95 | # M step
96 | λ = ζ.T / (E + ψ * alpha)
97 | λ = np.asarray(λ)[0]
98 | ψ = C_diag - np.asarray(ζ)[0] * λ + ψ * alpha * λ * (mu - λ)
99 | ψ = np.maximum(ψ, min_noise_square)
100 | if (
101 | ψ[-1] == old_psi[-1]
102 | and ψ[0] == old_psi[0]
103 | and np.array_equal(ψ, old_psi)
104 | and np.array_equal(λ, old_lambda)
105 | ):
106 | break
107 |
108 | if not force_iter:
109 | if abs(ψ - old_psi).max() / old_psi.max() < min_noise / 10:
110 | break
111 |
112 | old_psi = ψ
113 | old_lambda = λ
114 |
115 | loading = np.sqrt(E[0, 0]) * λ
116 | φ = loading / ψ
117 | weights = loading / loading.max() # rescale loadings to the range of [0,1]
118 | noise = 1 / (1 + np.matmul(loading.reshape(1, -1), φ.reshape(-1, 1)))
119 | noise = noise[0, 0]
120 | return weights, noise
121 |
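# Usage sketch: for a hypothetical log2-scale abundance matrix Y of shape
# (n_peptides, n_samples), with NaN marking missing readouts:
#
#     weights, noise = fast_farms(Y, weight=0.1, mu=0.1)
#     keep = weights > 0.5   # peptides above the default -cutoff_weight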
122 |
123 | # @jit(nogil=True)
124 | def fast_gmean_nomissing(weights, pep_abd, group_ix):
125 | """
126 | Calculate geometric means based on non-missing peptide readouts.
127 | """
128 | abd_w = pep_abd * weights[..., None]
129 |     one_w = abd_w / abd_w * weights[..., None]  # weight where a readout is present, NaN where missing
130 | a_sums = np.nansum(abd_w, axis=0)
131 | w_sums = np.nansum(one_w, axis=0)
132 | expr = a_sums[group_ix].sum(axis=1) / w_sums[group_ix].sum(axis=1)
133 | return expr
134 |
135 |
136 | # @jit(nogil=True)
137 | def sum_squares(pep_abd, group_ix, estimates):
138 | """
139 | Calculate sum of squared residuals
140 | """
141 | global nGroups
142 | residual = 0.0
143 | for i in range(nGroups):
144 | res = pep_abd[:, group_ix[i]] - estimates[i]
145 | residual += np.nansum(res * res)
146 | return residual
147 |
148 |
149 | # @jit(nogil=True)
150 | def f_ANOVA(pep_abd, group_ix, estimates, null_ave, dof_loss=0):
151 | """
152 | Perform ANOVA
153 | Inputs:
154 | pep_abd: Peptide abundance matrix
155 | group_ix: Index of sample groups
156 | estimates: Estimated abundances of sample groups
157 | null_ave: Global average
158 | dof_loss: Loss of dof due to averaging
159 | Return:
160 | f: Value of F-statistic
161 | dof1: Degree of freedom of model 1
162 | dof2: Degree of freedom of model 2
163 | """
164 | global nGroups
165 | ss_total = sum_squares(pep_abd, group_ix, null_ave)
166 | ss_resid = sum_squares(pep_abd, group_ix, estimates)
167 | dof1 = nGroups - 1
168 | dof2 = isfinite(pep_abd).sum() - nGroups - dof_loss
169 | if dof2 <= 0:
170 | return np.nan, dof1, dof2
171 | f = ((ss_total - ss_resid) / dof1) / (ss_resid / dof2)
172 | return f, dof1, dof2
173 |
174 |
175 | def mv_impute(pep_abd, group_ix, least_missing=0.99, impute_as=0.001):
176 | """ Impute missing values when having a large proportion in a sample group.
177 | Inputs:
178 | pep_abd: n peptides, m samples, in linear scale
179 | group_ix: grouping index for each of the m samples
180 | least_missing: set the minimum threshold of missing rate to trigger the imputation (Default: 99%).
181 | impute_as: set missing values in the sample to this value
182 | Return:
183 | numpy array after replacing missing values with imputed values
184 | """
185 | aT = np.array(pep_abd).T
186 | for ix in group_ix:
187 | if np.isnan(aT[ix]).sum() > least_missing * len(aT[ix].flatten()):
188 | val = aT[ix]
189 | val[np.where(np.isnan(val))] = impute_as
190 | aT[ix] = val
191 | return aT.T
192 |
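# Usage sketch: with group_ix = [[0, 1], [2, 3]] and least_missing=0.9, any
# group whose readouts are more than 90% missing gets its NaNs set to impute_as.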
193 |
194 | # @jit(nogil=True)
195 | def weighted_average(weights, pep_abd, group_ix):
196 | """
197 | Calculate weighted geometric means for sample groups
198 | Inputs:
199 | weights: Weights of peptides after filtering by loading threshold
200 | pep_abd: Peptide abundances after filtering by loading threshold
201 | group_ix: Array indexes of sample groups
202 | Return:
203 | expr: Estimated expression levels
204 | """
205 | global nGroups
206 | abd_w = pep_abd * weights[..., None]
207 |     count_peptides = np.sum(~np.isnan(pep_abd), axis=0)
208 |     one_w = abd_w / abd_w * weights[..., None]  # weight where a readout is present, NaN where missing
209 | a_sums = np.nansum(abd_w, axis=0)
210 | w_sums = np.nansum(one_w, axis=0)
211 | expr = np.empty(nGroups)
212 | for i in range(expr.shape[0]):
213 |         if count_peptides[group_ix[i]].sum() > 0:
214 | expr[i] = a_sums[group_ix[i]].sum() / w_sums[group_ix[i]].sum()
215 | else:
216 | expr[i] = np.nan
217 | return expr
218 |
219 | def _init_pool(the_dict):
220 | global prot_dict
221 | prot_dict = the_dict
222 |
223 | def _load_fasta(db, id_regex):
224 | prot_dict = dict()
225 | for header, seq in fasta.read(db):
226 | seq = seq.replace("I", "L").upper() # convert DB sequence I -> L
227 | prot_id = header.split()[0]
228 | if id_regex is not None:
229 | find_id = re.findall(id_regex, header)
230 | if len(find_id) > 0:
231 | prot_id = find_id[0]
232 | prot_dict[prot_id] = seq
233 |
234 | return prot_dict
235 |
236 |
237 | def _map_seq(p):
238 | global prot_dict
239 | pairs = []
240 | for prot_id, seq in prot_dict.items():
241 | if p in seq:
242 | pairs.append([p, prot_id])
243 | return pairs
244 |
245 | def peptide_db_graph(peps, db, id_regex=None):
246 | """ search a set of peptides against a FASTA database """
247 | g = nx.Graph()
248 | protdict = _load_fasta(db, id_regex)
249 |
250 | with Pool(initializer = _init_pool, initargs=(protdict,)) as pool:
251 | mapped_ppps = pool.map(_map_seq, peps)
252 |
253 | for ppps in mapped_ppps:
254 | if len(ppps):
255 | g.add_edges_from(ppps)
256 | return g
257 |
258 |
259 | def parsimony_grouping(g, peps):
260 | """ Group peptides to proteins using the rule of parsimony
261 | Inputs:
262 | g: an undirected graph with peptide <-> protein as edges
263 | peps: the set of peptide sequences, nodes not listed in the peptide set are protein IDs.
264 | Return:
265 | prot_groups: a dictionary with mappings between proteins (keys) to peptides (values)
266 | """
267 | not_peps = set(g.nodes()) - set(peps)
268 | prot_groups = dict()
269 | for cc in (g.subgraph(c).copy() for c in nx.connected_components(g)):
270 | in_group_peptides = set(cc.nodes()) - not_peps
271 | in_group_proteins = not_peps.intersection(cc.nodes())
272 |
273 | if len(in_group_proteins) == 1:
274 | prot_groups[in_group_proteins.pop()] = in_group_peptides
275 | elif len(in_group_proteins) > 1:
276 | reported = set()
277 | while len(in_group_proteins - reported) > 0:
278 | candidate_proteins = sorted(
279 | in_group_proteins - reported,
280 | key=lambda p: (len(set(cc[p].keys()) - reported), p),
281 | reverse=True,
282 | )
283 | p = candidate_proteins[0]
284 | current_peps = set(cc[p].keys())
285 | plabel = [p]
286 | for i in range(1, len(candidate_proteins)):
287 | _p = candidate_proteins[i]
288 | _peps = set(cc[_p].keys())
289 | if _peps == current_peps:
290 | plabel.append(_p)
291 | if len(_peps - current_peps) == 0:
292 | reported.add(_p)
293 |
294 | plabel = ";".join(sorted(plabel))
295 | if len(current_peps - reported) > 0:
296 | prot_groups[plabel] = current_peps
297 | reported = reported.union(current_peps)
298 | reported.add(p)
299 | return prot_groups
300 |
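# Worked sketch: for edges PEP1-A, PEP1-B, PEP2-A with peps = {"PEP1", "PEP2"},
# protein A explains both peptides while B adds nothing new, so the function
# returns {"A": {"PEP1", "PEP2"}} and B is absorbed without its own group.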
301 |
302 | def protein_grouping(df, proteinDb):
303 | """
304 | Grouping peptide sequences in the given dataframe (df)
305 | by mapping to a protein database (FASTA);
306 | or by the first column of dataframe when the database is absent
307 | """
308 | peptides = sorted(set(df.index))
309 | if not proteinDb:
310 | g = nx.Graph()
311 | for i, x in df.iterrows():
312 | for prot in x.values.astype("str")[0].split(";"):
313 | if len(prot) > 0:
314 | g.add_edge(i, prot)
315 | else:
316 | g = peptide_db_graph(peptides, proteinDb)
317 | pg = parsimony_grouping(g, peptides)
318 | return pg
319 |
320 |
321 | def zero_center_normalize(df, samples, logInput=False, method="median"):
322 | """
323 |     Transform the input peptide abundance table to log2 scale and center it at zero.
324 | Inputs:
325 | df : dataframe of peptide abundances
326 | samples: column names of selected samples
327 | logInput: input abundances are already in log scale
328 | method: method for estimating zero point
329 | Return:
330 | df: the dataframe of peptide abundances after normalization
331 | """
332 | assert method in (
333 | "median",
334 | "average",
335 | "GMM",
336 | ), "Zero centering method has to be among median, average or GMM!"
337 |
338 | if not logInput:
339 | # convert abundances to log2 scale
340 | df[samples] = df[samples].apply(np.log2)
341 | if method == "average":
342 | norm_scale = np.nanmean(df[samples], axis=0)
343 | elif method == "median":
344 | norm_scale = np.nanmedian(df[samples], axis=0)
345 | elif method == "GMM":
346 | """ two-component Gaussian mixture model """
347 | from sklearn.mixture import GaussianMixture as GMM
348 |
349 | gmm = GMM(2)
350 |
351 | norm_scale = []
352 | for sp in samples:
353 | v = df[sp].values
354 | v = v[np.logical_not(np.isnan(v))]
355 | v = v[np.logical_not(np.isinf(v))]
356 | try:
357 |                 gmm.fit(v.reshape(-1, 1))
358 | vmean = gmm.means_[np.argmin(gmm.covariances_)][0]
359 | norm_scale.append(vmean)
360 |             except Exception:  # fall back to the mean if GMM fitting fails
361 | norm_scale.append(np.nanmean(v))
362 | norm_scale = np.array(norm_scale)
363 |
364 | print(
365 | "Caution!!",
366 | "Two-component Gaussian mixture model is used to center peptide abundances!",
367 | "Centring factors are:",
368 | *[
369 | "Sample:{}\tGMM:{:.3f}\t Median:{:.3f}".format(s, g, m)
370 | for s, g, m in zip(
371 | samples, norm_scale, np.nanmedian(df[samples], axis=0)
372 | )
373 | ],
374 | "Check if GMM estimated values deviate greatly from median values.",
375 | "If in doubt, use other metrics (e.g. median) to centre the abundances!!\n",
376 | sep="\n"
377 | )
378 | df[samples] = df[samples] - norm_scale
379 | return df
380 |
381 |
382 | def pqpq(peptide_abundances, metric="correlation", method="complete", t=0.4):
383 | """ The essential PQPQ2 process from @yafeng
384 | [https://github.com/yafeng/pqpq_python/blob/master/pqpq2.py]
385 | """
386 | from scipy import cluster
387 |
388 | d = cluster.hierarchy.distance.pdist(peptide_abundances, metric)
389 | if metric == "correlation":
390 | D = np.clip(d, 0, 2)
391 | else:
392 | D = d
393 | L = cluster.hierarchy.linkage(D, method, metric)
394 | ind = cluster.hierarchy.fcluster(L, t, "distance")
395 | return ind
396 |
397 |
398 | # =====================
399 | # Monte Carlo permutation tests
400 | def monte_carlo_permutation(samp_index, n):
401 | """
402 | Generating a batch of random permutations of sample indexes
403 | Inputs:
404 | samp_index: array indexes of sample groups
405 | n: size of the batch of permutations
406 | """
407 | flat = np.hstack(samp_index)
408 |     # cumulative boundaries of the concatenated groups: [0, n1, n1+n2, ...]
409 |     ix = list(np.cumsum([0] + [len(i) for i in samp_index]))
410 | for i in range(n):
411 | permute = np.random.permutation(flat)
412 | new_ix = [permute[ix[i - 1] : ix[i]] for i in range(1, len(ix))]
413 | yield np.array(new_ix)
414 |
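# Usage sketch: each yielded item is one random regrouping with the same
# group sizes as samp_index, e.g.
#
#     for ix in monte_carlo_permutation(np.array([[0, 1], [2, 3]]), n=3):
#         ...  # ix is a 2x2 array of shuffled sample indexes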
415 |
416 | def calc_q(pvals):
417 | """
418 | Calculate q-values based on a list of p-values, with a conservative estimate
419 | of the proportion of true null hypotheses (pi0_hat) based on the given p-values.
420 | """
421 | pv = np.array(pvals)
422 | pv = pv[isfinite(pv)]
423 | pi0_hat = min(1, np.sum(pv) * 2 / len(pv))
424 | ranking = pv.argsort().argsort() + 1
425 | qlist = pv * pi0_hat * len(pv) / ranking
426 | for i, rank in enumerate(ranking):
427 | qlist[i] = min(qlist[ranking >= rank])
428 | qlist = list(qlist)
429 | qvals = np.ones_like(pvals).tolist()
430 | for i, e in enumerate(pvals):
431 | if isfinite(e):
432 | qvals[i] = qlist.pop(0)
433 | return np.array(qvals)
434 |
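# Usage sketch: finite p-values get Benjamini-Hochberg-style step-up values
# scaled by pi0_hat; NaN entries keep a q-value of 1, e.g.
#
#     calc_q([0.001, 0.5, np.nan, 0.04])  # -> array of 4 aligned q-values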
435 |
436 | def perform_mcfdr(
437 | diffacto_res,
438 | sampIx,
439 | max_mc=1e5,
440 | batch_size=100,
441 | terminate_t=50,
442 | target_fdr=0.05,
443 | sn_threshold=-20,
444 | ):
445 | """
446 | Sequential Monte Carlo permutation test
447 | Inputs:
448 | diffacto_res: a dictionary of Diffacto statistics for each protein
449 | sampIx: array indexes of sample groups
450 |         max_mc: maximum number of random permutations
451 | batch_size: number of permutations for every iteration
452 |         terminate_t: number of permutations with a better statistic required to stop the simulation for one protein
453 | target_fdr: target level of FDR to stop simulation for the remaining proteins.
454 | sn_threshold: signal-to-noise threshold for exclusion of non-informative proteins.
455 | """
456 | proteins = sorted(diffacto_res.keys())
457 | preTermination = set()
458 | for batch in range(1, int(max_mc / batch_size) + 2):
459 | mc_pvals = []
460 | for prot in proteins:
461 | grand_ave, weight, abd_qc, sn, f, T, N = diffacto_res[prot]
462 | if sn <= sn_threshold:
463 | mc_pvals.append(np.nan)
464 | preTermination.add(prot)
465 | continue
466 | if prot in preTermination:
467 | mc_pvals.append(T / N)
468 | continue
469 | for ix in monte_carlo_permutation(sampIx, batch_size):
470 | N += 1
471 | try:
472 | yhat = weighted_average(weight, abd_qc, ix)
473 | f_mc, _, _ = f_ANOVA(abd_qc, ix, yhat, grand_ave)
474 |                 except Exception:  # on failure, reuse the observed F (counts as a hit)
475 | f_mc = f
476 | if f_mc >= f:
477 | T += 1
478 | diffacto_res[prot][-1] = N # 1 + Total MC simulations performed
479 | diffacto_res[prot][-2] = T # 1 + MC simulations with better stats
480 | mc_pvals.append(T / N)
481 | if T >= terminate_t:
482 | preTermination.add(prot)
483 |
484 | mc_fdr = calc_q(mc_pvals)
485 | curr_prot = [proteins.index(p) for p in proteins if p not in preTermination]
486 |
487 | if (
488 | len(curr_prot) == 0
489 | or max(mc_fdr[curr_prot]) < target_fdr
490 | or batch * batch_size >= max_mc
491 | ):
492 | print("Monte Carlo permutation test finished.")
493 | return zip(proteins, mc_pvals, mc_fdr)
494 | else:
495 | print(
496 | "%d times simulation, %d proteins remaining (FDR %.3f)"
497 | % (batch * batch_size, len(curr_prot), max(mc_fdr[curr_prot]))
498 | )
499 |
500 |
501 | # =================================================
502 | # Main
503 | # =================================================
504 | def main():
505 | import argparse
506 | import sys
507 |
508 | DEBUG = False
509 | SUMMARIZE_EACH_RUN = False
510 | TOPN = 3
511 | T_PQPQ = 0.4
512 | EXAMPLE = "HUMAN"
513 |
514 | MC_SIMULATION = True
515 | MC_MAX_N = 200000
516 | MC_BATCH_SIZE = 100
517 | MC_MAX_HIT = MC_MAX_N / 1000
518 |
519 | apars = argparse.ArgumentParser(
520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
521 | )
522 |
523 | apars.add_argument(
524 | "-i",
525 | required=True,
526 | nargs=1,
527 |         help="""Peptide abundances in CSV format.
528 | The first row should contain names for all samples.
529 | The first column should contain unique peptide sequences.
530 | Missing values should be empty instead of zeros.
531 | """,
532 | )
533 | # The first column contains unique peptide sequences
534 | # Missing values should be empty instead of zeros
535 |
536 | apars.add_argument(
537 | "-db",
538 | nargs="?",
539 | help="""Protein database in FASTA format.
540 | If None, the peptide file must have protein ID(s) in the second column.
541 | """,
542 | )
543 |
544 | apars.add_argument(
545 | "-samples",
546 | nargs="?",
547 | help="""File of the sample list.
548 | One run and its sample group per line, separated by tab.
549 |         If None, sample names are read from the peptide file headings
550 |         and each run is treated as its own group.
551 | """,
552 | )
553 |
554 | apars.add_argument(
555 | "-log2",
556 | default="False",
557 | help="Input abundances are in log scale (True) or linear scale (False)",
558 | )
559 |
560 | apars.add_argument(
561 | "-normalize",
562 | choices=["average", "median", "GMM", "None"],
563 | default="None",
564 | help="Method for sample-wise normalization.",
565 | )
566 | # Normalize input abundances (per sample) to zero-centered in log-scale
567 | # Valid methods include: 'average', 'median' or 'GMM' (two-component
568 | # Gaussian mixture model). If None (default), do not normalize.
569 |
570 | apars.add_argument("-farms_mu", type=float, default=0.1, help="Hyperparameter mu")
571 | # Hyperparameter mu of the FARMS algorithm: prior knowledge of the
572 | # expected loading.
573 |
574 | apars.add_argument(
575 | "-farms_alpha",
576 | type=float,
577 | default=0.1,
578 | help="Hyperparameter weight of prior probability",
579 | )
580 | # Hyperparameter weight of the FARMS algorithm: weight of prior
581 | # probability in EM calculation.
582 |
583 | apars.add_argument(
584 | "-reference",
585 | default="average",
586 | help="Names of reference sample groups (separated by semicolon)",
587 | )
588 | # If average (default) calculate average abundance as the reference.
589 | # Otherwise, keep peptide abundance values as is.
590 |
591 | apars.add_argument(
592 | "-min_samples",
593 | type=int,
594 | default=1,
595 |         help="Minimum number of samples a peptide must be quantified in",
596 | )
597 | # Peptides quantified in less than the minimum number will be discarded
598 |
599 | apars.add_argument("-use_unique", default="False", help="Use unique peptides only")
600 |
601 | apars.add_argument(
602 | "-impute_threshold",
603 | type=float,
604 | default=0.99,
605 | help=(
606 | "Minimum fraction of missing values in the group. "
607 | "Impute missing values if missing fraction is larger than the threshold. "
608 | ),
609 | )
610 |
611 | apars.add_argument(
612 | "-cutoff_weight",
613 | type=float,
614 | default=0.5,
615 | help="Peptides weighted lower than the cutoff will be excluded",
616 | )
617 |
618 | apars.add_argument(
619 | "-fast",
620 | default="False",
621 | help="Allow early termination in EM calculation when noise is sufficiently small.",
622 | )
623 |
624 | apars.add_argument(
625 | "-out",
626 | type=argparse.FileType("w"),
627 | default=sys.stdout,
628 | help="Path to output file (writing in TSV format).",
629 | )
630 |
631 | apars.add_argument(
632 | "-mc_out", default=None, help="Path to MCFDR output (writing in TSV format)."
633 | )
634 |
635 | apars.add_argument('-loadings_out', default=None,
636 | help='File for peptide loadings (writing in TSV format).')
637 | # ------------------------------------------------
638 | args = apars.parse_args()
639 |
640 | def boolparam(p):
641 | """ convert a string parameter to boolean value"""
642 | if str(p).lower() in ("yes", "true", "t", "y", "1"):
643 | return True
644 | else:
645 | return False
646 |
647 | args.log2 = boolparam(args.log2)
648 | args.fast = boolparam(args.fast)
649 | args.use_unique = boolparam(args.use_unique)
650 | print(args)
651 | diffacto_res = dict()
652 | df = pandas.read_csv(args.i[0], index_col=0)
653 | df.index = [i.upper().replace("I", "L") for i in df.index]
654 | print("Abundance matrix loaded: %d peptides" % len(df.index))
655 |
656 | if not args.samples:
657 | # read sample names from header
658 | samples = df.columns.tolist()
659 | if args.db is None:
660 | samples.pop(0)
661 | groups = samples
662 | else:
663 | # read sample labels
664 | samples, groups = ([], [])
665 | with open(args.samples) as fh:
666 | for line in fh.readlines():
667 | try:
668 | _s, _g = line.rstrip().split("\t")
669 | samples.append(_s)
670 | groups.append(_g)
671 | except ValueError:
672 | pass
673 |
674 | # per sample normalization of peptide abundances
675 | logInput = args.log2
676 | if not args.normalize == "None":
677 | df = zero_center_normalize(
678 | df, samples, logInput=logInput, method=args.normalize
679 | )
680 | args.log2 = True
681 |
682 | # select reference runs if specified
683 | ref_samples = []
684 | if args.reference:
685 | for r in args.reference.split(";"):
686 | for i in range(len(groups)):
687 | if groups[i] == r:
688 | ref_samples.append(i)
689 | ref_samples = [samples[i] for i in ref_samples]
690 |
691 | print("Number of runs: %d" % len(samples))
692 |
693 | # sample grouping
694 | group_names = [
695 | i
696 | for i in sorted(set(groups), key=lambda k: "{0:0>50}".format(k))
697 | if i not in args.reference.split(";")
698 | ]
699 | if len(group_names) == len(samples):
700 | group_names = samples
701 |
702 | sampIx = np.array(
703 | [[j for j in range(len(groups)) if groups[j] == i] for i in group_names]
704 | )
705 | global nGroups
706 | nGroups = len(group_names)
707 | print("Number of sample groups: %d" % nGroups)
708 | print("Reference runs (%d): " % len(ref_samples), *ref_samples, sep="\t")
709 |
710 | # protein grouping
711 | pg = protein_grouping(df, args.db)
712 | print("Number of protein groups: %d" % len(pg.keys()))
713 |
714 | # coverage filtering
715 | df = df[
716 | [
717 | np.count_nonzero(np.nan_to_num(v)) >= args.min_samples
718 | for v in df[samples].values
719 | ]
720 | ]
721 |
722 | # reversed mapping (peptide to protein group) for checking peptide uniqueness.
723 | pep2prot = defaultdict(list)
724 | for prot_ids, bseqs in pg.items():
725 | for s in bseqs:
726 | pep2prot[s] += prot_ids.split()
727 |
728 | # use unique peptides
729 | if args.use_unique:
730 | df = df[[len(pep2prot[p]) == 1 for p in df.index]]
731 |
732 | # Check that we don't have any peptides with a single non-missing value.
733 |     # These tend to break Diffacto: in fast_farms they produce a covariance matrix of less than full rank, which the algorithm is not set up to handle.
734 | nonZeroNonMissing = np.vectorize(
735 | lambda x: ~np.isnan(x) and x != 0, otypes=[np.bool_]
736 | )
737 | if df.shape[0] > 0:
738 | for prot in sorted(pg.keys()):
739 | if prot == "nan":
740 | continue
741 | if DEBUG and EXAMPLE not in prot:
742 | continue
743 | # =====----=====-----=====-----=====
744 | peps = pg[prot] # constituent peptides
745 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe
746 | pep_count = len(dx) # number of peptides
747 | pep_abd = dx[samples].values
748 | counts = np.sum(nonZeroNonMissing(pep_abd), axis=1)
749 | if any(counts < 2):
750 | print(
751 | "Protein {} contained peptides with fewer than two non-missing or non-zero values. Please remove these peptides".format(
752 | prot
753 | )
754 | )
755 | return
756 |
757 | if args.loadings_out is not None:
758 | loadings_out_file = open(args.loadings_out, 'w')
759 | # -------------------------------------------------------------------------
760 | # perform differential analysis
761 | output_header = ["Protein", "N.Pept", "Q.Pept", "S/N", "P(PECA)"]
762 | output_header += group_names
763 | if SUMMARIZE_EACH_RUN:
764 | output_header += ["P(Top-%d)" % TOPN, "P(Median)", "P(PQPQ)"]
765 | output_header += ["Top-%d_%s" % (TOPN, s) for s in samples]
766 | output_header += ["Median_%s" % s for s in samples]
767 | output_header += ["PQPQ_%s" % s for s in samples]
768 |
769 | print(*output_header, sep="\t", file=args.out)
770 | for prot in sorted(pg.keys()):
771 | if prot == "nan":
772 | continue
773 | if DEBUG and EXAMPLE not in prot:
774 | continue
775 | # =====----=====-----=====-----=====
776 | peps = pg[prot] # constituent peptides
777 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe
778 | pep_count = len(dx) # number of peptides
779 | pep_abd = dx[samples].values
780 |
781 | if len(ref_samples): # rescale peptide abundances by reference runs
782 | reference_abundance = (
783 | dx[ref_samples].mean(axis=1).fillna(np.nanmean(dx[samples])).values
784 | )
785 | elif args.reference.lower() == "average": # rescale by average values
786 | reference_abundance = dx[samples].mean(axis=1).values
787 | else:
788 | if not args.log2:
789 | reference_abundance = 1.0
790 | else:
791 | reference_abundance = 0
792 |
793 | if not args.log2:
794 | pep_abd = np.log2(pep_abd)
795 | reference_abundance = np.log2(reference_abundance)
796 |
797 | pep_abd = (pep_abd.T - reference_abundance).T
798 |
799 |         if pep_count == 1:
800 |             # single-peptide protein group: do not report
801 |             continue
805 | elif pep_count > 1:
806 | loading, noise = fast_farms(
807 | pep_abd,
808 | mu=args.farms_mu,
809 | weight=args.farms_alpha,
810 | max_iter=1000,
811 | force_iter=not args.fast,
812 | )
813 | else:
814 | continue
815 |
816 | if noise < 1:
817 | sn = 10 * np.log10((1 - noise) / noise)
818 | else:
819 | # fix log(0) issue
820 | sn = -np.inf
821 |
822 | if args.loadings_out is not None:
823 |             for pep, pepLoading in zip(dx.index, loading):
824 | print(prot, pep, pepLoading, sep="\t", file = loadings_out_file)
825 |
826 | qc = loading > args.cutoff_weight
827 | abd_qc = mv_impute(
828 | pep_abd[qc],
829 | sampIx,
830 | least_missing=args.impute_threshold,
831 | impute_as=np.nanmin(pep_abd) - 1,
832 | )
833 | protein_summary_group = weighted_average(loading[qc], abd_qc, sampIx)
834 |
835 | if SUMMARIZE_EACH_RUN:
836 | with warnings.catch_warnings():
837 | warnings.simplefilter("ignore", category=RuntimeWarning)
838 | # Top-N averaging
839 | v = dx[samples].values
840 | if logInput:
841 | v = 2 ** v
842 | protein_summary_topn = np.array(
843 | [
844 | np.mean(np.sort(v[:, i][isfinite(v[:, i])])[-TOPN:])
845 | for i in range(len(samples))
846 | ]
847 | )
848 | p_ave = stats.f_oneway(
849 | *[
850 | protein_summary_topn[s][isfinite(protein_summary_topn[s])]
851 | for s in sampIx
852 | ]
853 | )[1]
854 |
855 | # Median
856 | v = dx[samples].values
857 | protein_summary_median = np.nanmedian(v, axis=0)
858 | p_med = stats.f_oneway(
859 | *[
860 | protein_summary_median[s][isfinite(protein_summary_median[s])]
861 | for s in sampIx
862 | ]
863 | )[1]
864 |
865 | # PQPQ clustering and averaging
866 | v = np.nan_to_num(pep_abd)
867 | clusters = pqpq(v, t=T_PQPQ)
868 | major = sorted(
869 | [(len(clusters[clusters == i]), i) for i in set(clusters.tolist())]
870 | )[-1]
871 | if major[0] >= 2:
872 | clusters[clusters != major[1]] = 0
873 | clusters[clusters != 0] = 1
874 | else:
875 | clusters = np.ones(*clusters.shape)
876 | protein_summary_pqpq = np.nanmean(
877 | dx[samples].values[clusters > 0], axis=0
878 | )
879 | p_pqpq = stats.f_oneway(
880 | *[
881 | protein_summary_pqpq[s][isfinite(protein_summary_pqpq[s])]
882 | for s in sampIx
883 | ]
884 | )[1]
885 |
886 | # ================================================================
887 | # PECA: grouping peptide-level p-values based on beta distribution
888 | # https://www.bioconductor.org/packages/release/bioc/html/PECA.html
889 | """
890 | Calculate Probe-level Expression Change Averages (PECA)
891 | to identify differential expression in Affymetrix gene expression
892 | microarray studies or in proteomic studies using peptide-level
893 | mesurements respectively.
894 | """
895 | pep_pvals = []
896 | for pep_v in abd_qc:
897 | with warnings.catch_warnings():
898 | warnings.simplefilter("ignore", category=RuntimeWarning)
899 | ave_0 = np.broadcast_to(np.nanmean(pep_v), (nGroups, 1))
900 | ave_1 = np.array([np.nanmean(pep_v[i]) for i in sampIx])
901 | try:
902 | f, d1, d2 = f_ANOVA(pep_v[None, ...], sampIx, ave_1, ave_0)
903 | pv = stats.f.sf(f, d1, d2)
904 | pep_pvals.append(pv)
905 |                 except Exception:
906 | pass
907 |
908 | pep_pvals = np.array(pep_pvals)
909 | pep_pvals = pep_pvals[isfinite(pep_pvals)]
910 | beta_ab = len(pep_pvals) / 2 + 0.5
911 | if len(pep_pvals) > 0:
912 | p_peca = stats.beta.cdf(np.median(pep_pvals), beta_ab, beta_ab)
913 | else:
914 | p_peca = np.nan
915 |
916 | if MC_SIMULATION:
917 | grand_ave = np.broadcast_to(np.nanmean(abd_qc), (nGroups, 1))
918 | f, _, _ = f_ANOVA(abd_qc, sampIx, protein_summary_group, grand_ave)
919 | diffacto_res[prot] = [grand_ave, loading[qc], abd_qc, sn, f, 1, 1]
920 |
921 | # =============================
922 | if not logInput:
923 | protein_summary_group = 2 ** protein_summary_group
924 |
925 | output_row = [prot, pep_count, sum(qc), sn, p_peca] + list(
926 | protein_summary_group
927 | )
928 |
929 | if SUMMARIZE_EACH_RUN:
930 | output_row += (
931 | [p_ave, p_med, p_pqpq]
932 | + list(protein_summary_topn)
933 | + list(protein_summary_median)
934 | + list(protein_summary_pqpq)
935 | )
936 |
937 | print(*output_row, sep="\t", file=args.out)
938 |
939 | if MC_SIMULATION and args.mc_out:
940 | try:
941 | mc_out = open(args.mc_out, "w")
942 |         except OSError:
943 |             print("Cannot open file:", args.mc_out, "- using stdout instead.")
944 | mc_out = sys.stdout
945 |
946 | print("Protein", "P(MC)", "MCFDR", sep="\t", file=mc_out)
947 | mc_result = perform_mcfdr(
948 | diffacto_res,
949 | sampIx,
950 | max_mc=MC_MAX_N,
951 | batch_size=MC_BATCH_SIZE,
952 | terminate_t=MC_MAX_HIT,
953 | target_fdr=0.05,
954 | )
955 |
956 | for prot, p, q in mc_result:
957 | print(prot, p, q, sep="\t", file=mc_out)
958 |
959 |
960 | if __name__ == "__main__":
961 | main()
962 |
--------------------------------------------------------------------------------
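A minimal sketch (hypothetical peptide sequences and run names) of building an
input CSV matching the format expected by the -i option when no FASTA database
is given: first column unique peptide sequences, second column protein ID(s),
then one abundance column per run with missing values left empty.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            # without -db, the second column holds protein ID(s), ";"-separated
            "Protein": ["P1", "P1", "P2"],
            # one abundance column per run; NaN is written out as an empty field
            "run1_repA": [1.2e6, 3.4e5, np.nan],
            "run1_repB": [1.1e6, 3.1e5, 8.8e4],
        },
        index=["PEPTLDEK", "ELVLSLK", "ANOTHERK"],  # unique peptide sequences
    )
    df.to_csv("toy.pep.csv")
    # then, e.g.: diffacto -i toy.pep.csv -out toy.protein.txt -log2 False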
/environment.yml:
--------------------------------------------------------------------------------
1 | name: diffacto_35
2 | channels:
3 | - defaults
4 | dependencies:
5 | - ca-certificates=2017.08.26=ha1e5d58_0
6 | - certifi=2017.11.5=py35hd00889a_0
7 | - decorator=4.1.2=py35hf37c5b3_0
8 | - intel-openmp=2018.0.0=h8158457_8
9 | - libcxx=4.0.1=h579ed51_0
10 | - libcxxabi=4.0.1=hebd6815_0
11 | - libedit=3.1=hb4e282d_0
12 | - libffi=3.2.1=h475c297_4
13 | - libgfortran=3.0.1=h93005f0_2
14 | - mkl=2018.0.1=hfbd8650_4
15 | - ncurses=6.0=hd04f020_2
16 | - networkx=2.0=py35hb193ae4_0
17 | - numpy=1.14.0=py35h8a80b8c_0
18 | - openssl=1.0.2n=hdbc3d79_0
19 | - pandas=0.22.0=py35h0a44026_0
20 | - pip=9.0.1=py35h33ce766_4
21 | - python=3.5.4=he720263_23
22 | - python-dateutil=2.6.1=py35h10515e0_1
23 | - pytz=2017.3=py35heeb7564_0
24 | - readline=7.0=hc1231fa_4
25 | - scikit-learn=0.19.1=py35h2b554eb_0
26 | - scipy=1.0.0=py35h8b35106_0
27 | - setuptools=36.5.0=py35h52cde6a_0
28 | - six=1.11.0=py35h39a4c60_1
29 | - sqlite=3.20.1=h7e4c145_2
30 | - tk=8.6.7=h35a86e2_3
31 | - wheel=0.30.0=py35h5c0b906_1
32 | - xz=5.2.3=h0278029_2
33 | - zlib=1.2.11=hf3cbc9b_2
34 | - pip:
35 | - altgraph==0.15
36 | - future==0.16.0
37 | - macholib==1.9
38 | - pefile==2017.11.5
39 | - pyinstaller==3.3.1
40 | - pyteomics==3.4.2
41 |
--------------------------------------------------------------------------------
/example/HBY20Mix.samples.lst:
--------------------------------------------------------------------------------
1 | 20160112_P1_SEG_MID P1
2 | 20160112_P1_SEG_LOW P1
3 | 20160112_P1_SEG_HIGH P1
4 | 20160112_P2_SEG_LOW P2
5 | 20160112_P2_SEG_MID P2
6 | 20160112_P2_SEG_HIGH P2
7 | 20160112_P3_SEG_HIGH P3
8 | 20160112_P3_SEG_LOW P3
9 | 20160112_P3_SEG_MID P3
10 | 20160112_P4_SEG_HIGH P4
11 | 20160112_P4_SEG_LOW P4
12 | 20160112_P4_SEG_MID P4
13 | 20160112_P5_SEG_HIGH P5
14 | 20160112_P5_SEG_LOW_160121063813 P5
15 | 20160112_P5_SEG_MID P5
16 | 20160112_P6_SEG_HIGH P6
17 | 20160112_P6_SEG_LOW P6
18 | 20160112_P6_SEG_MID P6
19 | 20160112_P7_SEG_HIGH P7
20 | 20160112_P7_SEG_LOW P7
21 | 20160112_P7_SEG_MID P7
22 | 20160112_P8_SEG_HIGH P8
23 | 20160112_P8_SEG_LOW P8
24 | 20160112_P8_SEG_MID_160121160232 P8
25 | 20160112_P9_SEG_MID P9
26 | 20160112_P9_SEG_HIGH P9
27 | 20160112_P9_SEG_LOW_160121012404 P9
28 | 20160112_P10_SEG_HIGH P10
29 | 20160112_P10_SEG_LOW_160120200540 P10
30 | 20160112_P10_SEG_MID P10
31 | 20160112_P11_SEG_LOW_160203031257 P11
32 | 20160112_P11_SEG_MID_160203050927 P11
33 | 20160112_P11_SEG_HIGH_160203070611 P11
34 | 20160112_P11_SEG_HIGH REF
35 | 20160112_P11_SEG_LOW REF
36 | 20160112_P11_SEG_MID REF
37 | 20160112_P12_SEG_LOW P12
38 | 20160112_P12_SEG_MID P12
39 | 20160112_P12_SEG_HIGH P12
40 | 20160112_P13_SEG_HIGH P13
41 | 20160112_P13_SEG_LOW P13
42 | 20160112_P13_SEG_MID P13
43 | 20160112_P14_SEG_HIGH P14
44 | 20160112_P14_SEG_MID P14
45 | 20160112_P14_SEG_LOW_160120174525 P14
46 | 20160112_P15_SEG_LOW P15
47 | 20160112_P15_SEG_MID P15
48 | 20160112_P15_SEG_HIGH_160120220930 P15
49 | 20160112_P16_SEG_LOW P16
50 | 20160112_P16_SEG_MID P16
51 | 20160112_P16_SEG_HIGH_160121181003 P16
52 | 20160112_P17_SEG_HIGH P17
53 | 20160112_P17_SEG_LOW P17
54 | 20160112_P17_SEG_MID P17
55 | 20160112_P18_SEG_LOW P18
56 | 20160112_P18_SEG_MID P18
57 | 20160112_P18_SEG_HIGH P18
58 | 20160112_P19_SEG_MID_160121112852 P19
59 | 20160112_P19_SEG_HIGH P19
60 | 20160112_P19_SEG_LOW P19
61 | 20160112_P20_SEG_MID P20
62 | 20160112_P20_SEG_LOW P20
63 | 20160112_P20_SEG_HIGH_160121032454 P20
--------------------------------------------------------------------------------
/example/iPRG.samples.lst:
--------------------------------------------------------------------------------
1 | JD_06232014_sample1-A S1
2 | JD_06232014_sample1_B S1
3 | JD_06232014_sample1_C S1
4 | JD_06232014_sample2_A S2
5 | JD_06232014_sample2_B S2
6 | JD_06232014_sample2_C S2
7 | JD_06232014_sample3_A S3
8 | JD_06232014_sample3_B S3
9 | JD_06232014_sample3_C S3
10 | JD_06232014_sample4-A S4
11 | JD_06232014_sample4_B S4
12 | JD_06232014_sample4_C S4
--------------------------------------------------------------------------------
/example/readme.md:
--------------------------------------------------------------------------------
1 | ## Diffacto: Examples
2 | ----
3 |
4 | To run these examples, clone this git repository and descend to the example directory.
5 | To install the dependencies, run
6 |     pip install pyteomics numpy pandas networkx scikit-learn scipy
7 |
8 | #### Print usage information
9 | python ../run_diffacto.py -h
10 |
11 |
12 | ---
13 | #### Example-1:
14 |
15 |
16 | python ../run_diffacto.py -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt -mc_out iPRG.denovo.protein.FDR -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
17 |
18 |
19 | #
20 |
21 | * input-1, peptide abundances: _iPRG.novo.pep.csv_
22 | * input-2, sample list: _iPRG.samples.lst_
23 | * output-1, protein quantification: _iPRG.denovo.protein.txt_
24 | * output-2, FDR estimation by MC tests: _iPRG.denovo.protein.FDR_
25 | * other parameters:
26 | -min_samples 4 (peptide quantified in at least four runs)
27 | -impute_threshold 0.9 (threshold for missing value imputation 90%)
28 | -use_unique True (only use unique peptides for quantification)
29 | -log2 False (input abundances are not in log scale)
30 |
31 |
32 | ---
33 | #### Example-2:
34 |
35 |
36 | python ../run_diffacto.py -i HBY20Mix.peptides.csv -samples HBY20Mix.samples.lst -db UP000002311_559292.fasta -out HBY20Mix.protein.txt -min_samples 30 -impute_threshold 0.7 -log2 False -reference REF
37 |
38 |
39 | #
40 |
41 | * input-1, peptide abundances: _HBY20Mix.peptides.csv_
42 | * input-2, sample list: _HBY20Mix.samples.lst_
43 | * input-3, protein database: _UP000002311_559292.fasta_
44 | * output-1, protein quantification: _HBY20Mix.protein.txt_
45 | * other parameters:
46 | -min_samples 30 (peptide quantified in at least 30 runs)
47 | -impute_threshold 0.7 (threshold for missing value imputation 70%)
48 | -log2 False (input abundances are not in log scale)
49 | -reference REF (use the runs labeled 'REF' as the internal reference)
50 |
--------------------------------------------------------------------------------
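A sketch for loading the Example-1 outputs above (per the headers written by
diffacto.py, -out is a TSV with columns Protein, N.Pept, Q.Pept, S/N and
P(PECA) plus one column per sample group; -mc_out is a TSV with columns
Protein, P(MC) and MCFDR):

    import pandas as pd

    prot = pd.read_table("iPRG.denovo.protein.txt", index_col=0)
    fdr = pd.read_table("iPRG.denovo.protein.FDR", index_col=0)
    print(prot.join(fdr).sort_values("MCFDR").head())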
/run_diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """Convenience wrapper for running diffacto directly from source tree."""
6 |
7 |
8 | from diffacto.diffacto import main
9 |
10 |
11 | if __name__ == '__main__':
12 | main()
13 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | python-tag = py36
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """setup.py: setuptools control for diffacto."""
2 |
3 | import re
4 | from setuptools import setup, find_packages
5 |
6 | version = re.search(
7 |     r'^__version__\s*=\s*"(.*)"',
8 | open('diffacto/diffacto.py').read(),
9 | re.M
10 | ).group(1)
11 |
12 | from codecs import open
13 | from os import path
14 |
15 | here = path.abspath(path.dirname(__file__))
16 |
17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f:
18 | long_description = f.read()
19 |
20 |
21 | setup(
22 | # This is the name of your project. The first time you publish this
23 | # package, this name will be registered for you. It will determine how
24 | # users can install this project, e.g.:
25 | #
26 | # $ pip install diffacto
27 | #
28 |
29 | name='diffacto', # Required
30 | version=version, # Required
31 | packages = ["diffacto"],
32 | entry_points = {
33 | "console_scripts": ['diffacto = diffacto.diffacto:main']
34 | },
35 |
36 | description='A protein summarization method for shotgun proteomics experiments', # Required
37 | long_description=long_description, # Optional
38 | url='https://github.com/statisticalbiotechnology/diffacto', # Optional
39 | author='Bo Zhang, Lukas Käll, KTH', # Optional
40 | author_email='lukas.kall@scilifelab.se', # Optional
41 | maintainer='Lukas Käll, KTH', # Optional
42 | maintainer_email='lukas.kall@scilifelab.se', # Optional
43 | license='Apache',
44 | # Classifiers help users find your project by categorizing it.
45 | #
46 | # For a list of valid classifiers, see
47 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers
48 | classifiers=[ # Optional
49 | # How mature is this project? Common values are
50 | # 3 - Alpha
51 | # 4 - Beta
52 | # 5 - Production/Stable
53 | 'Development Status :: 3 - Alpha',
54 |
55 | # Indicate who your project is intended for
56 | 'Intended Audience :: Developers',
57 | 'Topic :: Software Development :: Build Tools',
58 |
59 | # Pick your license as you wish
60 | 'License :: OSI Approved :: Apache Software License',
61 |
62 | # Specify the Python versions you support here. In particular, ensure
63 | # that you indicate whether you support Python 2, Python 3 or both.
64 | 'Programming Language :: Python :: 3',
65 | 'Programming Language :: Python :: 3.4',
66 | 'Programming Language :: Python :: 3.5',
67 | 'Programming Language :: Python :: 3.6',
68 | ],
69 |
70 | # This field adds keywords for your project which will appear on the
71 | # project page. What does your project relate to?
72 | #
73 | # Note that this is a string of words separated by whitespace, not a list.
74 | # keywords='sample setuptools development', # Optional
75 |
76 | # You can just specify package directories manually here if your project is
77 | # simple. Or you can use find_packages().
78 | #
79 | # Alternatively, if you just want to distribute a single Python file, use
80 | # the `py_modules` argument instead as follows, which will expect a file
81 | # called `my_module.py` to exist:
82 | #
83 | # py_modules=["my_module"],
84 | #
85 | # packages=find_packages(exclude=['contrib', 'docs', 'tests']), # Required
86 |
87 | # This field lists other packages that your project depends on to run.
88 | # Any package you put here will be installed by pip when your project is
89 | # installed, so they must be valid existing projects.
90 | #
91 | # For an analysis of "install_requires" vs pip's requirements files see:
92 | # https://packaging.python.org/en/latest/requirements.html
93 | install_requires=[
94 | 'numpy>=1.10',
95 | 'scipy>=0.17',
96 | 'pandas>=0.18',
97 | 'networkx>=1.10',
98 | 'scikit-learn>=0.17',
99 | 'pyteomics>=3.3',
100 | 'Cython>=0.26'],
101 |
102 | # List additional groups of dependencies here (e.g. development
103 | # dependencies). Users will be able to install these using the "extras"
104 | # syntax, for example:
105 | #
106 | # $ pip install sampleproject[dev]
107 | #
108 | # Similar to `install_requires` above, these must be valid existing
109 | # projects.
110 | #extras_require={ # Optional
111 | # 'dev': ['check-manifest'],
112 | # 'test': ['coverage'],
113 | #},
114 |
115 | # If there are data files included in your packages that need to be
116 | # installed, specify them here.
117 | #
118 | # If using Python 2.6 or earlier, then these have to be included in
119 | # MANIFEST.in as well.
120 | #package_data={ # Optional
121 | # 'sample': ['package_data.dat'],
122 | #},
123 |
124 | # Although 'package_data' is the preferred approach, in some case you may
125 | # need to place data files outside of your packages. See:
126 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files
127 | #
128 | # In this case, 'data_file' will be installed into '/my_data'
129 | #data_files=[('my_data', ['data/data_file'])], # Optional
130 |
131 | )
132 |
--------------------------------------------------------------------------------