├── .gitignore
├── .test.sh
├── .travis.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── diffacto
│   ├── __init__.py
│   ├── __main__.py
│   └── diffacto.py
├── environment.yml
├── example
│   ├── HBY20Mix.peptides.csv
│   ├── HBY20Mix.samples.lst
│   ├── UP000002311_559292.fasta
│   ├── iPRG.novo.pep.csv
│   ├── iPRG.samples.lst
│   └── readme.md
├── run_diffacto.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | cd example
3 | diffacto -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt \
4 |   -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
5 | protfile_size=$(wc -l < iPRG.denovo.protein.txt)
6 | #fdrfile_size=$(wc -l < iPRG.denovo.protein.FDR)
7 | #if [ $protfile_size -ge '200' ] && [ $fdrfile_size -ge '200' ]; then
8 | if [ $protfile_size -ge '200' ]; then
9 |     # All OK
10 |     exit 0
11 | else
12 |     # Something is wrong
13 |     exit 1
14 | fi
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | #  - "3.5"
4 |   - "3.6"
5 | # command to install dependencies
6 | install:
7 |   - python setup.py install
8 | # command to run tests
9 | script: ./.test.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2017 Bo Zhang and Lukas Käll
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE.txt
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Diffacto: Differential Factor Analysis for Comparative Shotgun Proteomics
2 | ==========================================================================
3 |
4 | Requirements
5 | --------------
6 |
7 | Anaconda_ Python 3.5+
8 |
9 | Packages needed:
10 |
11 | - numpy 1.10+
12 | - scipy 0.17+
13 | - pandas 0.18+
14 | - networkx 1.10+
15 | - scikit-learn 0.17+
16 | - pyteomics_ 3.3+
17 |
18 | .. _Anaconda: https://www.anaconda.com/
19 | .. _pyteomics: https://pyteomics.readthedocs.io/
20 |
21 | Installation via ``pip``
22 | *************************
23 |
24 | ::
25 |
26 |    pip install diffacto
27 |
28 |
29 | Usage
30 | -----
31 |
32 | ::
33 |
34 |    diffacto [-h] -i I [-db [DB]] [-samples [SAMPLES]] [-log2 LOG2]
35 |             [-normalize {average,median,GMM,None}]
36 |             [-farms_mu FARMS_MU] [-farms_alpha FARMS_ALPHA]
37 |             [-reference REFERENCE] [-min_samples MIN_SAMPLES]
38 |             [-use_unique USE_UNIQUE]
39 |             [-impute_threshold IMPUTE_THRESHOLD]
40 |             [-cutoff_weight CUTOFF_WEIGHT] [-fast FAST] [-out OUT]
41 |             [-mc_out MC_OUT]
42 |    optional arguments:
43 |      -h, --help            show this help message and exit
44 |      -i I                  Peptide abundances in CSV format. The first row
45 |                            should contain names for all samples. The first column
46 |                            should contain unique peptide sequences. Missing
47 |                            values should be empty instead of zeros. (default:
48 |                            None)
49 |      -db [DB]              Protein database in FASTA format. If None, the peptide
50 |                            file must have protein ID(s) in the second column.
51 |                            (default: None)
52 |      -samples [SAMPLES]    File of the sample list. One run and its sample group
53 |                            per line, separated by tab. If None, read from peptide
54 |                            file headings, then each run will be summarized as a
55 |                            group. (default: None)
56 |      -log2 LOG2            Input abundances are in log scale (True) or linear
57 |                            scale (False) (default: False)
58 |      -normalize {average,median,GMM,None}
59 |                            Method for sample-wise normalization. (default: None)
60 |      -farms_mu FARMS_MU    Hyperparameter mu (default: 0.1)
61 |      -farms_alpha FARMS_ALPHA
62 |                            Hyperparameter weight of prior probability (default:
63 |                            0.1)
64 |      -reference REFERENCE  Names of reference sample groups (separated by
65 |                            semicolon) (default: average)
66 |      -min_samples MIN_SAMPLES
67 |                            Minimum number of samples a peptide must be
68 |                            quantified in (default: 1)
69 |      -use_unique USE_UNIQUE
70 |                            Use unique peptides only (default: False)
71 |      -impute_threshold IMPUTE_THRESHOLD
72 |                            Minimum fraction of missing values within a sample
73 |                            group. Missing values are imputed only when the
74 |                            missing fraction in the group is larger than this threshold.
(default: 0.99)
75 |      -cutoff_weight CUTOFF_WEIGHT
76 |                            Peptides weighted lower than the cutoff will be
77 |                            excluded (default: 0.5)
78 |      -fast FAST            Allow early termination in EM calculation when noise
79 |                            is sufficiently small. (default: False)
80 |      -out OUT              Path to output file (writing in TSV format).
81 |      -mc_out MC_OUT        Path to MCFDR output (writing in TSV format).
82 |                            (default: None)
83 |
84 |
85 | Example
86 | -------
87 |
88 | Examples are given in the example_ directory.
89 |
90 | .. _example: ./example
91 |
--------------------------------------------------------------------------------
/diffacto/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/statisticalbiotechnology/diffacto/6389c9410f4c01aed6f33fd787a9fce0cf304a9e/diffacto/__init__.py
--------------------------------------------------------------------------------
/diffacto/__main__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | """diffacto.__main__: executed when the package directory is called as a script."""
5 |
6 |
7 | from .diffacto import main
8 | main()
--------------------------------------------------------------------------------
/diffacto/diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: UTF-8 -*-
3 | from __future__ import division, print_function
4 |
5 | """diffacto.diffacto: provides entry point main()."""
6 |
7 | __version__ = "1.0.7"
8 |
9 | import csv
10 | import re
11 | import warnings
12 | from collections import defaultdict
13 | from multiprocessing import Pool
14 |
15 | from scipy import optimize, stats
16 | import networkx as nx
17 | import numpy as np
18 | import pandas
19 | from numpy import array, isfinite, nanmean, nansum
20 | from pyteomics import fasta
21 |
22 | # from numba import jit  # Enable just-in-time compilation for speed-up
23 | # @jit
24 | def fast_farms(
25 |     probes: np.array,
26 |     weight: float = 0.5,
27 |     mu: float = 0,
28 |     max_iter: int = 1000,
29 |     force_iter: bool = False,
30 |     min_noise: float = 1e-4,
31 |     fill_nan: float = 0.0,
32 | ):
33 |     """Bayesian Factor Analysis for Proteomics Summarization
34 |     A Python translation of the function "generateExprVal.method.farms" from
35 |     Bioconductor FARMS.
36 |     [http://www.bioconductor.org/packages/release/bioc/html/farms.html]
37 |     [http://www.bioinf.jku.at/publications/papers/farms/supplementary.ps]
38 |
39 |     Reference:
40 |         Hochreiter S, Clevert D and Obermayer K (2006). A new summarization
41 |         method for Affymetrix probe level data. Bioinformatics, 22(8),
42 |         http://bioinformatics.oxfordjournals.org/cgi/content/abstract/22/8/943.
43 |
44 |     Inputs:
45 |         probes: Peptide abundance array (N peptides, M samples) in log scale.
46 |         weight: Hyperparameter (backscale factor) value in the range of [0,1]
47 |             which determines the influence of the prior.
48 |         mu: Hyperparameter value which allows one to quantify different aspects
49 |             of potential prior knowledge. A value near zero assumes that
50 |             most genes do not contain a signal, and introduces a bias for
51 |             loading matrix elements near zero.
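        max_iter: Maximum number of EM iterations.
        force_iter: If True, disable the relative-change early-stopping
            criterion and iterate until max_iter (or until the estimates
            stop changing exactly).
        min_noise: Floor applied to small standard deviations, singular
            values and variance terms, for numerical stability.
        fill_nan: Value substituted for missing readouts; when left at 0.0,
            missing values are simply zero-filled before centering.
    Output:
        weights: Per-peptide loadings rescaled to the range of [0,1].
        noise: Estimated noise level, i.e. the fraction of variance left
            unexplained by the common factor.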
""" 52 | 53 | readouts = np.array(probes) 54 | if fill_nan != 0: 55 | readouts[np.isnan(readouts)] = fill_nan 56 | 57 | # normalize and transform X 58 | X = np.nan_to_num(readouts).T 59 | X = X - np.nanmean(X, axis=0) 60 | xsd = np.nanstd(X, axis=0) 61 | xsd[xsd < min_noise] = 1.0 62 | X /= xsd 63 | X[~isfinite(X)] = 0 64 | 65 | n_samples, n_features = X.shape 66 | C = np.cov(X.T, ddof=0) 67 | 68 | # positive definite 69 | C = 0.5 * (C + C.T) 70 | C[np.where(C < 0)] = 0 71 | 72 | # robustness 73 | U, s, V = np.linalg.svd(C) 74 | s[s < min_noise] = min_noise 75 | C = U.dot(np.diag(s)).dot(V) 76 | 77 | # initiation 78 | λ = np.sqrt(np.diag(C) * 0.75) 79 | ψ = np.diag(C) - λ ** 2 80 | old_psi = ψ 81 | old_lambda = λ 82 | alpha = weight * n_features 83 | E = 1.0 84 | min_noise_square = min_noise ** 2 85 | C_diag = np.diag(C) 86 | 87 | for i in range(max_iter): 88 | # E step 89 | φ = λ / ψ 90 | a = 1 + np.matmul(λ.reshape(1, -1), φ.reshape(-1, 1)) 91 | η = φ / a 92 | ζ = C.dot(η.T) 93 | E = 1 - η.dot(λ) + η.dot(ζ) 94 | 95 | # M step 96 | λ = ζ.T / (E + ψ * alpha) 97 | λ = np.asarray(λ)[0] 98 | ψ = C_diag - np.asarray(ζ)[0] * λ + ψ * alpha * λ * (mu - λ) 99 | ψ = np.maximum(ψ, min_noise_square) 100 | if ( 101 | ψ[-1] == old_psi[-1] 102 | and ψ[0] == old_psi[0] 103 | and np.array_equal(ψ, old_psi) 104 | and np.array_equal(λ, old_lambda) 105 | ): 106 | break 107 | 108 | if not force_iter: 109 | if abs(ψ - old_psi).max() / old_psi.max() < min_noise / 10: 110 | break 111 | 112 | old_psi = ψ 113 | old_lambda = λ 114 | 115 | loading = np.sqrt(E[0, 0]) * λ 116 | φ = loading / ψ 117 | weights = loading / loading.max() # rescale loadings to the range of [0,1] 118 | noise = 1 / (1 + np.matmul(loading.reshape(1, -1), φ.reshape(-1, 1))) 119 | noise = noise[0, 0] 120 | return weights, noise 121 | 122 | 123 | # @jit(nogil=True) 124 | def fast_gmean_nomissing(weights, pep_abd, group_ix): 125 | """ 126 | Calculate geometric means based on non-missing peptide readouts. 
127 | """ 128 | abd_w = pep_abd * weights[..., None] 129 | one_w = abd_w / abd_w * weights[..., None] 130 | a_sums = np.nansum(abd_w, axis=0) 131 | w_sums = np.nansum(one_w, axis=0) 132 | expr = a_sums[group_ix].sum(axis=1) / w_sums[group_ix].sum(axis=1) 133 | return expr 134 | 135 | 136 | # @jit(nogil=True) 137 | def sum_squares(pep_abd, group_ix, estimates): 138 | """ 139 | Calculate sum of squared residuals 140 | """ 141 | global nGroups 142 | residual = 0.0 143 | for i in range(nGroups): 144 | res = pep_abd[:, group_ix[i]] - estimates[i] 145 | residual += np.nansum(res * res) 146 | return residual 147 | 148 | 149 | # @jit(nogil=True) 150 | def f_ANOVA(pep_abd, group_ix, estimates, null_ave, dof_loss=0): 151 | """ 152 | Perform ANOVA 153 | Inputs: 154 | pep_abd: Peptide abundance matrix 155 | group_ix: Index of sample groups 156 | estimates: Estimated abundances of sample groups 157 | null_ave: Global average 158 | dof_loss: Loss of dof due to averaging 159 | Return: 160 | f: Value of F-statistic 161 | dof1: Degree of freedom of model 1 162 | dof2: Degree of freedom of model 2 163 | """ 164 | global nGroups 165 | ss_total = sum_squares(pep_abd, group_ix, null_ave) 166 | ss_resid = sum_squares(pep_abd, group_ix, estimates) 167 | dof1 = nGroups - 1 168 | dof2 = isfinite(pep_abd).sum() - nGroups - dof_loss 169 | if dof2 <= 0: 170 | return np.nan, dof1, dof2 171 | f = ((ss_total - ss_resid) / dof1) / (ss_resid / dof2) 172 | return f, dof1, dof2 173 | 174 | 175 | def mv_impute(pep_abd, group_ix, least_missing=0.99, impute_as=0.001): 176 | """ Impute missing values when having a large proportion in a sample group. 177 | Inputs: 178 | pep_abd: n peptides, m samples, in linear scale 179 | group_ix: grouping index for each of the m samples 180 | least_missing: set the minimum threshold of missing rate to trigger the imputation (Default: 99%). 
181 | impute_as: set missing values in the sample to this value 182 | Return: 183 | numpy array after replacing missing values with imputed values 184 | """ 185 | aT = np.array(pep_abd).T 186 | for ix in group_ix: 187 | if np.isnan(aT[ix]).sum() > least_missing * len(aT[ix].flatten()): 188 | val = aT[ix] 189 | val[np.where(np.isnan(val))] = impute_as 190 | aT[ix] = val 191 | return aT.T 192 | 193 | 194 | # @jit(nogil=True) 195 | def weighted_average(weights, pep_abd, group_ix): 196 | """ 197 | Calculate weighted geometric means for sample groups 198 | Inputs: 199 | weights: Weights of peptides after filtering by loading threshold 200 | pep_abd: Peptide abundances after filtering by loading threshold 201 | group_ix: Array indexes of sample groups 202 | Return: 203 | expr: Estimated expression levels 204 | """ 205 | global nGroups 206 | abd_w = pep_abd * weights[..., None] 207 | count_peptides = np.sum(~np.isnan(pep_abd), axis = 0) 208 | one_w = abd_w / abd_w * weights[..., None] 209 | a_sums = np.nansum(abd_w, axis=0) 210 | w_sums = np.nansum(one_w, axis=0) 211 | expr = np.empty(nGroups) 212 | for i in range(expr.shape[0]): 213 | if count_peptides[i] > 0: 214 | expr[i] = a_sums[group_ix[i]].sum() / w_sums[group_ix[i]].sum() 215 | else: 216 | expr[i] = np.nan 217 | return expr 218 | 219 | def _init_pool(the_dict): 220 | global prot_dict 221 | prot_dict = the_dict 222 | 223 | def _load_fasta(db, id_regex): 224 | prot_dict = dict() 225 | for header, seq in fasta.read(db): 226 | seq = seq.replace("I", "L").upper() # convert DB sequence I -> L 227 | prot_id = header.split()[0] 228 | if id_regex is not None: 229 | find_id = re.findall(id_regex, header) 230 | if len(find_id) > 0: 231 | prot_id = find_id[0] 232 | prot_dict[prot_id] = seq 233 | 234 | return prot_dict 235 | 236 | 237 | def _map_seq(p): 238 | global prot_dict 239 | pairs = [] 240 | for prot_id, seq in prot_dict.items(): 241 | if p in seq: 242 | pairs.append([p, prot_id]) 243 | return pairs 244 | 245 | def peptide_db_graph(peps, db, id_regex=None): 246 | """ search a set of peptides against a FASTA database """ 247 | g = nx.Graph() 248 | protdict = _load_fasta(db, id_regex) 249 | 250 | with Pool(initializer = _init_pool, initargs=(protdict,)) as pool: 251 | mapped_ppps = pool.map(_map_seq, peps) 252 | 253 | for ppps in mapped_ppps: 254 | if len(ppps): 255 | g.add_edges_from(ppps) 256 | return g 257 | 258 | 259 | def parsimony_grouping(g, peps): 260 | """ Group peptides to proteins using the rule of parsimony 261 | Inputs: 262 | g: an undirected graph with peptide <-> protein as edges 263 | peps: the set of peptide sequences, nodes not listed in the peptide set are protein IDs. 
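    Note:
        Protein groups are reported greedily: each round picks the protein
        explaining the most not-yet-reported peptides; proteins with an
        identical peptide set are merged into one ";"-separated group, and
        proteins whose peptides form a subset of the chosen set are suppressed.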
264 | Return: 265 | prot_groups: a dictionary with mappings between proteins (keys) to peptides (values) 266 | """ 267 | not_peps = set(g.nodes()) - set(peps) 268 | prot_groups = dict() 269 | for cc in (g.subgraph(c).copy() for c in nx.connected_components(g)): 270 | in_group_peptides = set(cc.nodes()) - not_peps 271 | in_group_proteins = not_peps.intersection(cc.nodes()) 272 | 273 | if len(in_group_proteins) == 1: 274 | prot_groups[in_group_proteins.pop()] = in_group_peptides 275 | elif len(in_group_proteins) > 1: 276 | reported = set() 277 | while len(in_group_proteins - reported) > 0: 278 | candidate_proteins = sorted( 279 | in_group_proteins - reported, 280 | key=lambda p: (len(set(cc[p].keys()) - reported), p), 281 | reverse=True, 282 | ) 283 | p = candidate_proteins[0] 284 | current_peps = set(cc[p].keys()) 285 | plabel = [p] 286 | for i in range(1, len(candidate_proteins)): 287 | _p = candidate_proteins[i] 288 | _peps = set(cc[_p].keys()) 289 | if _peps == current_peps: 290 | plabel.append(_p) 291 | if len(_peps - current_peps) == 0: 292 | reported.add(_p) 293 | 294 | plabel = ";".join(sorted(plabel)) 295 | if len(current_peps - reported) > 0: 296 | prot_groups[plabel] = current_peps 297 | reported = reported.union(current_peps) 298 | reported.add(p) 299 | return prot_groups 300 | 301 | 302 | def protein_grouping(df, proteinDb): 303 | """ 304 | Grouping peptide sequences in the given dataframe (df) 305 | by mapping to a protein database (FASTA); 306 | or by the first column of dataframe when the database is absent 307 | """ 308 | peptides = sorted(set(df.index)) 309 | if not proteinDb: 310 | g = nx.Graph() 311 | for i, x in df.iterrows(): 312 | for prot in x.values.astype("str")[0].split(";"): 313 | if len(prot) > 0: 314 | g.add_edge(i, prot) 315 | else: 316 | g = peptide_db_graph(peptides, proteinDb) 317 | pg = parsimony_grouping(g, peptides) 318 | return pg 319 | 320 | 321 | def zero_center_normalize(df, samples, logInput=False, method="median"): 322 | """ 323 | Transforming input peptide abundance table into log2-scale and centralize to zero. 324 | Inputs: 325 | df : dataframe of peptide abundances 326 | samples: column names of selected samples 327 | logInput: input abundances are already in log scale 328 | method: method for estimating zero point 329 | Return: 330 | df: the dataframe of peptide abundances after normalization 331 | """ 332 | assert method in ( 333 | "median", 334 | "average", 335 | "GMM", 336 | ), "Zero centering method has to be among median, average or GMM!" 
337 | 338 | if not logInput: 339 | # convert abundances to log2 scale 340 | df[samples] = df[samples].apply(np.log2) 341 | if method == "average": 342 | norm_scale = np.nanmean(df[samples], axis=0) 343 | elif method == "median": 344 | norm_scale = np.nanmedian(df[samples], axis=0) 345 | elif method == "GMM": 346 | """ two-component Gaussian mixture model """ 347 | from sklearn.mixture import GaussianMixture as GMM 348 | 349 | gmm = GMM(2) 350 | 351 | norm_scale = [] 352 | for sp in samples: 353 | v = df[sp].values 354 | v = v[np.logical_not(np.isnan(v))] 355 | v = v[np.logical_not(np.isinf(v))] 356 | try: 357 | gmm.fit(np.matrix(v).T) 358 | vmean = gmm.means_[np.argmin(gmm.covariances_)][0] 359 | norm_scale.append(vmean) 360 | except: 361 | norm_scale.append(np.nanmean(v)) 362 | norm_scale = np.array(norm_scale) 363 | 364 | print( 365 | "Caution!!", 366 | "Two-component Gaussian mixture model is used to center peptide abundances!", 367 | "Centring factors are:", 368 | *[ 369 | "Sample:{}\tGMM:{:.3f}\t Median:{:.3f}".format(s, g, m) 370 | for s, g, m in zip( 371 | samples, norm_scale, np.nanmedian(df[samples], axis=0) 372 | ) 373 | ], 374 | "Check if GMM estimated values deviate greatly from median values.", 375 | "If in doubt, use other metrics (e.g. median) to centre the abundances!!\n", 376 | sep="\n" 377 | ) 378 | df[samples] = df[samples] - norm_scale 379 | return df 380 | 381 | 382 | def pqpq(peptide_abundances, metric="correlation", method="complete", t=0.4): 383 | """ The essential PQPQ2 process from @yafeng 384 | [https://github.com/yafeng/pqpq_python/blob/master/pqpq2.py] 385 | """ 386 | from scipy import cluster 387 | 388 | d = cluster.hierarchy.distance.pdist(peptide_abundances, metric) 389 | if metric == "correlation": 390 | D = np.clip(d, 0, 2) 391 | else: 392 | D = d 393 | L = cluster.hierarchy.linkage(D, method, metric) 394 | ind = cluster.hierarchy.fcluster(L, t, "distance") 395 | return ind 396 | 397 | 398 | # ===================== 399 | # Monte Carlo permutation tests 400 | def monte_carlo_permutation(samp_index, n): 401 | """ 402 | Generating a batch of random permutations of sample indexes 403 | Inputs: 404 | samp_index: array indexes of sample groups 405 | n: size of the batch of permutations 406 | """ 407 | flat = np.hstack(samp_index) 408 | ix = [0] 409 | [ix.append(ix[-1] + len(i)) for i in samp_index] 410 | for i in range(n): 411 | permute = np.random.permutation(flat) 412 | new_ix = [permute[ix[i - 1] : ix[i]] for i in range(1, len(ix))] 413 | yield np.array(new_ix) 414 | 415 | 416 | def calc_q(pvals): 417 | """ 418 | Calculate q-values based on a list of p-values, with a conservative estimate 419 | of the proportion of true null hypotheses (pi0_hat) based on the given p-values. 
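    pi0_hat is estimated as min(1, 2 * mean(p)); q-values then follow a
    Benjamini-Hochberg style step-up, q = p * pi0_hat * m / rank, made
    monotone over the ranking. Entries with non-finite p-values get q = 1.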
420 | """ 421 | pv = np.array(pvals) 422 | pv = pv[isfinite(pv)] 423 | pi0_hat = min(1, np.sum(pv) * 2 / len(pv)) 424 | ranking = pv.argsort().argsort() + 1 425 | qlist = pv * pi0_hat * len(pv) / ranking 426 | for i, rank in enumerate(ranking): 427 | qlist[i] = min(qlist[ranking >= rank]) 428 | qlist = list(qlist) 429 | qvals = np.ones_like(pvals).tolist() 430 | for i, e in enumerate(pvals): 431 | if isfinite(e): 432 | qvals[i] = qlist.pop(0) 433 | return np.array(qvals) 434 | 435 | 436 | def perform_mcfdr( 437 | diffacto_res, 438 | sampIx, 439 | max_mc=1e5, 440 | batch_size=100, 441 | terminate_t=50, 442 | target_fdr=0.05, 443 | sn_threshold=-20, 444 | ): 445 | """ 446 | Sequential Monte Carlo permutation test 447 | Inputs: 448 | diffacto_res: a dictionary of Diffacto statistics for each protein 449 | sampIx: array indexes of sample groups 450 | max_mc: maximun number of random permutations 451 | batch_size: number of permutations for every iteration 452 | terminate_t: target number of permutation tests with better statistics to terminate the simulation for one protein 453 | target_fdr: target level of FDR to stop simulation for the remaining proteins. 454 | sn_threshold: signal-to-noise threshold for exclusion of non-informative proteins. 455 | """ 456 | proteins = sorted(diffacto_res.keys()) 457 | preTermination = set() 458 | for batch in range(1, int(max_mc / batch_size) + 2): 459 | mc_pvals = [] 460 | for prot in proteins: 461 | grand_ave, weight, abd_qc, sn, f, T, N = diffacto_res[prot] 462 | if sn <= sn_threshold: 463 | mc_pvals.append(np.nan) 464 | preTermination.add(prot) 465 | continue 466 | if prot in preTermination: 467 | mc_pvals.append(T / N) 468 | continue 469 | for ix in monte_carlo_permutation(sampIx, batch_size): 470 | N += 1 471 | try: 472 | yhat = weighted_average(weight, abd_qc, ix) 473 | f_mc, _, _ = f_ANOVA(abd_qc, ix, yhat, grand_ave) 474 | except: 475 | f_mc = f 476 | if f_mc >= f: 477 | T += 1 478 | diffacto_res[prot][-1] = N # 1 + Total MC simulations performed 479 | diffacto_res[prot][-2] = T # 1 + MC simulations with better stats 480 | mc_pvals.append(T / N) 481 | if T >= terminate_t: 482 | preTermination.add(prot) 483 | 484 | mc_fdr = calc_q(mc_pvals) 485 | curr_prot = [proteins.index(p) for p in proteins if p not in preTermination] 486 | 487 | if ( 488 | len(curr_prot) == 0 489 | or max(mc_fdr[curr_prot]) < target_fdr 490 | or batch * batch_size >= max_mc 491 | ): 492 | print("Monte Carlo permutation test finished.") 493 | return zip(proteins, mc_pvals, mc_fdr) 494 | else: 495 | print( 496 | "%d times simulation, %d proteins remaining (FDR %.3f)" 497 | % (batch * batch_size, len(curr_prot), max(mc_fdr[curr_prot])) 498 | ) 499 | 500 | 501 | # ================================================= 502 | # Main 503 | # ================================================= 504 | def main(): 505 | import argparse 506 | import sys 507 | 508 | DEBUG = False 509 | SUMMARIZE_EACH_RUN = False 510 | TOPN = 3 511 | T_PQPQ = 0.4 512 | EXAMPLE = "HUMAN" 513 | 514 | MC_SIMULATION = True 515 | MC_MAX_N = 200000 516 | MC_BATCH_SIZE = 100 517 | MC_MAX_HIT = MC_MAX_N / 1000 518 | 519 | apars = argparse.ArgumentParser( 520 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 521 | ) 522 | 523 | apars.add_argument( 524 | "-i", 525 | required=True, 526 | nargs=1, 527 | help="""Peptides abundances in CSV format. 528 | The first row should contain names for all samples. 529 | The first column should contain unique peptide sequences. 530 | Missing values should be empty instead of zeros. 
531 | """, 532 | ) 533 | # The first column contains unique peptide sequences 534 | # Missing values should be empty instead of zeros 535 | 536 | apars.add_argument( 537 | "-db", 538 | nargs="?", 539 | help="""Protein database in FASTA format. 540 | If None, the peptide file must have protein ID(s) in the second column. 541 | """, 542 | ) 543 | 544 | apars.add_argument( 545 | "-samples", 546 | nargs="?", 547 | help="""File of the sample list. 548 | One run and its sample group per line, separated by tab. 549 | If None, read from peptide file headings, 550 | then each run will be summarized as a group. 551 | """, 552 | ) 553 | 554 | apars.add_argument( 555 | "-log2", 556 | default="False", 557 | help="Input abundances are in log scale (True) or linear scale (False)", 558 | ) 559 | 560 | apars.add_argument( 561 | "-normalize", 562 | choices=["average", "median", "GMM", "None"], 563 | default="None", 564 | help="Method for sample-wise normalization.", 565 | ) 566 | # Normalize input abundances (per sample) to zero-centered in log-scale 567 | # Valid methods include: 'average', 'median' or 'GMM' (two-component 568 | # Gaussian mixture model). If None (default), do not normalize. 569 | 570 | apars.add_argument("-farms_mu", type=float, default=0.1, help="Hyperparameter mu") 571 | # Hyperparameter mu of the FARMS algorithm: prior knowledge of the 572 | # expected loading. 573 | 574 | apars.add_argument( 575 | "-farms_alpha", 576 | type=float, 577 | default=0.1, 578 | help="Hyperparameter weight of prior probability", 579 | ) 580 | # Hyperparameter weight of the FARMS algorithm: weight of prior 581 | # probability in EM calculation. 582 | 583 | apars.add_argument( 584 | "-reference", 585 | default="average", 586 | help="Names of reference sample groups (separated by semicolon)", 587 | ) 588 | # If average (default) calculate average abundance as the reference. 589 | # Otherwise, keep peptide abundance values as is. 590 | 591 | apars.add_argument( 592 | "-min_samples", 593 | type=int, 594 | default=1, 595 | help="Minimum number of samples peptides needed to be quantified in", 596 | ) 597 | # Peptides quantified in less than the minimum number will be discarded 598 | 599 | apars.add_argument("-use_unique", default="False", help="Use unique peptides only") 600 | 601 | apars.add_argument( 602 | "-impute_threshold", 603 | type=float, 604 | default=0.99, 605 | help=( 606 | "Minimum fraction of missing values in the group. " 607 | "Impute missing values if missing fraction is larger than the threshold. " 608 | ), 609 | ) 610 | 611 | apars.add_argument( 612 | "-cutoff_weight", 613 | type=float, 614 | default=0.5, 615 | help="Peptides weighted lower than the cutoff will be excluded", 616 | ) 617 | 618 | apars.add_argument( 619 | "-fast", 620 | default="False", 621 | help="Allow early termination in EM calculation when noise is sufficiently small.", 622 | ) 623 | 624 | apars.add_argument( 625 | "-out", 626 | type=argparse.FileType("w"), 627 | default=sys.stdout, 628 | help="Path to output file (writing in TSV format).", 629 | ) 630 | 631 | apars.add_argument( 632 | "-mc_out", default=None, help="Path to MCFDR output (writing in TSV format)." 
633 | ) 634 | 635 | apars.add_argument('-loadings_out', default=None, 636 | help='File for peptide loadings (writing in TSV format).') 637 | # ------------------------------------------------ 638 | args = apars.parse_args() 639 | 640 | def boolparam(p): 641 | """ convert a string parameter to boolean value""" 642 | if str(p).lower() in ("yes", "true", "t", "y", "1"): 643 | return True 644 | else: 645 | return False 646 | 647 | args.log2 = boolparam(args.log2) 648 | args.fast = boolparam(args.fast) 649 | args.use_unique = boolparam(args.use_unique) 650 | print(args) 651 | diffacto_res = dict() 652 | df = pandas.read_csv(args.i[0], index_col=0) 653 | df.index = [i.upper().replace("I", "L") for i in df.index] 654 | print("Abundance matrix loaded: %d peptides" % len(df.index)) 655 | 656 | if not args.samples: 657 | # read sample names from header 658 | samples = df.columns.tolist() 659 | if args.db is None: 660 | samples.pop(0) 661 | groups = samples 662 | else: 663 | # read sample labels 664 | samples, groups = ([], []) 665 | with open(args.samples) as fh: 666 | for line in fh.readlines(): 667 | try: 668 | _s, _g = line.rstrip().split("\t") 669 | samples.append(_s) 670 | groups.append(_g) 671 | except ValueError: 672 | pass 673 | 674 | # per sample normalization of peptide abundances 675 | logInput = args.log2 676 | if not args.normalize == "None": 677 | df = zero_center_normalize( 678 | df, samples, logInput=logInput, method=args.normalize 679 | ) 680 | args.log2 = True 681 | 682 | # select reference runs if specified 683 | ref_samples = [] 684 | if args.reference: 685 | for r in args.reference.split(";"): 686 | for i in range(len(groups)): 687 | if groups[i] == r: 688 | ref_samples.append(i) 689 | ref_samples = [samples[i] for i in ref_samples] 690 | 691 | print("Number of runs: %d" % len(samples)) 692 | 693 | # sample grouping 694 | group_names = [ 695 | i 696 | for i in sorted(set(groups), key=lambda k: "{0:0>50}".format(k)) 697 | if i not in args.reference.split(";") 698 | ] 699 | if len(group_names) == len(samples): 700 | group_names = samples 701 | 702 | sampIx = np.array( 703 | [[j for j in range(len(groups)) if groups[j] == i] for i in group_names] 704 | ) 705 | global nGroups 706 | nGroups = len(group_names) 707 | print("Number of sample groups: %d" % nGroups) 708 | print("Reference runs (%d): " % len(ref_samples), *ref_samples, sep="\t") 709 | 710 | # protein grouping 711 | pg = protein_grouping(df, args.db) 712 | print("Number of protein groups: %d" % len(pg.keys())) 713 | 714 | # coverage filtering 715 | df = df[ 716 | [ 717 | np.count_nonzero(np.nan_to_num(v)) >= args.min_samples 718 | for v in df[samples].values 719 | ] 720 | ] 721 | 722 | # reversed mapping (peptide to protein group) for checking peptide uniqueness. 723 | pep2prot = defaultdict(list) 724 | for prot_ids, bseqs in pg.items(): 725 | for s in bseqs: 726 | pep2prot[s] += prot_ids.split() 727 | 728 | # use unique peptides 729 | if args.use_unique: 730 | df = df[[len(pep2prot[p]) == 1 for p in df.index]] 731 | 732 | # Check that we don't have any peptides with a single non-missing value. 733 | # These tend to break diffacto, because in fast_farms we end up with a covariance matrix of less than full rank. Which the algorithm is not set up to handle. 
734 | nonZeroNonMissing = np.vectorize( 735 | lambda x: ~np.isnan(x) and x != 0, otypes=[np.bool_] 736 | ) 737 | if df.shape[0] > 0: 738 | for prot in sorted(pg.keys()): 739 | if prot == "nan": 740 | continue 741 | if DEBUG and EXAMPLE not in prot: 742 | continue 743 | # =====----=====-----=====-----===== 744 | peps = pg[prot] # constituent peptides 745 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe 746 | pep_count = len(dx) # number of peptides 747 | pep_abd = dx[samples].values 748 | counts = np.sum(nonZeroNonMissing(pep_abd), axis=1) 749 | if any(counts < 2): 750 | print( 751 | "Protein {} contained peptides with fewer than two non-missing or non-zero values. Please remove these peptides".format( 752 | prot 753 | ) 754 | ) 755 | return 756 | 757 | if args.loadings_out is not None: 758 | loadings_out_file = open(args.loadings_out, 'w') 759 | # ------------------------------------------------------------------------- 760 | # perform differential analysis 761 | output_header = ["Protein", "N.Pept", "Q.Pept", "S/N", "P(PECA)"] 762 | output_header += group_names 763 | if SUMMARIZE_EACH_RUN: 764 | output_header += ["P(Top-%d)" % TOPN, "P(Median)", "P(PQPQ)"] 765 | output_header += ["Top-%d_%s" % (TOPN, s) for s in samples] 766 | output_header += ["Median_%s" % s for s in samples] 767 | output_header += ["PQPQ_%s" % s for s in samples] 768 | 769 | print(*output_header, sep="\t", file=args.out) 770 | for prot in sorted(pg.keys()): 771 | if prot == "nan": 772 | continue 773 | if DEBUG and EXAMPLE not in prot: 774 | continue 775 | # =====----=====-----=====-----===== 776 | peps = pg[prot] # constituent peptides 777 | dx = df.loc[[p for p in sorted(peps) if p in df.index]] # dataframe 778 | pep_count = len(dx) # number of peptides 779 | pep_abd = dx[samples].values 780 | 781 | if len(ref_samples): # rescale peptide abundances by reference runs 782 | reference_abundance = ( 783 | dx[ref_samples].mean(axis=1).fillna(np.nanmean(dx[samples])).values 784 | ) 785 | elif args.reference.lower() == "average": # rescale by average values 786 | reference_abundance = dx[samples].mean(axis=1).values 787 | else: 788 | if not args.log2: 789 | reference_abundance = 1.0 790 | else: 791 | reference_abundance = 0 792 | 793 | if not args.log2: 794 | pep_abd = np.log2(pep_abd) 795 | reference_abundance = np.log2(reference_abundance) 796 | 797 | pep_abd = (pep_abd.T - reference_abundance).T 798 | 799 | if pep_count == 1: 800 | # single peptide group 801 | loading = array([1 for _ in dx.index]) 802 | noise = 1.0 803 | continue 804 | # do not report 805 | elif pep_count > 1: 806 | loading, noise = fast_farms( 807 | pep_abd, 808 | mu=args.farms_mu, 809 | weight=args.farms_alpha, 810 | max_iter=1000, 811 | force_iter=not args.fast, 812 | ) 813 | else: 814 | continue 815 | 816 | if noise < 1: 817 | sn = 10 * np.log10((1 - noise) / noise) 818 | else: 819 | # fix log(0) issue 820 | sn = -np.inf 821 | 822 | if args.loadings_out is not None: 823 | for pep, pepLoading in zip(peps, loading): 824 | print(prot, pep, pepLoading, sep="\t", file = loadings_out_file) 825 | 826 | qc = loading > args.cutoff_weight 827 | abd_qc = mv_impute( 828 | pep_abd[qc], 829 | sampIx, 830 | least_missing=args.impute_threshold, 831 | impute_as=np.nanmin(pep_abd) - 1, 832 | ) 833 | protein_summary_group = weighted_average(loading[qc], abd_qc, sampIx) 834 | 835 | if SUMMARIZE_EACH_RUN: 836 | with warnings.catch_warnings(): 837 | warnings.simplefilter("ignore", category=RuntimeWarning) 838 | # Top-N averaging 839 | v = 
dx[samples].values 840 | if logInput: 841 | v = 2 ** v 842 | protein_summary_topn = np.array( 843 | [ 844 | np.mean(np.sort(v[:, i][isfinite(v[:, i])])[-TOPN:]) 845 | for i in range(len(samples)) 846 | ] 847 | ) 848 | p_ave = stats.f_oneway( 849 | *[ 850 | protein_summary_topn[s][isfinite(protein_summary_topn[s])] 851 | for s in sampIx 852 | ] 853 | )[1] 854 | 855 | # Median 856 | v = dx[samples].values 857 | protein_summary_median = np.nanmedian(v, axis=0) 858 | p_med = stats.f_oneway( 859 | *[ 860 | protein_summary_median[s][isfinite(protein_summary_median[s])] 861 | for s in sampIx 862 | ] 863 | )[1] 864 | 865 | # PQPQ clustering and averaging 866 | v = np.nan_to_num(pep_abd) 867 | clusters = pqpq(v, t=T_PQPQ) 868 | major = sorted( 869 | [(len(clusters[clusters == i]), i) for i in set(clusters.tolist())] 870 | )[-1] 871 | if major[0] >= 2: 872 | clusters[clusters != major[1]] = 0 873 | clusters[clusters != 0] = 1 874 | else: 875 | clusters = np.ones(*clusters.shape) 876 | protein_summary_pqpq = np.nanmean( 877 | dx[samples].values[clusters > 0], axis=0 878 | ) 879 | p_pqpq = stats.f_oneway( 880 | *[ 881 | protein_summary_pqpq[s][isfinite(protein_summary_pqpq[s])] 882 | for s in sampIx 883 | ] 884 | )[1] 885 | 886 | # ================================================================ 887 | # PECA: grouping peptide-level p-values based on beta distribution 888 | # https://www.bioconductor.org/packages/release/bioc/html/PECA.html 889 | """ 890 | Calculate Probe-level Expression Change Averages (PECA) 891 | to identify differential expression in Affymetrix gene expression 892 | microarray studies or in proteomic studies using peptide-level 893 | mesurements respectively. 894 | """ 895 | pep_pvals = [] 896 | for pep_v in abd_qc: 897 | with warnings.catch_warnings(): 898 | warnings.simplefilter("ignore", category=RuntimeWarning) 899 | ave_0 = np.broadcast_to(np.nanmean(pep_v), (nGroups, 1)) 900 | ave_1 = np.array([np.nanmean(pep_v[i]) for i in sampIx]) 901 | try: 902 | f, d1, d2 = f_ANOVA(pep_v[None, ...], sampIx, ave_1, ave_0) 903 | pv = stats.f.sf(f, d1, d2) 904 | pep_pvals.append(pv) 905 | except: 906 | pass 907 | 908 | pep_pvals = np.array(pep_pvals) 909 | pep_pvals = pep_pvals[isfinite(pep_pvals)] 910 | beta_ab = len(pep_pvals) / 2 + 0.5 911 | if len(pep_pvals) > 0: 912 | p_peca = stats.beta.cdf(np.median(pep_pvals), beta_ab, beta_ab) 913 | else: 914 | p_peca = np.nan 915 | 916 | if MC_SIMULATION: 917 | grand_ave = np.broadcast_to(np.nanmean(abd_qc), (nGroups, 1)) 918 | f, _, _ = f_ANOVA(abd_qc, sampIx, protein_summary_group, grand_ave) 919 | diffacto_res[prot] = [grand_ave, loading[qc], abd_qc, sn, f, 1, 1] 920 | 921 | # ============================= 922 | if not logInput: 923 | protein_summary_group = 2 ** protein_summary_group 924 | 925 | output_row = [prot, pep_count, sum(qc), sn, p_peca] + list( 926 | protein_summary_group 927 | ) 928 | 929 | if SUMMARIZE_EACH_RUN: 930 | output_row += ( 931 | [p_ave, p_med, p_pqpq] 932 | + list(protein_summary_topn) 933 | + list(protein_summary_median) 934 | + list(protein_summary_pqpq) 935 | ) 936 | 937 | print(*output_row, sep="\t", file=args.out) 938 | 939 | if MC_SIMULATION and args.mc_out: 940 | try: 941 | mc_out = open(args.mc_out, "w") 942 | except: 943 | print("Cannot open file: ", args.mc_out, ". 
Use stdout instead.") 944 | mc_out = sys.stdout 945 | 946 | print("Protein", "P(MC)", "MCFDR", sep="\t", file=mc_out) 947 | mc_result = perform_mcfdr( 948 | diffacto_res, 949 | sampIx, 950 | max_mc=MC_MAX_N, 951 | batch_size=MC_BATCH_SIZE, 952 | terminate_t=MC_MAX_HIT, 953 | target_fdr=0.05, 954 | ) 955 | 956 | for prot, p, q in mc_result: 957 | print(prot, p, q, sep="\t", file=mc_out) 958 | 959 | 960 | if __name__ == "__main__": 961 | main() 962 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: diffacto_35 2 | channels: 3 | - defaults 4 | dependencies: 5 | - ca-certificates=2017.08.26=ha1e5d58_0 6 | - certifi=2017.11.5=py35hd00889a_0 7 | - decorator=4.1.2=py35hf37c5b3_0 8 | - intel-openmp=2018.0.0=h8158457_8 9 | - libcxx=4.0.1=h579ed51_0 10 | - libcxxabi=4.0.1=hebd6815_0 11 | - libedit=3.1=hb4e282d_0 12 | - libffi=3.2.1=h475c297_4 13 | - libgfortran=3.0.1=h93005f0_2 14 | - mkl=2018.0.1=hfbd8650_4 15 | - ncurses=6.0=hd04f020_2 16 | - networkx=2.0=py35hb193ae4_0 17 | - numpy=1.14.0=py35h8a80b8c_0 18 | - openssl=1.0.2n=hdbc3d79_0 19 | - pandas=0.22.0=py35h0a44026_0 20 | - pip=9.0.1=py35h33ce766_4 21 | - python=3.5.4=he720263_23 22 | - python-dateutil=2.6.1=py35h10515e0_1 23 | - pytz=2017.3=py35heeb7564_0 24 | - readline=7.0=hc1231fa_4 25 | - scikit-learn=0.19.1=py35h2b554eb_0 26 | - scipy=1.0.0=py35h8b35106_0 27 | - setuptools=36.5.0=py35h52cde6a_0 28 | - six=1.11.0=py35h39a4c60_1 29 | - sqlite=3.20.1=h7e4c145_2 30 | - tk=8.6.7=h35a86e2_3 31 | - wheel=0.30.0=py35h5c0b906_1 32 | - xz=5.2.3=h0278029_2 33 | - zlib=1.2.11=hf3cbc9b_2 34 | - pip: 35 | - altgraph==0.15 36 | - future==0.16.0 37 | - macholib==1.9 38 | - pefile==2017.11.5 39 | - pyinstaller==3.3.1 40 | - pyteomics==3.4.2 41 | -------------------------------------------------------------------------------- /example/HBY20Mix.samples.lst: -------------------------------------------------------------------------------- 1 | 20160112_P1_SEG_MID P1 2 | 20160112_P1_SEG_LOW P1 3 | 20160112_P1_SEG_HIGH P1 4 | 20160112_P2_SEG_LOW P2 5 | 20160112_P2_SEG_MID P2 6 | 20160112_P2_SEG_HIGH P2 7 | 20160112_P3_SEG_HIGH P3 8 | 20160112_P3_SEG_LOW P3 9 | 20160112_P3_SEG_MID P3 10 | 20160112_P4_SEG_HIGH P4 11 | 20160112_P4_SEG_LOW P4 12 | 20160112_P4_SEG_MID P4 13 | 20160112_P5_SEG_HIGH P5 14 | 20160112_P5_SEG_LOW_160121063813 P5 15 | 20160112_P5_SEG_MID P5 16 | 20160112_P6_SEG_HIGH P6 17 | 20160112_P6_SEG_LOW P6 18 | 20160112_P6_SEG_MID P6 19 | 20160112_P7_SEG_HIGH P7 20 | 20160112_P7_SEG_LOW P7 21 | 20160112_P7_SEG_MID P7 22 | 20160112_P8_SEG_HIGH P8 23 | 20160112_P8_SEG_LOW P8 24 | 20160112_P8_SEG_MID_160121160232 P8 25 | 20160112_P9_SEG_MID P9 26 | 20160112_P9_SEG_HIGH P9 27 | 20160112_P9_SEG_LOW_160121012404 P9 28 | 20160112_P10_SEG_HIGH P10 29 | 20160112_P10_SEG_LOW_160120200540 P10 30 | 20160112_P10_SEG_MID P10 31 | 20160112_P11_SEG_LOW_160203031257 P11 32 | 20160112_P11_SEG_MID_160203050927 P11 33 | 20160112_P11_SEG_HIGH_160203070611 P11 34 | 20160112_P11_SEG_HIGH REF 35 | 20160112_P11_SEG_LOW REF 36 | 20160112_P11_SEG_MID REF 37 | 20160112_P12_SEG_LOW P12 38 | 20160112_P12_SEG_MID P12 39 | 20160112_P12_SEG_HIGH P12 40 | 20160112_P13_SEG_HIGH P13 41 | 20160112_P13_SEG_LOW P13 42 | 20160112_P13_SEG_MID P13 43 | 20160112_P14_SEG_HIGH P14 44 | 20160112_P14_SEG_MID P14 45 | 20160112_P14_SEG_LOW_160120174525 P14 46 | 20160112_P15_SEG_LOW P15 47 | 20160112_P15_SEG_MID P15 48 | 20160112_P15_SEG_HIGH_160120220930 P15 
49 | 20160112_P16_SEG_LOW P16
50 | 20160112_P16_SEG_MID P16
51 | 20160112_P16_SEG_HIGH_160121181003 P16
52 | 20160112_P17_SEG_HIGH P17
53 | 20160112_P17_SEG_LOW P17
54 | 20160112_P17_SEG_MID P17
55 | 20160112_P18_SEG_LOW P18
56 | 20160112_P18_SEG_MID P18
57 | 20160112_P18_SEG_HIGH P18
58 | 20160112_P19_SEG_MID_160121112852 P19
59 | 20160112_P19_SEG_HIGH P19
60 | 20160112_P19_SEG_LOW P19
61 | 20160112_P20_SEG_MID P20
62 | 20160112_P20_SEG_LOW P20
63 | 20160112_P20_SEG_HIGH_160121032454 P20
--------------------------------------------------------------------------------
/example/iPRG.samples.lst:
--------------------------------------------------------------------------------
1 | JD_06232014_sample1-A S1
2 | JD_06232014_sample1_B S1
3 | JD_06232014_sample1_C S1
4 | JD_06232014_sample2_A S2
5 | JD_06232014_sample2_B S2
6 | JD_06232014_sample2_C S2
7 | JD_06232014_sample3_A S3
8 | JD_06232014_sample3_B S3
9 | JD_06232014_sample3_C S3
10 | JD_06232014_sample4-A S4
11 | JD_06232014_sample4_B S4
12 | JD_06232014_sample4_C S4
--------------------------------------------------------------------------------
/example/readme.md:
--------------------------------------------------------------------------------
1 | ## Diffacto: Examples
2 | ----
3 |
4 | To run these examples, clone this git repository and descend to the example directory.
5 | To install dependencies run
6 |     pip install pyteomics numpy pandas networkx scikit-learn scipy
7 |
8 | #### Print usage information
9 |     python ../run_diffacto.py -h
10 |
11 |
12 | ---
13 | #### Example-1:
14 |
15 |
16 |     python ../run_diffacto.py -i iPRG.novo.pep.csv -samples iPRG.samples.lst -out iPRG.denovo.protein.txt -mc_out iPRG.denovo.protein.FDR -min_samples 4 -impute_threshold 0.9 -use_unique True -log2 False
17 |
18 |
19 | #
20 |
21 | * input-1, peptide abundances: _iPRG.novo.pep.csv_
22 | * input-2, sample list: _iPRG.samples.lst_
23 | * output-1, protein quantification: _iPRG.denovo.protein.txt_
24 | * output-2, FDR estimation by MC tests: _iPRG.denovo.protein.FDR_
25 | * other parameters:
26 |     -min_samples 4 (peptides must be quantified in at least four runs)
27 |     -impute_threshold 0.9 (threshold for missing value imputation 90%)
28 |     -use_unique True (only use unique peptides for quantification)
29 |     -log2 False (input abundances are not in log scale)
30 |
31 |
32 | ---
33 | #### Example-2:
34 |
35 |
36 |     python ../run_diffacto.py -i HBY20Mix.peptides.csv -samples HBY20Mix.samples.lst -db UP000002311_559292.fasta -out HBY20Mix.protein.txt -min_samples 30 -impute_threshold 0.7 -log2 False -reference REF
37 |
38 |
39 | #
40 |
41 | * input-1, peptide abundances: _HBY20Mix.peptides.csv_
42 | * input-2, sample list: _HBY20Mix.samples.lst_
43 | * input-3, protein database: _UP000002311_559292.fasta_
44 | * output-1, protein quantification: _HBY20Mix.protein.txt_
45 | * other parameters:
46 |     -min_samples 30 (peptides must be quantified in at least 30 runs)
47 |     -impute_threshold 0.7 (threshold for missing value imputation 70%)
48 |     -log2 False (input abundances are not in log scale)
49 |     -reference REF (use the runs labeled 'REF' as the internal reference)
--------------------------------------------------------------------------------
/run_diffacto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """Convenience wrapper for running diffacto directly from the source tree."""
6 |
7 |
8 | from diffacto.diffacto import main
9 |
10 |
11 | if __name__ == '__main__':
12 |     main()
13 |
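For readers who want to use the summarization step as a library instead of through the wrapper script above, a minimal sketch with hypothetical toy data (it assumes only that diffacto is installed, so that diffacto.diffacto is importable):

    import numpy as np
    from diffacto.diffacto import fast_farms

    # Toy data: 4 peptides x 6 samples of log2 abundances sharing one common
    # signal, centered per peptide as diffacto does before summarization.
    np.random.seed(0)
    signal = np.random.normal(size=6)
    pep_abd = signal + np.random.normal(scale=0.1, size=(4, 6))
    pep_abd = pep_abd - pep_abd.mean(axis=1, keepdims=True)

    # weight and mu correspond to the -farms_alpha and -farms_mu CLI options.
    weights, noise = fast_farms(pep_abd, weight=0.1, mu=0.1, max_iter=1000)
    print("peptide weights:", np.round(weights, 3))  # rescaled to [0, 1]
    print("estimated noise:", round(float(noise), 3))

In the full pipeline, main() additionally rescales abundances against the reference, drops peptides whose weight falls below -cutoff_weight, and passes the survivors to weighted_average to obtain per-group protein abundances.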
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | python-tag = py36 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """setup.py: setuptools control for diffacto.""" 2 | 3 | import re 4 | from setuptools import setup, find_packages 5 | 6 | version = re.search( 7 | '^__version__\s*=\s*"(.*)"', 8 | open('diffacto/diffacto.py').read(), 9 | re.M 10 | ).group(1) 11 | 12 | from codecs import open 13 | from os import path 14 | 15 | here = path.abspath(path.dirname(__file__)) 16 | 17 | with open(path.join(here, 'README.rst'), encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | 21 | setup( 22 | # This is the name of your project. The first time you publish this 23 | # package, this name will be registered for you. It will determine how 24 | # users can install this project, e.g.: 25 | # 26 | # $ pip install diffacto 27 | # 28 | 29 | name='diffacto', # Required 30 | version=version, # Required 31 | packages = ["diffacto"], 32 | entry_points = { 33 | "console_scripts": ['diffacto = diffacto.diffacto:main'] 34 | }, 35 | 36 | description='A protein summarization method for shotgun proteomics experiments', # Required 37 | long_description=long_description, # Optional 38 | url='https://github.com/statisticalbiotechnology/diffacto', # Optional 39 | author='Bo Zhang, Lukas Käll, KTH', # Optional 40 | author_email='lukas.kall@scilifelab.se', # Optional 41 | maintainer='Lukas Käll, KTH', # Optional 42 | maintainer_email='lukas.kall@scilifelab.se', # Optional 43 | license='Apache', 44 | # Classifiers help users find your project by categorizing it. 45 | # 46 | # For a list of valid classifiers, see 47 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 48 | classifiers=[ # Optional 49 | # How mature is this project? Common values are 50 | # 3 - Alpha 51 | # 4 - Beta 52 | # 5 - Production/Stable 53 | 'Development Status :: 3 - Alpha', 54 | 55 | # Indicate who your project is intended for 56 | 'Intended Audience :: Developers', 57 | 'Topic :: Software Development :: Build Tools', 58 | 59 | # Pick your license as you wish 60 | 'License :: OSI Approved :: Apache Software License', 61 | 62 | # Specify the Python versions you support here. In particular, ensure 63 | # that you indicate whether you support Python 2, Python 3 or both. 64 | 'Programming Language :: Python :: 3', 65 | 'Programming Language :: Python :: 3.4', 66 | 'Programming Language :: Python :: 3.5', 67 | 'Programming Language :: Python :: 3.6', 68 | ], 69 | 70 | # This field adds keywords for your project which will appear on the 71 | # project page. What does your project relate to? 72 | # 73 | # Note that this is a string of words separated by whitespace, not a list. 74 | # keywords='sample setuptools development', # Optional 75 | 76 | # You can just specify package directories manually here if your project is 77 | # simple. Or you can use find_packages(). 
78 | # 79 | # Alternatively, if you just want to distribute a single Python file, use 80 | # the `py_modules` argument instead as follows, which will expect a file 81 | # called `my_module.py` to exist: 82 | # 83 | # py_modules=["my_module"], 84 | # 85 | # packages=find_packages(exclude=['contrib', 'docs', 'tests']), # Required 86 | 87 | # This field lists other packages that your project depends on to run. 88 | # Any package you put here will be installed by pip when your project is 89 | # installed, so they must be valid existing projects. 90 | # 91 | # For an analysis of "install_requires" vs pip's requirements files see: 92 | # https://packaging.python.org/en/latest/requirements.html 93 | install_requires=[ 94 | 'numpy>=1.10', 95 | 'scipy>=0.17', 96 | 'pandas>=0.18', 97 | 'networkx>=1.10', 98 | 'scikit-learn>=0.17', 99 | 'pyteomics>=3.3', 100 | 'Cython>=0.26'], 101 | 102 | # List additional groups of dependencies here (e.g. development 103 | # dependencies). Users will be able to install these using the "extras" 104 | # syntax, for example: 105 | # 106 | # $ pip install sampleproject[dev] 107 | # 108 | # Similar to `install_requires` above, these must be valid existing 109 | # projects. 110 | #extras_require={ # Optional 111 | # 'dev': ['check-manifest'], 112 | # 'test': ['coverage'], 113 | #}, 114 | 115 | # If there are data files included in your packages that need to be 116 | # installed, specify them here. 117 | # 118 | # If using Python 2.6 or earlier, then these have to be included in 119 | # MANIFEST.in as well. 120 | #package_data={ # Optional 121 | # 'sample': ['package_data.dat'], 122 | #}, 123 | 124 | # Although 'package_data' is the preferred approach, in some case you may 125 | # need to place data files outside of your packages. See: 126 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files 127 | # 128 | # In this case, 'data_file' will be installed into '/my_data' 129 | #data_files=[('my_data', ['data/data_file'])], # Optional 130 | 131 | ) 132 | --------------------------------------------------------------------------------
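As a companion to the -i and -samples format descriptions in the README, a minimal sketch of generating inputs that diffacto can read (the file and protein names here are hypothetical):

    import numpy as np
    import pandas as pd

    # Peptide table: unique peptide sequences in the first column, protein
    # ID(s) in the second column (needed only when no -db FASTA is given),
    # then one column per run. Missing values are empty cells, not zeros.
    table = pd.DataFrame(
        {
            "protein": ["PROT1", "PROT1", "PROT2"],
            "run1": [1200.0, 880.0, np.nan],
            "run2": [1100.0, 910.0, 430.0],
            "run3": [2400.0, np.nan, 450.0],
        },
        index=["AAGLPTK", "LSDEEK", "VVGLSTLPELYEK"],
    )
    table.to_csv("toy.peptides.csv")  # NaN is written as an empty cell

    # Sample list: one run and its sample group per line, tab-separated.
    with open("toy.samples.lst", "w") as fh:
        for run, group in [("run1", "S1"), ("run2", "S1"), ("run3", "S2")]:
            fh.write("{}\t{}\n".format(run, group))

Afterwards, something like `diffacto -i toy.peptides.csv -samples toy.samples.lst -out toy.protein.txt` runs the analysis; note that every peptide should be quantified in at least two runs, since singly observed peptides break the factor analysis (see the corresponding check in main()).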