├── vectorized_metrics
│   ├── __init__.py
│   ├── LICENSE
│   ├── logger.py
│   ├── parsers.py
│   └── vectorized_metrics.py
├── .gitignore
├── requirements.txt
├── demo-data
│   └── results
│       ├── disorder_pdb.analysis.all.dataset.f1s.cmat.csv
│       ├── disorder_pdb.analysis.all.dataset.default.cmat.csv
│       ├── disorder_pdb.analysis.AIUPred.thr.txt
│       ├── disorder_pdb.analysis.LIPNet.thr.txt
│       ├── disorder_pdb.analysis.AlphaFold-rsa.thr.txt
│       ├── disorder_pdb.analysis.all.dataset.default.metrics.csv
│       ├── disorder_pdb.analysis.all.dataset.f1s.metrics.csv
│       ├── disorder_pdb.analysis.all.ci.default.metrics.csv
│       ├── disorder_pdb.analysis.all.ci.f1s.metrics.csv
│       ├── AlphaFold-rsa.thresholds.distribution.txt
│       ├── LIPNet.thresholds.distribution.txt
│       ├── AIUPred.thresholds.distribution.txt
│       ├── disorder_pdb.analysis.all.bootstrap.default.metrics.csv
│       ├── disorder_pdb.analysis.all.bootstrap.f1s.metrics.csv
│       └── disorder_pdb.analysis.all.dataset._.roc.csv
├── caid.py
└── README.md

--------------------------------------------------------------------------------
/vectorized_metrics/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
vectorized_metrics/__pycache__

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==2.3.2
pandas==2.3.1
scipy==1.16.1
tqdm==4.66.2

--------------------------------------------------------------------------------
/vectorized_metrics/LICENSE:
--------------------------------------------------------------------------------
This software is licensed under the license:
Attribution 4.0 International (CC BY 4.0)

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.all.dataset.f1s.cmat.csv:
--------------------------------------------------------------------------------
,fn,fp,tn,tp
LIPNet,145,67758,80,31256
AlphaFold-rsa,5540,3080,64758,25861
AIUPred,9188,4333,63419,22178

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.all.dataset.default.cmat.csv:
--------------------------------------------------------------------------------
,fn,fp,tn,tp
LIPNet,15299,31098,36740,16102
AlphaFold-rsa,4367,4996,62842,27034
AIUPred,5474,12679,55073,25892

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.AIUPred.thr.txt:
--------------------------------------------------------------------------------
default 0.5
npv 0.001
ppv 0.997
tpr 0.001
tnr 0.997
fpr 0.0
fnr 0.997
fom 0.0
csi 0.664
bac 0.585
f1s 0.665
f2s 0.434
f05 0.756
mcc 0.721
inf 0.589
mk 0.809

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.LIPNet.thr.txt:
--------------------------------------------------------------------------------
default 0.5
npv 0.005
ppv 0.998
tpr 0.008
tnr 0.999
fpr 0.009
fnr 0.999
fom 0.004
csi 0.012
bac 0.804
f1s 0.014
f2s 0.008
f05 0.637
mcc 0.897
inf 0.797
mk 0.998

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.AlphaFold-rsa.thr.txt:
--------------------------------------------------------------------------------
default 0.5
npv 0.096
ppv 0.971
tpr 0.155
tnr 0.971
fpr 0.022
fnr 0.971
fom 0.003
csi 0.543
bac 0.515
f1s 0.551
f2s 0.448
f05 0.606
mcc 0.546
inf 0.514
mk 0.606

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.all.dataset.default.metrics.csv:
--------------------------------------------------------------------------------
,bac,csi,f05,f1s,f2s,fnr,fom,fpr,inf,mcc,mk,npv,ppv,tnr,tpr,aucroc,aucpr,aps,thr
LIPNet,0.527,0.258,0.366,0.41,0.466,0.487,0.294,0.458,0.055,0.051,0.047,0.706,0.341,0.542,0.513,0.526,0.363,0.363,0.5
AlphaFold-rsa,0.894,0.743,0.847,0.852,0.858,0.139,0.065,0.074,0.787,0.783,0.779,0.935,0.844,0.926,0.861,0.95,0.921,0.921,0.5
AIUPred,0.819,0.588,0.697,0.74,0.789,0.175,0.09,0.187,0.638,0.609,0.581,0.91,0.671,0.813,0.825,0.899,0.86,0.859,0.5

--------------------------------------------------------------------------------
/demo-data/results/disorder_pdb.analysis.all.dataset.f1s.metrics.csv:
--------------------------------------------------------------------------------
,bac,csi,f05,f1s,f2s,fnr,fom,fpr,inf,mcc,mk,npv,ppv,tnr,tpr,aucroc,aucpr,aps,thr
LIPNet,0.498,0.315,0.366,0.48,0.696,0.005,0.644,0.999,-0.004,-0.034,-0.328,0.356,0.316,0.001,0.995,0.526,0.363,0.363,0.014
AlphaFold-rsa,0.889,0.75,0.879,0.858,0.837,0.176,0.079,0.045,0.779,0.796,0.815,0.921,0.894,0.955,0.824,0.95,0.921,0.921,0.551
AIUPred,0.822,0.621,0.807,0.767,0.73,0.293,0.127,0.064,0.643,0.676,0.71,0.873,0.837,0.936,0.707,0.899,0.86,0.859,0.665

--------------------------------------------------------------------------------
/vectorized_metrics/logger.py:
--------------------------------------------------------------------------------
import logging


def set_logger(logfile, level):
    handlers = list()
    log_formatter = logging.Formatter('%(asctime)s | %(module)-13s | %(levelname)-8s | %(message)s')

    if logfile:
        file_handler = logging.FileHandler(logfile, 'a')
        file_handler.setFormatter(log_formatter)
        handlers.append(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(log_formatter)
        handlers.append(console_handler)

    # the handlers above already carry log_formatter; basicConfig's `format`
    # argument expects a format string, not a Formatter object, so only the
    # level and handlers are forwarded here
    logging.basicConfig(level=level, handlers=handlers)

--------------------------------------------------------------------------------
/caid.py:
--------------------------------------------------------------------------------
import argparse
import logging
import os
import sys
from pathlib import Path

from vectorized_metrics.vectorized_metrics import bvaluation


def parse_args():
    parser = argparse.ArgumentParser(
        prog='caid-assess', description="CAID: Critical Assessment of Intrinsic Disorder",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('reference_file', help='Path to the reference file')

    parser.add_argument('predictions', help="directory containing prediction file(s)")

    parser.add_argument('-o', '--outputDir', default='.',
                        help='directory where the output will be written (default: cwd)')

    parser.add_argument('-b', '--labels', default=None, help='filename with labels')

    parser.add_argument('-l', '--log',
type=str, default=None, help='log file') 25 | parser.add_argument("-ll", "--logLevel", default="WARNING", 26 | choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 27 | help='log level filter. All levels >= choice will be displayed') 28 | 29 | args = parser.parse_args() 30 | return args 31 | 32 | 33 | def set_logger(logfile, level): 34 | logging.basicConfig(level=level, 35 | format='%(asctime)s | %(module)-13s | %(levelname)-8s | %(message)s', 36 | stream=open(logfile) if logfile else sys.stderr) 37 | 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | set_logger(args.log, args.logLevel) 42 | pred_paths = list(Path(args.predictions).glob('*.caid')) 43 | reference_path = Path(args.reference_file) 44 | 45 | os.makedirs(args.outputDir, exist_ok=True) 46 | 47 | bvaluation(reference_path, pred_paths, outpath=args.outputDir, dataset=True, bootstrap=True, target=True, 48 | accs_to_read=None) 49 | 50 | print('Done') 51 | -------------------------------------------------------------------------------- /demo-data/results/disorder_pdb.analysis.all.ci.default.metrics.csv: -------------------------------------------------------------------------------- 1 | ,,lo,hi,thr 2 | LIPNet,bac,0.5267194884153058,0.5273805115846942,0.5 3 | LIPNet,csi,0.25732749633557217,0.2580725036644279,0.5 4 | LIPNet,f05,0.3651216654564515,0.36609833454354845,0.5 5 | LIPNet,f1s,0.4093417828914774,0.4102782171085225,0.5 6 | LIPNet,f2s,0.4655203645079607,0.4664996354920394,0.5 7 | LIPNet,fnr,0.48648032378677747,0.48759967621322237,0.5 8 | LIPNet,fom,0.2937700180086772,0.2945899819913229,0.5 9 | LIPNet,fpr,0.4584495179794743,0.45921048202052583,0.5 10 | LIPNet,inf,0.053469485274176894,0.054790514725823115,0.5 11 | LIPNet,mcc,0.04983171277812712,0.05104828722187291,0.5 12 | LIPNet,mk,0.04644155690027707,0.04761844309972292,0.5 13 | LIPNet,npv,0.7054100180086771,0.7062299819913227,0.5 14 | LIPNet,ppv,0.3407197153282168,0.3417002846717833,0.5 15 | LIPNet,tnr,0.5407895179794742,0.5415504820205257,0.5 16 | LIPNet,tpr,0.5124003237867777,0.5135196762132225,0.5 17 | AlphaFold-rsa,bac,0.8934839359355656,0.8939560640644341,0.5 18 | AlphaFold-rsa,csi,0.7424773777428213,0.7434426222571786,0.5 19 | AlphaFold-rsa,f05,0.8471872259611388,0.847952774038861,0.5 20 | AlphaFold-rsa,f1s,0.8520403179936806,0.8526796820063192,0.5 21 | AlphaFold-rsa,f2s,0.8572717360737041,0.8579482639262954,0.5 22 | AlphaFold-rsa,fnr,0.13863115163123294,0.1393888483687671,0.5 23 | AlphaFold-rsa,fom,0.06476479486475437,0.06515520513524567,0.5 24 | AlphaFold-rsa,fpr,0.07338109300461075,0.07383890699538924,0.5 25 | AlphaFold-rsa,inf,0.786909170527419,0.7878508294725813,0.5 26 | AlphaFold-rsa,mcc,0.7827897650824049,0.7837102349175955,0.5 27 | AlphaFold-rsa,mk,0.7787400722706265,0.7797199277293735,0.5 28 | AlphaFold-rsa,npv,0.9348447948647546,0.9352352051352458,0.5 29 | AlphaFold-rsa,ppv,0.8437543331710258,0.8446256668289738,0.5 30 | AlphaFold-rsa,tnr,0.926161093004611,0.9266189069953895,0.5 31 | AlphaFold-rsa,tpr,0.8606111516312329,0.8613688483687671,0.5 32 | AIUPred,bac,0.8185144868652662,0.8191255131347334,0.5 33 | AIUPred,csi,0.5868739555345569,0.5879260444654429,0.5 34 | AIUPred,f05,0.6964118866537208,0.6974281133462787,0.5 35 | AIUPred,f1s,0.7395868179700418,0.7404531820299587,0.5 36 | AIUPred,f2s,0.7883873026375158,0.7892326973624845,0.5 37 | AIUPred,fnr,0.1744670030536258,0.17543299694637413,0.5 38 | AIUPred,fom,0.09029097616643904,0.09080902383356092,0.5 39 | AIUPred,fpr,0.18696207713014315,0.1875579228698568,0.5 40 | 
AIUPred,inf,0.6371942777180928,0.6383857222819072,0.5 41 | AIUPred,mcc,0.6078197758409971,0.6090002241590029,0.5 42 | AIUPred,mk,0.5797337423938208,0.5809462576061791,0.5 43 | AIUPred,npv,0.9091909761664392,0.909709023833561,0.5 44 | AIUPred,ppv,0.6703389324729924,0.6714410675270078,0.5 45 | AIUPred,tnr,0.8124420771301432,0.8130379228698569,0.5 46 | AIUPred,tpr,0.8245670030536257,0.8255329969463739,0.5 47 | -------------------------------------------------------------------------------- /demo-data/results/disorder_pdb.analysis.all.ci.f1s.metrics.csv: -------------------------------------------------------------------------------- 1 | ,,lo,hi,thr 2 | LIPNet,bac,0.49807081195178854,0.4982091880482116,0.014 3 | LIPNet,csi,0.3149886769266377,0.31569132307336234,0.014 4 | LIPNet,f05,0.3655100009546678,0.3661899990453321,0.014 5 | LIPNet,f1s,0.4792580092524259,0.48006199074757394,0.014 6 | LIPNet,f2s,0.6955100009546679,0.6961899990453321,0.014 7 | LIPNet,fnr,0.004446871255867898,0.004653128744132101,0.014 8 | LIPNet,fom,0.6373201669181053,0.6504398330818945,0.014 9 | LIPNet,fpr,0.9989999999999999,0.9989999999999999,0.014 10 | LIPNet,inf,-0.003653128744132102,-0.0034468712558679,0.014 11 | LIPNet,mcc,-0.03415523105057758,-0.03272476894942243,0.014 12 | LIPNet,mk,-0.3345726686577683,-0.3214873313422317,0.014 13 | LIPNet,npv,0.3495601669181055,0.3626798330818945,0.014 14 | LIPNet,ppv,0.31551000095466786,0.3161899990453322,0.014 15 | LIPNet,tnr,0.0010000000000000002,0.0010000000000000002,0.014 16 | LIPNet,tpr,0.9953468712558677,0.9955531287441319,0.014 17 | AlphaFold-rsa,bac,0.8888615562987058,0.8893384437012944,0.551 18 | AlphaFold-rsa,csi,0.7497653595033336,0.7507346404966666,0.551 19 | AlphaFold-rsa,f05,0.8784239453180211,0.8790560546819788,0.551 20 | AlphaFold-rsa,f1s,0.8569571498071993,0.8576028501928006,0.551 21 | AlphaFold-rsa,f2s,0.8363958521393294,0.8371641478606703,0.551 22 | AlphaFold-rsa,fnr,0.1758921847288923,0.17676781527110774,0.551 23 | AlphaFold-rsa,fom,0.07861490247632305,0.07906509752367691,0.551 24 | AlphaFold-rsa,fpr,0.045112982251019056,0.04544701774898092,0.551 25 | AlphaFold-rsa,inf,0.7779173878981664,0.7788626121018336,0.551 26 | AlphaFold-rsa,mcc,0.7960408925602009,0.796919107439799,0.551 27 | AlphaFold-rsa,mk,0.8144150103107813,0.8153049896892185,0.551 28 | AlphaFold-rsa,npv,0.9209349024763234,0.9213850975236773,0.551 29 | AlphaFold-rsa,ppv,0.8933253677258771,0.8940746322741232,0.551 30 | AlphaFold-rsa,tnr,0.9545529822510189,0.9548870177489809,0.551 31 | AlphaFold-rsa,tpr,0.8232321847288923,0.8241078152711078,0.551 32 | AIUPred,bac,0.8210920952137941,0.8217279047862056,0.665 33 | AIUPred,csi,0.6202284596293886,0.6214315403706114,0.665 34 | AIUPred,f05,0.8063287473545239,0.8072112526454763,0.665 35 | AIUPred,f1s,0.7655949779159574,0.766505022084043,0.665 36 | AIUPred,f2s,0.7288137892824053,0.7298662107175945,0.665 37 | AIUPred,fnr,0.29271105181454987,0.2938489481854502,0.665 38 | AIUPred,fom,0.12635868815608117,0.12692131184391883,0.665 39 | AIUPred,fpr,0.06378524324627495,0.06421475675372508,0.665 40 | AIUPred,inf,0.6420750635967261,0.6433649364032741,0.665 41 | AIUPred,mcc,0.6748309765180307,0.6760290234819695,0.665 42 | AIUPred,mk,0.7090300480148275,0.7102299519851725,0.665 43 | AIUPred,npv,0.8730786881560811,0.8736413118439189,0.665 44 | AIUPred,ppv,0.8357768368407777,0.836763163159222,0.665 45 | AIUPred,tnr,0.935785243246275,0.9362147567537251,0.665 46 | AIUPred,tpr,0.7061510518145496,0.70728894818545,0.665 47 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## CAID Assessment
This repository contains the code for the [CAID challenge](https://caid.idpcentral.org/challenge) assessment. Once you have predictions and a reference set, you can use this repository to generate the evaluations and metrics.
The CAID software package wraps the [vectorized_cls_metrics](https://github.com/marnec/vectorized_cls_metrics) repository (with small modifications), which performs the calculation of the classification metrics used throughout CAID. For the details of the evaluation, please see the papers cited below.

If you use this code in your research, please **cite the following papers**:

- [CAID2](https://onlinelibrary.wiley.com/doi/full/10.1002/prot.26582) - Conte AD, Mehdiabadi M, Bouhraoua A, Miguel Monzon A, Tosatto SCE, Piovesan D. Critical assessment of protein intrinsic disorder prediction (CAID) - Results of round 2. Proteins. 2023; 91(12): 1925-1934 (2023)

- [CAID1](https://www.nature.com/articles/s41592-021-01117-3) - Necci, M., Piovesan, D., CAID Predictors. et al. Critical assessment of protein intrinsic disorder prediction. Nat Methods 18, 472–481 (2021)


## Installation
To run this package, you need to have `Python 3.8+` installed.

```
git clone https://github.com/BioComputingUP/CAID.git  # clone the repository

pip install -r requirements.txt                       # install the requirements
```

The repository is structured as below (demo-data just contains sample data from the CAID3 challenge and the results you get from the assessment).
```
CAID --> (CAID repository)
├── caid.py --> the script to run the evaluations
├── vectorized_metrics/ --> the assessment library
└── demo-data/ --> demo data directory, with sample data from the CAID3 challenge
    ├── predictions/ --> directory containing the predictions of each method
    ├── references/ --> directory containing the reference fasta file
    └── results/ --> directory for saving results
```

## Input

### Predictions
In order to run the assessment, your predictions must be in the CAID output format (see https://caid.idpcentral.org/challenge), where the tab-separated columns correspond to position, residue type, disorder/binding score, and a binary state. If the state is not provided, it is calculated automatically by applying the threshold that maximizes the F1-score.

```
>DP01234
1 M 0.892 1
2 E 0.813 1
...
```
Each file must be stored with the `.caid` suffix. You can access and download all CAID challenge results from https://caid.idpcentral.org/challenge/results.
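A minimal Python sketch of producing such a file is shown below. The `write_caid` helper and its call are illustrative, not part of this repository; only the four tab-separated columns are prescribed by the format.

```python
def write_caid(path, acc, sequence, scores, threshold=0.5):
    # hypothetical helper: one header line, then position, residue, score, state
    with open(path, 'w') as handle:
        handle.write('>{}\n'.format(acc))
        for pos, (aa, score) in enumerate(zip(sequence, scores), start=1):
            state = int(score >= threshold)  # binarize the score
            handle.write('{}\t{}\t{:.3f}\t{}\n'.format(pos, aa, score, state))

# toy example covering the first two residues shown above
write_caid('myMethod.caid', 'DP01234', 'ME', [0.892, 0.813])
```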
### References
References must be provided as a single fasta file, including the sequence and the labels corresponding to each residue. In the labels, 0 indicates order, 1 indicates disorder/binding/linker, and - denotes that the residue is not included in the assessment. All the CAID challenge references can be downloaded from https://caid.idpcentral.org/challenge/results.

```
>DP01234
MNASDFRRRGKEMVDYMADYLE
000011111000----------
```

## Output

After running the assessment (see Usage), the following files are generated, where `<reference>`, `<method>` and `<optimization>` stand for the reference name, the predictor name and the metric used to optimize the threshold, respectively.

```bash

# Score distribution for a given method.
# `rawscore` are all scores, `thresholds` is the unique list of thresholds
<method>.{rawscore,thresholds}.distribution.txt

# `dataset` the metrics for every considered threshold for a given `reference` and `method`
# `bootstrap` same as `dataset` but for every bootstrap sample
# `target` same as `dataset` but for every predicted target
<reference>.analysis.<method>.{bootstrap,dataset,target}.metrics.csv

# Optimal thresholds for every calculated metric for a given `reference` and `method`
<reference>.analysis.<method>.thr.txt

# `ci` confidence intervals for all methods for a given `reference` and `optimization`
# `bootstrap` metrics for each method and each bootstrap sample for a given `reference` and `optimization`
<reference>.analysis.all.{ci,bootstrap}.<optimization>.metrics.csv

# `cmat` confusion matrix for every method for a given `reference` and `optimization`
# `metrics` metrics for each method
<reference>.analysis.all.dataset.<optimization>.{cmat,metrics}.csv

# `cmat` confusion matrices for all methods and all thresholds for a given `reference`
# `pr` precision-recall data for all methods
# `roc` ROC data for all methods
# `predictions` scores and binary predictions for all methods at the residue level
<reference>.analysis.all.dataset._.{cmat,pr,predictions,roc}.csv

# metrics for all methods at the target level for a given `reference` and `optimization`
<reference>.analysis.all.target.<optimization>.metrics.csv

```
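For example, the per-method summary metrics generated above can be inspected with pandas (a sketch using a file from the demo results; the selected columns come from the CSV header):

```python
import pandas as pd

# metrics computed at the default threshold (0.5), one row per predictor
metrics = pd.read_csv(
    'demo-data/results/disorder_pdb.analysis.all.dataset.default.metrics.csv',
    index_col=0)
print(metrics[['bac', 'f1s', 'mcc', 'aucroc']].sort_values('aucroc', ascending=False))
```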
## Usage
To run the assessment, run the `caid.py` script with the arguments explained below:
```
python3 caid.py <reference_file> <predictions_dir> -o <output_dir>
```
For example, the `demo-data/predictions` folder contains the predictions of 3 predictors from CAID3, and `demo-data/references/disorder_pdb.fasta` is the Disorder-PDB reference from [CAID3](https://caid.idpcentral.org/challenge/results). The script can then be run as:

```
python3 caid.py demo-data/references/disorder_pdb.fasta demo-data/predictions -o demo-data/results
```



## License
[CC BY 3.0](https://creativecommons.org/licenses/by/3.0/)
--------------------------------------------------------------------------------
/vectorized_metrics/parsers.py:
--------------------------------------------------------------------------------
from pathlib import Path
from itertools import groupby
import logging
import numpy as np


def parse_reference(ref_file: str, pttrn: dict = None, accs_to_read: set = None) -> (dict, set):
    """Load reference as dict from reference Fasta, return dict and set of accessions

    Reference dict is built to be later converted in pd.DataFrame

    :param ref_file: reference file
    :param pttrn: pattern dictionary. Each state of the reference will be interpreted looking for a
        corresponding value in `pattern` dict
    :param accs_to_read: if given, only the accessions in this set are loaded from the reference
    :return: reference dict and set of parsed accessions
    """
    ref_file = Path(ref_file)
    pattern = {'0': 0.0, '1': 1.0, '-': np.nan} if pttrn is None else pttrn

    ref = {('ref', 'states'): {}, ('ref', 'seq'): {}}
    accs = set()

    with open(ref_file) as f:
        faiter = (x[1] for x in groupby(f, lambda line: line[0] == ">"))
        for header in faiter:
            header = next(header)
            if header[0] != '#':
                acc, *desc = header[1:].strip().split()

                if accs_to_read is None or acc in accs_to_read:
                    seq, states = map(str.strip, next(faiter))
                    states = np.array([pattern.get(s, np.nan) for s in states], dtype=np.float64)

                    accs.add(acc)
                    for i, (st, aa) in enumerate(zip(states, seq)):
                        ref[('ref', 'states')][(acc, i)] = float(st)
                        ref[('ref', 'seq')][(acc, i)] = aa

    logging.debug("loaded reference as ; {}".format(ref_file.stem))
    return ref, accs


def strip_split(string: str) -> list:
    """
    Remove newlines and split by tabs

    :param string: string to modify
    :return: split string
    """
    splitted = string.strip('\n').rsplit('\t')
    return splitted


def parse_prediction(predfile, reference_ids, label=None, decimals=3, threshold=0.5, normalize=False):
    """
    Load a prediction file in CAID format as a dict keyed by (label, 'states'|'scores')

    :param predfile: path to the prediction file
    :param reference_ids: accessions in the reference; predictions for other accessions are skipped
    :param label: label under which the prediction is stored (default: file stem)
    :param decimals: number of decimals scores are rounded to
    :param threshold: threshold applied to scores to derive states when states are missing
    :param normalize: whether to rescale scores to the [0, 1] range
    :return: prediction dict, built to be later converted in pd.DataFrame
    """
    logging.debug('Parsing prediction file: {}'.format(predfile))
    predfile = Path(predfile)
    label = label if label is not None else predfile.stem
    pred = {(label, 'states'): {}, (label, 'scores'): {}}

    reference_ids = set(reference_ids)

    with open(predfile.resolve()) as fhandle:
        faiter = (x[1] for x in groupby(fhandle, lambda line: line[0] == ">"))
        for acc in faiter:
            acc = next(acc).strip()[1:]
            body = next(faiter)

            if acc in reference_ids:
                splitted = list(zip(*map(strip_split, body)))
                scores = None
                states = None

                if len(splitted) == 3:
                    positions, _, scores = splitted
                if len(splitted) == 4:
                    positions, _, scores, states = splitted
                # check truth-ish of a value instead of that of its container since zip(*map(strip_split, body))
                # returns list of empty strings when values are missing
                if states is not None and states[0]:
                    states = np.array(states, dtype=np.float64)
                    # when scores are missing, use states as scores
                    scores = np.array(scores, dtype=np.float64) if scores[0] else states
                else:
                    if scores is not None and scores[0]:
                        scores = np.array(scores, dtype=np.float64)
                        # when states are missing, apply threshold to scores to generate states
                        # if threshold is not passed, default to 0.5
                        states = np.greater_equal(scores, threshold).astype(np.float64)
                    else:
                        continue

                if normalize is True:
                    # normalize in range [0, 1]
                    if np.min(scores) < 0 or np.max(scores) > 1:
                        scores = (scores - np.min(scores)) / np.ptp(scores)

                # round scores to the number of decimals passed (default 3)
                scores = scores.round(decimals)

                # reshape so that casting to pd.DataFrame is very quick
                for i, (st,
sc), in enumerate(zip(states, scores)): 118 | pred[(label, 'states')][(acc, i)] = st 119 | pred[(label, 'scores')][(acc, i)] = sc 120 | 121 | logging.debug('loaded prediciton as ; {}'.format(label)) 122 | return pred 123 | 124 | 125 | def parse_thresholds(thr_file): 126 | thresholds = None 127 | 128 | if thr_file.resolve(strict=True): 129 | try: 130 | thresholds = {} 131 | with open(thr_file) as f: 132 | for line in f: 133 | pred, thr = line.strip().split() 134 | thresholds[pred] = float(thr) 135 | except IndexError: 136 | logging.error('Threshold file was not formatted properly, default threshold will be estimated from scores') 137 | 138 | return thresholds 139 | -------------------------------------------------------------------------------- /demo-data/results/AlphaFold-rsa.thresholds.distribution.txt: -------------------------------------------------------------------------------- 1 | 1.971 2 | 0.971 3 | 0.970 4 | 0.969 5 | 0.968 6 | 0.964 7 | 0.963 8 | 0.962 9 | 0.961 10 | 0.960 11 | 0.959 12 | 0.958 13 | 0.957 14 | 0.956 15 | 0.955 16 | 0.954 17 | 0.953 18 | 0.952 19 | 0.951 20 | 0.950 21 | 0.949 22 | 0.948 23 | 0.947 24 | 0.946 25 | 0.945 26 | 0.944 27 | 0.943 28 | 0.942 29 | 0.941 30 | 0.940 31 | 0.939 32 | 0.938 33 | 0.937 34 | 0.936 35 | 0.935 36 | 0.934 37 | 0.933 38 | 0.932 39 | 0.931 40 | 0.930 41 | 0.929 42 | 0.928 43 | 0.927 44 | 0.926 45 | 0.925 46 | 0.924 47 | 0.923 48 | 0.922 49 | 0.921 50 | 0.920 51 | 0.919 52 | 0.918 53 | 0.917 54 | 0.916 55 | 0.915 56 | 0.914 57 | 0.913 58 | 0.912 59 | 0.911 60 | 0.910 61 | 0.909 62 | 0.908 63 | 0.907 64 | 0.906 65 | 0.905 66 | 0.904 67 | 0.903 68 | 0.902 69 | 0.901 70 | 0.900 71 | 0.899 72 | 0.898 73 | 0.897 74 | 0.896 75 | 0.895 76 | 0.894 77 | 0.893 78 | 0.892 79 | 0.891 80 | 0.890 81 | 0.889 82 | 0.888 83 | 0.887 84 | 0.886 85 | 0.885 86 | 0.884 87 | 0.883 88 | 0.882 89 | 0.881 90 | 0.880 91 | 0.879 92 | 0.878 93 | 0.877 94 | 0.876 95 | 0.875 96 | 0.874 97 | 0.873 98 | 0.872 99 | 0.871 100 | 0.870 101 | 0.869 102 | 0.868 103 | 0.867 104 | 0.866 105 | 0.865 106 | 0.864 107 | 0.863 108 | 0.862 109 | 0.861 110 | 0.860 111 | 0.859 112 | 0.858 113 | 0.857 114 | 0.856 115 | 0.855 116 | 0.854 117 | 0.853 118 | 0.852 119 | 0.851 120 | 0.850 121 | 0.849 122 | 0.848 123 | 0.847 124 | 0.846 125 | 0.845 126 | 0.844 127 | 0.843 128 | 0.842 129 | 0.841 130 | 0.840 131 | 0.839 132 | 0.838 133 | 0.837 134 | 0.836 135 | 0.835 136 | 0.834 137 | 0.833 138 | 0.832 139 | 0.831 140 | 0.830 141 | 0.829 142 | 0.828 143 | 0.827 144 | 0.826 145 | 0.825 146 | 0.824 147 | 0.823 148 | 0.822 149 | 0.821 150 | 0.820 151 | 0.819 152 | 0.818 153 | 0.817 154 | 0.816 155 | 0.815 156 | 0.814 157 | 0.813 158 | 0.812 159 | 0.811 160 | 0.810 161 | 0.809 162 | 0.808 163 | 0.807 164 | 0.806 165 | 0.805 166 | 0.804 167 | 0.803 168 | 0.802 169 | 0.801 170 | 0.800 171 | 0.799 172 | 0.798 173 | 0.797 174 | 0.796 175 | 0.795 176 | 0.794 177 | 0.793 178 | 0.792 179 | 0.791 180 | 0.790 181 | 0.789 182 | 0.788 183 | 0.787 184 | 0.786 185 | 0.785 186 | 0.784 187 | 0.783 188 | 0.782 189 | 0.781 190 | 0.780 191 | 0.779 192 | 0.778 193 | 0.777 194 | 0.776 195 | 0.775 196 | 0.774 197 | 0.773 198 | 0.772 199 | 0.771 200 | 0.770 201 | 0.769 202 | 0.768 203 | 0.767 204 | 0.766 205 | 0.765 206 | 0.764 207 | 0.763 208 | 0.762 209 | 0.761 210 | 0.760 211 | 0.759 212 | 0.758 213 | 0.757 214 | 0.756 215 | 0.755 216 | 0.754 217 | 0.753 218 | 0.752 219 | 0.751 220 | 0.750 221 | 0.749 222 | 0.748 223 | 0.747 224 | 0.746 225 | 0.745 226 | 0.744 227 | 0.743 228 | 0.742 229 | 0.741 230 | 0.740 231 | 
0.739 232 | 0.738 233 | 0.737 234 | 0.736 235 | 0.735 236 | 0.734 237 | 0.733 238 | 0.732 239 | 0.731 240 | 0.730 241 | 0.729 242 | 0.728 243 | 0.727 244 | 0.726 245 | 0.725 246 | 0.724 247 | 0.723 248 | 0.722 249 | 0.721 250 | 0.720 251 | 0.719 252 | 0.718 253 | 0.717 254 | 0.716 255 | 0.715 256 | 0.714 257 | 0.713 258 | 0.712 259 | 0.711 260 | 0.710 261 | 0.709 262 | 0.708 263 | 0.707 264 | 0.706 265 | 0.705 266 | 0.704 267 | 0.703 268 | 0.702 269 | 0.701 270 | 0.700 271 | 0.699 272 | 0.698 273 | 0.697 274 | 0.696 275 | 0.695 276 | 0.694 277 | 0.693 278 | 0.692 279 | 0.691 280 | 0.690 281 | 0.689 282 | 0.688 283 | 0.687 284 | 0.686 285 | 0.685 286 | 0.684 287 | 0.683 288 | 0.682 289 | 0.681 290 | 0.680 291 | 0.679 292 | 0.678 293 | 0.677 294 | 0.676 295 | 0.675 296 | 0.674 297 | 0.673 298 | 0.672 299 | 0.671 300 | 0.670 301 | 0.669 302 | 0.668 303 | 0.667 304 | 0.666 305 | 0.665 306 | 0.664 307 | 0.663 308 | 0.662 309 | 0.661 310 | 0.660 311 | 0.659 312 | 0.658 313 | 0.657 314 | 0.656 315 | 0.655 316 | 0.654 317 | 0.653 318 | 0.652 319 | 0.651 320 | 0.650 321 | 0.649 322 | 0.648 323 | 0.647 324 | 0.646 325 | 0.645 326 | 0.644 327 | 0.643 328 | 0.642 329 | 0.641 330 | 0.640 331 | 0.639 332 | 0.638 333 | 0.637 334 | 0.636 335 | 0.635 336 | 0.634 337 | 0.633 338 | 0.632 339 | 0.631 340 | 0.630 341 | 0.629 342 | 0.628 343 | 0.627 344 | 0.626 345 | 0.625 346 | 0.624 347 | 0.623 348 | 0.622 349 | 0.621 350 | 0.620 351 | 0.619 352 | 0.618 353 | 0.617 354 | 0.616 355 | 0.615 356 | 0.614 357 | 0.613 358 | 0.612 359 | 0.611 360 | 0.610 361 | 0.609 362 | 0.608 363 | 0.607 364 | 0.606 365 | 0.605 366 | 0.604 367 | 0.603 368 | 0.602 369 | 0.601 370 | 0.600 371 | 0.599 372 | 0.598 373 | 0.597 374 | 0.596 375 | 0.595 376 | 0.594 377 | 0.593 378 | 0.592 379 | 0.591 380 | 0.590 381 | 0.589 382 | 0.588 383 | 0.587 384 | 0.586 385 | 0.585 386 | 0.584 387 | 0.583 388 | 0.582 389 | 0.581 390 | 0.580 391 | 0.579 392 | 0.578 393 | 0.577 394 | 0.576 395 | 0.575 396 | 0.574 397 | 0.573 398 | 0.572 399 | 0.571 400 | 0.570 401 | 0.569 402 | 0.568 403 | 0.567 404 | 0.566 405 | 0.565 406 | 0.564 407 | 0.563 408 | 0.562 409 | 0.561 410 | 0.560 411 | 0.559 412 | 0.558 413 | 0.557 414 | 0.556 415 | 0.555 416 | 0.554 417 | 0.553 418 | 0.552 419 | 0.551 420 | 0.550 421 | 0.549 422 | 0.548 423 | 0.547 424 | 0.546 425 | 0.545 426 | 0.544 427 | 0.543 428 | 0.542 429 | 0.541 430 | 0.540 431 | 0.539 432 | 0.538 433 | 0.537 434 | 0.536 435 | 0.535 436 | 0.534 437 | 0.533 438 | 0.532 439 | 0.531 440 | 0.530 441 | 0.529 442 | 0.528 443 | 0.527 444 | 0.526 445 | 0.525 446 | 0.524 447 | 0.523 448 | 0.522 449 | 0.521 450 | 0.520 451 | 0.519 452 | 0.518 453 | 0.517 454 | 0.516 455 | 0.515 456 | 0.514 457 | 0.513 458 | 0.512 459 | 0.511 460 | 0.510 461 | 0.509 462 | 0.508 463 | 0.507 464 | 0.506 465 | 0.505 466 | 0.504 467 | 0.503 468 | 0.502 469 | 0.501 470 | 0.500 471 | 0.499 472 | 0.498 473 | 0.497 474 | 0.496 475 | 0.495 476 | 0.494 477 | 0.493 478 | 0.492 479 | 0.491 480 | 0.490 481 | 0.489 482 | 0.488 483 | 0.487 484 | 0.486 485 | 0.485 486 | 0.484 487 | 0.483 488 | 0.482 489 | 0.481 490 | 0.480 491 | 0.479 492 | 0.478 493 | 0.477 494 | 0.476 495 | 0.475 496 | 0.474 497 | 0.473 498 | 0.472 499 | 0.471 500 | 0.470 501 | 0.469 502 | 0.468 503 | 0.467 504 | 0.466 505 | 0.465 506 | 0.464 507 | 0.463 508 | 0.462 509 | 0.461 510 | 0.460 511 | 0.459 512 | 0.458 513 | 0.457 514 | 0.456 515 | 0.455 516 | 0.454 517 | 0.453 518 | 0.452 519 | 0.451 520 | 0.450 521 | 0.449 522 | 0.448 523 | 0.447 524 | 0.446 525 | 0.445 526 | 0.444 527 | 
0.443 528 | 0.442 529 | 0.441 530 | 0.440 531 | 0.439 532 | 0.438 533 | 0.437 534 | 0.436 535 | 0.435 536 | 0.434 537 | 0.433 538 | 0.432 539 | 0.431 540 | 0.430 541 | 0.429 542 | 0.428 543 | 0.427 544 | 0.426 545 | 0.425 546 | 0.424 547 | 0.423 548 | 0.422 549 | 0.421 550 | 0.420 551 | 0.419 552 | 0.418 553 | 0.417 554 | 0.416 555 | 0.415 556 | 0.414 557 | 0.413 558 | 0.412 559 | 0.411 560 | 0.410 561 | 0.409 562 | 0.408 563 | 0.407 564 | 0.406 565 | 0.405 566 | 0.404 567 | 0.403 568 | 0.402 569 | 0.401 570 | 0.400 571 | 0.399 572 | 0.398 573 | 0.397 574 | 0.396 575 | 0.395 576 | 0.394 577 | 0.393 578 | 0.392 579 | 0.391 580 | 0.390 581 | 0.389 582 | 0.388 583 | 0.387 584 | 0.386 585 | 0.385 586 | 0.384 587 | 0.383 588 | 0.382 589 | 0.381 590 | 0.380 591 | 0.379 592 | 0.378 593 | 0.377 594 | 0.376 595 | 0.375 596 | 0.374 597 | 0.373 598 | 0.372 599 | 0.371 600 | 0.370 601 | 0.369 602 | 0.368 603 | 0.367 604 | 0.366 605 | 0.365 606 | 0.364 607 | 0.363 608 | 0.362 609 | 0.361 610 | 0.360 611 | 0.359 612 | 0.358 613 | 0.357 614 | 0.356 615 | 0.355 616 | 0.354 617 | 0.353 618 | 0.352 619 | 0.351 620 | 0.350 621 | 0.349 622 | 0.348 623 | 0.347 624 | 0.346 625 | 0.345 626 | 0.344 627 | 0.343 628 | 0.342 629 | 0.341 630 | 0.340 631 | 0.339 632 | 0.338 633 | 0.337 634 | 0.336 635 | 0.335 636 | 0.334 637 | 0.333 638 | 0.332 639 | 0.331 640 | 0.330 641 | 0.329 642 | 0.328 643 | 0.327 644 | 0.326 645 | 0.325 646 | 0.324 647 | 0.323 648 | 0.322 649 | 0.321 650 | 0.320 651 | 0.319 652 | 0.318 653 | 0.317 654 | 0.316 655 | 0.315 656 | 0.314 657 | 0.313 658 | 0.312 659 | 0.311 660 | 0.310 661 | 0.309 662 | 0.308 663 | 0.307 664 | 0.306 665 | 0.305 666 | 0.304 667 | 0.303 668 | 0.302 669 | 0.301 670 | 0.300 671 | 0.299 672 | 0.298 673 | 0.297 674 | 0.296 675 | 0.295 676 | 0.294 677 | 0.293 678 | 0.292 679 | 0.291 680 | 0.290 681 | 0.289 682 | 0.288 683 | 0.287 684 | 0.286 685 | 0.285 686 | 0.284 687 | 0.283 688 | 0.282 689 | 0.281 690 | 0.280 691 | 0.279 692 | 0.278 693 | 0.277 694 | 0.276 695 | 0.275 696 | 0.274 697 | 0.273 698 | 0.272 699 | 0.271 700 | 0.270 701 | 0.269 702 | 0.268 703 | 0.267 704 | 0.266 705 | 0.265 706 | 0.264 707 | 0.263 708 | 0.262 709 | 0.261 710 | 0.260 711 | 0.259 712 | 0.258 713 | 0.257 714 | 0.256 715 | 0.255 716 | 0.254 717 | 0.253 718 | 0.252 719 | 0.251 720 | 0.250 721 | 0.249 722 | 0.248 723 | 0.247 724 | 0.246 725 | 0.245 726 | 0.244 727 | 0.243 728 | 0.242 729 | 0.241 730 | 0.240 731 | 0.239 732 | 0.238 733 | 0.237 734 | 0.236 735 | 0.235 736 | 0.234 737 | 0.233 738 | 0.232 739 | 0.231 740 | 0.230 741 | 0.229 742 | 0.228 743 | 0.227 744 | 0.226 745 | 0.225 746 | 0.224 747 | 0.223 748 | 0.222 749 | 0.221 750 | 0.220 751 | 0.219 752 | 0.218 753 | 0.217 754 | 0.216 755 | 0.215 756 | 0.214 757 | 0.213 758 | 0.212 759 | 0.211 760 | 0.210 761 | 0.209 762 | 0.208 763 | 0.207 764 | 0.206 765 | 0.205 766 | 0.204 767 | 0.203 768 | 0.202 769 | 0.201 770 | 0.200 771 | 0.199 772 | 0.198 773 | 0.197 774 | 0.196 775 | 0.195 776 | 0.194 777 | 0.193 778 | 0.192 779 | 0.191 780 | 0.190 781 | 0.189 782 | 0.188 783 | 0.187 784 | 0.186 785 | 0.185 786 | 0.184 787 | 0.183 788 | 0.182 789 | 0.181 790 | 0.180 791 | 0.179 792 | 0.178 793 | 0.177 794 | 0.176 795 | 0.175 796 | 0.174 797 | 0.173 798 | 0.172 799 | 0.171 800 | 0.170 801 | 0.169 802 | 0.168 803 | 0.167 804 | 0.166 805 | 0.165 806 | 0.164 807 | 0.163 808 | 0.162 809 | 0.161 810 | 0.160 811 | 0.159 812 | 0.158 813 | 0.157 814 | 0.156 815 | 0.155 816 | 0.154 817 | 0.153 818 | 0.152 819 | 0.151 820 | 0.150 821 | 0.149 822 | 0.148 823 | 
0.147 824 | 0.146 825 | 0.145 826 | 0.144 827 | 0.143 828 | 0.142 829 | 0.141 830 | 0.140 831 | 0.139 832 | 0.138 833 | 0.137 834 | 0.136 835 | 0.135 836 | 0.134 837 | 0.133 838 | 0.132 839 | 0.131 840 | 0.130 841 | 0.129 842 | 0.128 843 | 0.127 844 | 0.126 845 | 0.125 846 | 0.124 847 | 0.123 848 | 0.122 849 | 0.121 850 | 0.120 851 | 0.119 852 | 0.118 853 | 0.117 854 | 0.116 855 | 0.115 856 | 0.114 857 | 0.113 858 | 0.112 859 | 0.111 860 | 0.110 861 | 0.109 862 | 0.108 863 | 0.107 864 | 0.106 865 | 0.105 866 | 0.104 867 | 0.103 868 | 0.102 869 | 0.101 870 | 0.100 871 | 0.099 872 | 0.098 873 | 0.097 874 | 0.096 875 | 0.095 876 | 0.094 877 | 0.093 878 | 0.092 879 | 0.091 880 | 0.090 881 | 0.089 882 | 0.088 883 | 0.087 884 | 0.086 885 | 0.085 886 | 0.084 887 | 0.083 888 | 0.082 889 | 0.081 890 | 0.080 891 | 0.079 892 | 0.078 893 | 0.077 894 | 0.076 895 | 0.075 896 | 0.074 897 | 0.073 898 | 0.072 899 | 0.071 900 | 0.070 901 | 0.069 902 | 0.068 903 | 0.067 904 | 0.066 905 | 0.065 906 | 0.064 907 | 0.063 908 | 0.062 909 | 0.061 910 | 0.060 911 | 0.059 912 | 0.058 913 | 0.057 914 | 0.056 915 | 0.055 916 | 0.054 917 | 0.053 918 | 0.052 919 | 0.051 920 | 0.050 921 | 0.049 922 | 0.048 923 | 0.047 924 | 0.046 925 | 0.045 926 | 0.044 927 | 0.043 928 | 0.042 929 | 0.041 930 | 0.040 931 | 0.039 932 | 0.038 933 | 0.037 934 | 0.036 935 | 0.035 936 | 0.034 937 | 0.033 938 | 0.032 939 | 0.031 940 | 0.030 941 | 0.029 942 | 0.027 943 | 0.026 944 | 0.025 945 | 0.024 946 | 0.023 947 | 0.022 948 | 0.021 949 | 0.020 950 | 0.019 951 | 0.018 952 | 0.017 953 | 0.016 954 | 0.015 955 | 0.014 956 | 0.010 957 | 0.009 958 | 0.004 959 | 0.003 960 | -------------------------------------------------------------------------------- /demo-data/results/LIPNet.thresholds.distribution.txt: -------------------------------------------------------------------------------- 1 | 1.999 2 | 0.999 3 | 0.998 4 | 0.997 5 | 0.996 6 | 0.995 7 | 0.994 8 | 0.993 9 | 0.992 10 | 0.991 11 | 0.990 12 | 0.989 13 | 0.988 14 | 0.987 15 | 0.986 16 | 0.985 17 | 0.984 18 | 0.983 19 | 0.982 20 | 0.981 21 | 0.980 22 | 0.979 23 | 0.978 24 | 0.977 25 | 0.976 26 | 0.975 27 | 0.974 28 | 0.973 29 | 0.972 30 | 0.971 31 | 0.970 32 | 0.969 33 | 0.968 34 | 0.967 35 | 0.966 36 | 0.965 37 | 0.964 38 | 0.963 39 | 0.962 40 | 0.961 41 | 0.960 42 | 0.959 43 | 0.958 44 | 0.957 45 | 0.956 46 | 0.955 47 | 0.954 48 | 0.953 49 | 0.952 50 | 0.951 51 | 0.950 52 | 0.949 53 | 0.948 54 | 0.947 55 | 0.946 56 | 0.945 57 | 0.944 58 | 0.943 59 | 0.942 60 | 0.941 61 | 0.940 62 | 0.939 63 | 0.938 64 | 0.937 65 | 0.936 66 | 0.935 67 | 0.934 68 | 0.933 69 | 0.932 70 | 0.931 71 | 0.930 72 | 0.929 73 | 0.928 74 | 0.927 75 | 0.926 76 | 0.925 77 | 0.924 78 | 0.923 79 | 0.922 80 | 0.921 81 | 0.920 82 | 0.919 83 | 0.918 84 | 0.917 85 | 0.916 86 | 0.915 87 | 0.914 88 | 0.913 89 | 0.912 90 | 0.911 91 | 0.910 92 | 0.909 93 | 0.908 94 | 0.907 95 | 0.906 96 | 0.905 97 | 0.904 98 | 0.903 99 | 0.902 100 | 0.901 101 | 0.900 102 | 0.899 103 | 0.898 104 | 0.897 105 | 0.896 106 | 0.895 107 | 0.894 108 | 0.893 109 | 0.892 110 | 0.891 111 | 0.890 112 | 0.889 113 | 0.888 114 | 0.887 115 | 0.886 116 | 0.885 117 | 0.884 118 | 0.883 119 | 0.882 120 | 0.881 121 | 0.880 122 | 0.879 123 | 0.878 124 | 0.877 125 | 0.876 126 | 0.875 127 | 0.874 128 | 0.873 129 | 0.872 130 | 0.871 131 | 0.870 132 | 0.869 133 | 0.868 134 | 0.867 135 | 0.866 136 | 0.865 137 | 0.864 138 | 0.863 139 | 0.862 140 | 0.861 141 | 0.860 142 | 0.859 143 | 0.858 144 | 0.857 145 | 0.856 146 | 0.855 147 | 0.854 148 | 0.853 149 | 0.852 150 | 0.851 
151 | 0.850 152 | 0.849 153 | 0.848 154 | 0.847 155 | 0.846 156 | 0.845 157 | 0.844 158 | 0.843 159 | 0.842 160 | 0.841 161 | 0.840 162 | 0.839 163 | 0.838 164 | 0.837 165 | 0.836 166 | 0.835 167 | 0.834 168 | 0.833 169 | 0.832 170 | 0.831 171 | 0.830 172 | 0.829 173 | 0.828 174 | 0.827 175 | 0.826 176 | 0.825 177 | 0.824 178 | 0.823 179 | 0.822 180 | 0.821 181 | 0.820 182 | 0.819 183 | 0.818 184 | 0.817 185 | 0.816 186 | 0.815 187 | 0.814 188 | 0.813 189 | 0.812 190 | 0.811 191 | 0.810 192 | 0.809 193 | 0.808 194 | 0.807 195 | 0.806 196 | 0.805 197 | 0.804 198 | 0.803 199 | 0.802 200 | 0.801 201 | 0.800 202 | 0.799 203 | 0.798 204 | 0.797 205 | 0.796 206 | 0.795 207 | 0.794 208 | 0.793 209 | 0.792 210 | 0.791 211 | 0.790 212 | 0.789 213 | 0.788 214 | 0.787 215 | 0.786 216 | 0.785 217 | 0.784 218 | 0.783 219 | 0.782 220 | 0.781 221 | 0.780 222 | 0.779 223 | 0.778 224 | 0.777 225 | 0.776 226 | 0.775 227 | 0.774 228 | 0.773 229 | 0.772 230 | 0.771 231 | 0.770 232 | 0.769 233 | 0.768 234 | 0.767 235 | 0.766 236 | 0.765 237 | 0.764 238 | 0.763 239 | 0.762 240 | 0.761 241 | 0.760 242 | 0.759 243 | 0.758 244 | 0.757 245 | 0.756 246 | 0.755 247 | 0.754 248 | 0.753 249 | 0.752 250 | 0.751 251 | 0.750 252 | 0.749 253 | 0.748 254 | 0.747 255 | 0.746 256 | 0.745 257 | 0.744 258 | 0.743 259 | 0.742 260 | 0.741 261 | 0.740 262 | 0.739 263 | 0.738 264 | 0.737 265 | 0.736 266 | 0.735 267 | 0.734 268 | 0.733 269 | 0.732 270 | 0.731 271 | 0.730 272 | 0.729 273 | 0.728 274 | 0.727 275 | 0.726 276 | 0.725 277 | 0.724 278 | 0.723 279 | 0.722 280 | 0.721 281 | 0.720 282 | 0.719 283 | 0.718 284 | 0.717 285 | 0.716 286 | 0.715 287 | 0.714 288 | 0.713 289 | 0.712 290 | 0.711 291 | 0.710 292 | 0.709 293 | 0.708 294 | 0.707 295 | 0.706 296 | 0.705 297 | 0.704 298 | 0.703 299 | 0.702 300 | 0.701 301 | 0.700 302 | 0.699 303 | 0.698 304 | 0.697 305 | 0.696 306 | 0.695 307 | 0.694 308 | 0.693 309 | 0.692 310 | 0.691 311 | 0.690 312 | 0.689 313 | 0.688 314 | 0.687 315 | 0.686 316 | 0.685 317 | 0.684 318 | 0.683 319 | 0.682 320 | 0.681 321 | 0.680 322 | 0.679 323 | 0.678 324 | 0.677 325 | 0.676 326 | 0.675 327 | 0.674 328 | 0.673 329 | 0.672 330 | 0.671 331 | 0.670 332 | 0.669 333 | 0.668 334 | 0.667 335 | 0.666 336 | 0.665 337 | 0.664 338 | 0.663 339 | 0.662 340 | 0.661 341 | 0.660 342 | 0.659 343 | 0.658 344 | 0.657 345 | 0.656 346 | 0.655 347 | 0.654 348 | 0.653 349 | 0.652 350 | 0.651 351 | 0.650 352 | 0.649 353 | 0.648 354 | 0.647 355 | 0.646 356 | 0.645 357 | 0.644 358 | 0.643 359 | 0.642 360 | 0.641 361 | 0.640 362 | 0.639 363 | 0.638 364 | 0.637 365 | 0.636 366 | 0.635 367 | 0.634 368 | 0.633 369 | 0.632 370 | 0.631 371 | 0.630 372 | 0.629 373 | 0.628 374 | 0.627 375 | 0.626 376 | 0.625 377 | 0.624 378 | 0.623 379 | 0.622 380 | 0.621 381 | 0.620 382 | 0.619 383 | 0.618 384 | 0.617 385 | 0.616 386 | 0.615 387 | 0.614 388 | 0.613 389 | 0.612 390 | 0.611 391 | 0.610 392 | 0.609 393 | 0.608 394 | 0.607 395 | 0.606 396 | 0.605 397 | 0.604 398 | 0.603 399 | 0.602 400 | 0.601 401 | 0.600 402 | 0.599 403 | 0.598 404 | 0.597 405 | 0.596 406 | 0.595 407 | 0.594 408 | 0.593 409 | 0.592 410 | 0.591 411 | 0.590 412 | 0.589 413 | 0.588 414 | 0.587 415 | 0.586 416 | 0.585 417 | 0.584 418 | 0.583 419 | 0.582 420 | 0.581 421 | 0.580 422 | 0.579 423 | 0.578 424 | 0.577 425 | 0.576 426 | 0.575 427 | 0.574 428 | 0.573 429 | 0.572 430 | 0.571 431 | 0.570 432 | 0.569 433 | 0.568 434 | 0.567 435 | 0.566 436 | 0.565 437 | 0.564 438 | 0.563 439 | 0.562 440 | 0.561 441 | 0.560 442 | 0.559 443 | 0.558 444 | 0.557 445 | 0.556 446 | 0.555 
447 | 0.554 448 | 0.553 449 | 0.552 450 | 0.551 451 | 0.550 452 | 0.549 453 | 0.548 454 | 0.547 455 | 0.546 456 | 0.545 457 | 0.544 458 | 0.543 459 | 0.542 460 | 0.541 461 | 0.540 462 | 0.539 463 | 0.538 464 | 0.537 465 | 0.536 466 | 0.535 467 | 0.534 468 | 0.533 469 | 0.532 470 | 0.531 471 | 0.530 472 | 0.529 473 | 0.528 474 | 0.527 475 | 0.526 476 | 0.525 477 | 0.524 478 | 0.523 479 | 0.522 480 | 0.521 481 | 0.520 482 | 0.519 483 | 0.518 484 | 0.517 485 | 0.516 486 | 0.515 487 | 0.514 488 | 0.513 489 | 0.512 490 | 0.511 491 | 0.510 492 | 0.509 493 | 0.508 494 | 0.507 495 | 0.506 496 | 0.505 497 | 0.504 498 | 0.503 499 | 0.502 500 | 0.501 501 | 0.500 502 | 0.499 503 | 0.498 504 | 0.497 505 | 0.496 506 | 0.495 507 | 0.494 508 | 0.493 509 | 0.492 510 | 0.491 511 | 0.490 512 | 0.489 513 | 0.488 514 | 0.487 515 | 0.486 516 | 0.485 517 | 0.484 518 | 0.483 519 | 0.482 520 | 0.481 521 | 0.480 522 | 0.479 523 | 0.478 524 | 0.477 525 | 0.476 526 | 0.475 527 | 0.474 528 | 0.473 529 | 0.472 530 | 0.471 531 | 0.470 532 | 0.469 533 | 0.468 534 | 0.467 535 | 0.466 536 | 0.465 537 | 0.464 538 | 0.463 539 | 0.462 540 | 0.461 541 | 0.460 542 | 0.459 543 | 0.458 544 | 0.457 545 | 0.456 546 | 0.455 547 | 0.454 548 | 0.453 549 | 0.452 550 | 0.451 551 | 0.450 552 | 0.449 553 | 0.448 554 | 0.447 555 | 0.446 556 | 0.445 557 | 0.444 558 | 0.443 559 | 0.442 560 | 0.441 561 | 0.440 562 | 0.439 563 | 0.438 564 | 0.437 565 | 0.436 566 | 0.435 567 | 0.434 568 | 0.433 569 | 0.432 570 | 0.431 571 | 0.430 572 | 0.429 573 | 0.428 574 | 0.427 575 | 0.426 576 | 0.425 577 | 0.424 578 | 0.423 579 | 0.422 580 | 0.421 581 | 0.420 582 | 0.419 583 | 0.418 584 | 0.417 585 | 0.416 586 | 0.415 587 | 0.414 588 | 0.413 589 | 0.412 590 | 0.411 591 | 0.410 592 | 0.409 593 | 0.408 594 | 0.407 595 | 0.406 596 | 0.405 597 | 0.404 598 | 0.403 599 | 0.402 600 | 0.401 601 | 0.400 602 | 0.399 603 | 0.398 604 | 0.397 605 | 0.396 606 | 0.395 607 | 0.394 608 | 0.393 609 | 0.392 610 | 0.391 611 | 0.390 612 | 0.389 613 | 0.388 614 | 0.387 615 | 0.386 616 | 0.385 617 | 0.384 618 | 0.383 619 | 0.382 620 | 0.381 621 | 0.380 622 | 0.379 623 | 0.378 624 | 0.377 625 | 0.376 626 | 0.375 627 | 0.374 628 | 0.373 629 | 0.372 630 | 0.371 631 | 0.370 632 | 0.369 633 | 0.368 634 | 0.367 635 | 0.366 636 | 0.365 637 | 0.364 638 | 0.363 639 | 0.362 640 | 0.361 641 | 0.360 642 | 0.359 643 | 0.358 644 | 0.357 645 | 0.356 646 | 0.355 647 | 0.354 648 | 0.353 649 | 0.352 650 | 0.351 651 | 0.350 652 | 0.349 653 | 0.348 654 | 0.347 655 | 0.346 656 | 0.345 657 | 0.344 658 | 0.343 659 | 0.342 660 | 0.341 661 | 0.340 662 | 0.339 663 | 0.338 664 | 0.337 665 | 0.336 666 | 0.335 667 | 0.334 668 | 0.333 669 | 0.332 670 | 0.331 671 | 0.330 672 | 0.329 673 | 0.328 674 | 0.327 675 | 0.326 676 | 0.325 677 | 0.324 678 | 0.323 679 | 0.322 680 | 0.321 681 | 0.320 682 | 0.319 683 | 0.318 684 | 0.317 685 | 0.316 686 | 0.315 687 | 0.314 688 | 0.313 689 | 0.312 690 | 0.311 691 | 0.310 692 | 0.309 693 | 0.308 694 | 0.307 695 | 0.306 696 | 0.305 697 | 0.304 698 | 0.303 699 | 0.302 700 | 0.301 701 | 0.300 702 | 0.299 703 | 0.298 704 | 0.297 705 | 0.296 706 | 0.295 707 | 0.294 708 | 0.293 709 | 0.292 710 | 0.291 711 | 0.290 712 | 0.289 713 | 0.288 714 | 0.287 715 | 0.286 716 | 0.285 717 | 0.284 718 | 0.283 719 | 0.282 720 | 0.281 721 | 0.280 722 | 0.279 723 | 0.278 724 | 0.277 725 | 0.276 726 | 0.275 727 | 0.274 728 | 0.273 729 | 0.272 730 | 0.271 731 | 0.270 732 | 0.269 733 | 0.268 734 | 0.267 735 | 0.266 736 | 0.265 737 | 0.264 738 | 0.263 739 | 0.262 740 | 0.261 741 | 0.260 742 | 0.259 
743 | 0.258 744 | 0.257 745 | 0.256 746 | 0.255 747 | 0.254 748 | 0.253 749 | 0.252 750 | 0.251 751 | 0.250 752 | 0.249 753 | 0.248 754 | 0.247 755 | 0.246 756 | 0.245 757 | 0.244 758 | 0.243 759 | 0.242 760 | 0.241 761 | 0.240 762 | 0.239 763 | 0.238 764 | 0.237 765 | 0.236 766 | 0.235 767 | 0.234 768 | 0.233 769 | 0.232 770 | 0.231 771 | 0.230 772 | 0.229 773 | 0.228 774 | 0.227 775 | 0.226 776 | 0.225 777 | 0.224 778 | 0.223 779 | 0.222 780 | 0.221 781 | 0.220 782 | 0.219 783 | 0.218 784 | 0.217 785 | 0.216 786 | 0.215 787 | 0.214 788 | 0.213 789 | 0.212 790 | 0.211 791 | 0.210 792 | 0.209 793 | 0.208 794 | 0.207 795 | 0.206 796 | 0.205 797 | 0.204 798 | 0.203 799 | 0.202 800 | 0.201 801 | 0.200 802 | 0.199 803 | 0.198 804 | 0.197 805 | 0.196 806 | 0.195 807 | 0.194 808 | 0.193 809 | 0.192 810 | 0.191 811 | 0.190 812 | 0.189 813 | 0.188 814 | 0.187 815 | 0.186 816 | 0.185 817 | 0.184 818 | 0.183 819 | 0.182 820 | 0.181 821 | 0.180 822 | 0.179 823 | 0.178 824 | 0.177 825 | 0.176 826 | 0.175 827 | 0.174 828 | 0.173 829 | 0.172 830 | 0.171 831 | 0.170 832 | 0.169 833 | 0.168 834 | 0.167 835 | 0.166 836 | 0.165 837 | 0.164 838 | 0.163 839 | 0.162 840 | 0.161 841 | 0.160 842 | 0.159 843 | 0.158 844 | 0.157 845 | 0.156 846 | 0.155 847 | 0.154 848 | 0.153 849 | 0.152 850 | 0.151 851 | 0.150 852 | 0.149 853 | 0.148 854 | 0.147 855 | 0.146 856 | 0.145 857 | 0.144 858 | 0.143 859 | 0.142 860 | 0.141 861 | 0.140 862 | 0.139 863 | 0.138 864 | 0.137 865 | 0.136 866 | 0.135 867 | 0.134 868 | 0.133 869 | 0.132 870 | 0.131 871 | 0.130 872 | 0.129 873 | 0.128 874 | 0.127 875 | 0.126 876 | 0.125 877 | 0.124 878 | 0.123 879 | 0.122 880 | 0.121 881 | 0.120 882 | 0.119 883 | 0.118 884 | 0.117 885 | 0.116 886 | 0.115 887 | 0.114 888 | 0.113 889 | 0.112 890 | 0.111 891 | 0.110 892 | 0.109 893 | 0.108 894 | 0.107 895 | 0.106 896 | 0.105 897 | 0.104 898 | 0.103 899 | 0.102 900 | 0.101 901 | 0.100 902 | 0.099 903 | 0.098 904 | 0.097 905 | 0.096 906 | 0.095 907 | 0.094 908 | 0.093 909 | 0.092 910 | 0.091 911 | 0.090 912 | 0.089 913 | 0.088 914 | 0.087 915 | 0.086 916 | 0.085 917 | 0.084 918 | 0.083 919 | 0.082 920 | 0.081 921 | 0.080 922 | 0.079 923 | 0.078 924 | 0.077 925 | 0.076 926 | 0.075 927 | 0.074 928 | 0.073 929 | 0.072 930 | 0.071 931 | 0.070 932 | 0.069 933 | 0.068 934 | 0.067 935 | 0.066 936 | 0.065 937 | 0.064 938 | 0.063 939 | 0.062 940 | 0.061 941 | 0.060 942 | 0.059 943 | 0.058 944 | 0.057 945 | 0.056 946 | 0.055 947 | 0.054 948 | 0.053 949 | 0.052 950 | 0.051 951 | 0.050 952 | 0.049 953 | 0.048 954 | 0.047 955 | 0.046 956 | 0.045 957 | 0.044 958 | 0.043 959 | 0.042 960 | 0.041 961 | 0.040 962 | 0.039 963 | 0.038 964 | 0.037 965 | 0.036 966 | 0.035 967 | 0.034 968 | 0.033 969 | 0.032 970 | 0.031 971 | 0.030 972 | 0.029 973 | 0.028 974 | 0.027 975 | 0.026 976 | 0.025 977 | 0.024 978 | 0.023 979 | 0.022 980 | 0.021 981 | 0.020 982 | 0.019 983 | 0.018 984 | 0.017 985 | 0.016 986 | 0.015 987 | 0.014 988 | 0.013 989 | 0.012 990 | 0.011 991 | 0.010 992 | 0.009 993 | 0.008 994 | 0.007 995 | 0.006 996 | 0.005 997 | 0.004 998 | -------------------------------------------------------------------------------- /demo-data/results/AIUPred.thresholds.distribution.txt: -------------------------------------------------------------------------------- 1 | 1.997 2 | 0.997 3 | 0.996 4 | 0.995 5 | 0.994 6 | 0.993 7 | 0.992 8 | 0.991 9 | 0.990 10 | 0.989 11 | 0.988 12 | 0.987 13 | 0.986 14 | 0.985 15 | 0.984 16 | 0.983 17 | 0.982 18 | 0.981 19 | 0.980 20 | 0.979 21 | 0.978 22 | 0.977 23 | 0.976 24 | 0.975 25 | 0.974 26 | 
0.973 27 | 0.972 28 | 0.971 29 | 0.970 30 | 0.969 31 | 0.968 32 | 0.967 33 | 0.966 34 | 0.965 35 | 0.964 36 | 0.963 37 | 0.962 38 | 0.961 39 | 0.960 40 | 0.959 41 | 0.958 42 | 0.957 43 | 0.956 44 | 0.955 45 | 0.954 46 | 0.953 47 | 0.952 48 | 0.951 49 | 0.950 50 | 0.949 51 | 0.948 52 | 0.947 53 | 0.946 54 | 0.945 55 | 0.944 56 | 0.943 57 | 0.942 58 | 0.941 59 | 0.940 60 | 0.939 61 | 0.938 62 | 0.937 63 | 0.936 64 | 0.935 65 | 0.934 66 | 0.933 67 | 0.932 68 | 0.931 69 | 0.930 70 | 0.929 71 | 0.928 72 | 0.927 73 | 0.926 74 | 0.925 75 | 0.924 76 | 0.923 77 | 0.922 78 | 0.921 79 | 0.920 80 | 0.919 81 | 0.918 82 | 0.917 83 | 0.916 84 | 0.915 85 | 0.914 86 | 0.913 87 | 0.912 88 | 0.911 89 | 0.910 90 | 0.909 91 | 0.908 92 | 0.907 93 | 0.906 94 | 0.905 95 | 0.904 96 | 0.903 97 | 0.902 98 | 0.901 99 | 0.900 100 | 0.899 101 | 0.898 102 | 0.897 103 | 0.896 104 | 0.895 105 | 0.894 106 | 0.893 107 | 0.892 108 | 0.891 109 | 0.890 110 | 0.889 111 | 0.888 112 | 0.887 113 | 0.886 114 | 0.885 115 | 0.884 116 | 0.883 117 | 0.882 118 | 0.881 119 | 0.880 120 | 0.879 121 | 0.878 122 | 0.877 123 | 0.876 124 | 0.875 125 | 0.874 126 | 0.873 127 | 0.872 128 | 0.871 129 | 0.870 130 | 0.869 131 | 0.868 132 | 0.867 133 | 0.866 134 | 0.865 135 | 0.864 136 | 0.863 137 | 0.862 138 | 0.861 139 | 0.860 140 | 0.859 141 | 0.858 142 | 0.857 143 | 0.856 144 | 0.855 145 | 0.854 146 | 0.853 147 | 0.852 148 | 0.851 149 | 0.850 150 | 0.849 151 | 0.848 152 | 0.847 153 | 0.846 154 | 0.845 155 | 0.844 156 | 0.843 157 | 0.842 158 | 0.841 159 | 0.840 160 | 0.839 161 | 0.838 162 | 0.837 163 | 0.836 164 | 0.835 165 | 0.834 166 | 0.833 167 | 0.832 168 | 0.831 169 | 0.830 170 | 0.829 171 | 0.828 172 | 0.827 173 | 0.826 174 | 0.825 175 | 0.824 176 | 0.823 177 | 0.822 178 | 0.821 179 | 0.820 180 | 0.819 181 | 0.818 182 | 0.817 183 | 0.816 184 | 0.815 185 | 0.814 186 | 0.813 187 | 0.812 188 | 0.811 189 | 0.810 190 | 0.809 191 | 0.808 192 | 0.807 193 | 0.806 194 | 0.805 195 | 0.804 196 | 0.803 197 | 0.802 198 | 0.801 199 | 0.800 200 | 0.799 201 | 0.798 202 | 0.797 203 | 0.796 204 | 0.795 205 | 0.794 206 | 0.793 207 | 0.792 208 | 0.791 209 | 0.790 210 | 0.789 211 | 0.788 212 | 0.787 213 | 0.786 214 | 0.785 215 | 0.784 216 | 0.783 217 | 0.782 218 | 0.781 219 | 0.780 220 | 0.779 221 | 0.778 222 | 0.777 223 | 0.776 224 | 0.775 225 | 0.774 226 | 0.773 227 | 0.772 228 | 0.771 229 | 0.770 230 | 0.769 231 | 0.768 232 | 0.767 233 | 0.766 234 | 0.765 235 | 0.764 236 | 0.763 237 | 0.762 238 | 0.761 239 | 0.760 240 | 0.759 241 | 0.758 242 | 0.757 243 | 0.756 244 | 0.755 245 | 0.754 246 | 0.753 247 | 0.752 248 | 0.751 249 | 0.750 250 | 0.749 251 | 0.748 252 | 0.747 253 | 0.746 254 | 0.745 255 | 0.744 256 | 0.743 257 | 0.742 258 | 0.741 259 | 0.740 260 | 0.739 261 | 0.738 262 | 0.737 263 | 0.736 264 | 0.735 265 | 0.734 266 | 0.733 267 | 0.732 268 | 0.731 269 | 0.730 270 | 0.729 271 | 0.728 272 | 0.727 273 | 0.726 274 | 0.725 275 | 0.724 276 | 0.723 277 | 0.722 278 | 0.721 279 | 0.720 280 | 0.719 281 | 0.718 282 | 0.717 283 | 0.716 284 | 0.715 285 | 0.714 286 | 0.713 287 | 0.712 288 | 0.711 289 | 0.710 290 | 0.709 291 | 0.708 292 | 0.707 293 | 0.706 294 | 0.705 295 | 0.704 296 | 0.703 297 | 0.702 298 | 0.701 299 | 0.700 300 | 0.699 301 | 0.698 302 | 0.697 303 | 0.696 304 | 0.695 305 | 0.694 306 | 0.693 307 | 0.692 308 | 0.691 309 | 0.690 310 | 0.689 311 | 0.688 312 | 0.687 313 | 0.686 314 | 0.685 315 | 0.684 316 | 0.683 317 | 0.682 318 | 0.681 319 | 0.680 320 | 0.679 321 | 0.678 322 | 0.677 323 | 0.676 324 | 0.675 325 | 0.674 326 | 0.673 327 | 0.672 328 | 
0.671 329 | 0.670 330 | 0.669 331 | 0.668 332 | 0.667 333 | 0.666 334 | 0.665 335 | 0.664 336 | 0.663 337 | 0.662 338 | 0.661 339 | 0.660 340 | 0.659 341 | 0.658 342 | 0.657 343 | 0.656 344 | 0.655 345 | 0.654 346 | 0.653 347 | 0.652 348 | 0.651 349 | 0.650 350 | 0.649 351 | 0.648 352 | 0.647 353 | 0.646 354 | 0.645 355 | 0.644 356 | 0.643 357 | 0.642 358 | 0.641 359 | 0.640 360 | 0.639 361 | 0.638 362 | 0.637 363 | 0.636 364 | 0.635 365 | 0.634 366 | 0.633 367 | 0.632 368 | 0.631 369 | 0.630 370 | 0.629 371 | 0.628 372 | 0.627 373 | 0.626 374 | 0.625 375 | 0.624 376 | 0.623 377 | 0.622 378 | 0.621 379 | 0.620 380 | 0.619 381 | 0.618 382 | 0.617 383 | 0.616 384 | 0.615 385 | 0.614 386 | 0.613 387 | 0.612 388 | 0.611 389 | 0.610 390 | 0.609 391 | 0.608 392 | 0.607 393 | 0.606 394 | 0.605 395 | 0.604 396 | 0.603 397 | 0.602 398 | 0.601 399 | 0.600 400 | 0.599 401 | 0.598 402 | 0.597 403 | 0.596 404 | 0.595 405 | 0.594 406 | 0.593 407 | 0.592 408 | 0.591 409 | 0.590 410 | 0.589 411 | 0.588 412 | 0.587 413 | 0.586 414 | 0.585 415 | 0.584 416 | 0.583 417 | 0.582 418 | 0.581 419 | 0.580 420 | 0.579 421 | 0.578 422 | 0.577 423 | 0.576 424 | 0.575 425 | 0.574 426 | 0.573 427 | 0.572 428 | 0.571 429 | 0.570 430 | 0.569 431 | 0.568 432 | 0.567 433 | 0.566 434 | 0.565 435 | 0.564 436 | 0.563 437 | 0.562 438 | 0.561 439 | 0.560 440 | 0.559 441 | 0.558 442 | 0.557 443 | 0.556 444 | 0.555 445 | 0.554 446 | 0.553 447 | 0.552 448 | 0.551 449 | 0.550 450 | 0.549 451 | 0.548 452 | 0.547 453 | 0.546 454 | 0.545 455 | 0.544 456 | 0.543 457 | 0.542 458 | 0.541 459 | 0.540 460 | 0.539 461 | 0.538 462 | 0.537 463 | 0.536 464 | 0.535 465 | 0.534 466 | 0.533 467 | 0.532 468 | 0.531 469 | 0.530 470 | 0.529 471 | 0.528 472 | 0.527 473 | 0.526 474 | 0.525 475 | 0.524 476 | 0.523 477 | 0.522 478 | 0.521 479 | 0.520 480 | 0.519 481 | 0.518 482 | 0.517 483 | 0.516 484 | 0.515 485 | 0.514 486 | 0.513 487 | 0.512 488 | 0.511 489 | 0.510 490 | 0.509 491 | 0.508 492 | 0.507 493 | 0.506 494 | 0.505 495 | 0.504 496 | 0.503 497 | 0.502 498 | 0.501 499 | 0.500 500 | 0.499 501 | 0.498 502 | 0.497 503 | 0.496 504 | 0.495 505 | 0.494 506 | 0.493 507 | 0.492 508 | 0.491 509 | 0.490 510 | 0.489 511 | 0.488 512 | 0.487 513 | 0.486 514 | 0.485 515 | 0.484 516 | 0.483 517 | 0.482 518 | 0.481 519 | 0.480 520 | 0.479 521 | 0.478 522 | 0.477 523 | 0.476 524 | 0.475 525 | 0.474 526 | 0.473 527 | 0.472 528 | 0.471 529 | 0.470 530 | 0.469 531 | 0.468 532 | 0.467 533 | 0.466 534 | 0.465 535 | 0.464 536 | 0.463 537 | 0.462 538 | 0.461 539 | 0.460 540 | 0.459 541 | 0.458 542 | 0.457 543 | 0.456 544 | 0.455 545 | 0.454 546 | 0.453 547 | 0.452 548 | 0.451 549 | 0.450 550 | 0.449 551 | 0.448 552 | 0.447 553 | 0.446 554 | 0.445 555 | 0.444 556 | 0.443 557 | 0.442 558 | 0.441 559 | 0.440 560 | 0.439 561 | 0.438 562 | 0.437 563 | 0.436 564 | 0.435 565 | 0.434 566 | 0.433 567 | 0.432 568 | 0.431 569 | 0.430 570 | 0.429 571 | 0.428 572 | 0.427 573 | 0.426 574 | 0.425 575 | 0.424 576 | 0.423 577 | 0.422 578 | 0.421 579 | 0.420 580 | 0.419 581 | 0.418 582 | 0.417 583 | 0.416 584 | 0.415 585 | 0.414 586 | 0.413 587 | 0.412 588 | 0.411 589 | 0.410 590 | 0.409 591 | 0.408 592 | 0.407 593 | 0.406 594 | 0.405 595 | 0.404 596 | 0.403 597 | 0.402 598 | 0.401 599 | 0.400 600 | 0.399 601 | 0.398 602 | 0.397 603 | 0.396 604 | 0.395 605 | 0.394 606 | 0.393 607 | 0.392 608 | 0.391 609 | 0.390 610 | 0.389 611 | 0.388 612 | 0.387 613 | 0.386 614 | 0.385 615 | 0.384 616 | 0.383 617 | 0.382 618 | 0.381 619 | 0.380 620 | 0.379 621 | 0.378 622 | 0.377 623 | 0.376 624 | 
0.375 625 | 0.374 626 | 0.373 627 | 0.372 628 | 0.371 629 | 0.370 630 | 0.369 631 | 0.368 632 | 0.367 633 | 0.366 634 | 0.365 635 | 0.364 636 | 0.363 637 | 0.362 638 | 0.361 639 | 0.360 640 | 0.359 641 | 0.358 642 | 0.357 643 | 0.356 644 | 0.355 645 | 0.354 646 | 0.353 647 | 0.352 648 | 0.351 649 | 0.350 650 | 0.349 651 | 0.348 652 | 0.347 653 | 0.346 654 | 0.345 655 | 0.344 656 | 0.343 657 | 0.342 658 | 0.341 659 | 0.340 660 | 0.339 661 | 0.338 662 | 0.337 663 | 0.336 664 | 0.335 665 | 0.334 666 | 0.333 667 | 0.332 668 | 0.331 669 | 0.330 670 | 0.329 671 | 0.328 672 | 0.327 673 | 0.326 674 | 0.325 675 | 0.324 676 | 0.323 677 | 0.322 678 | 0.321 679 | 0.320 680 | 0.319 681 | 0.318 682 | 0.317 683 | 0.316 684 | 0.315 685 | 0.314 686 | 0.313 687 | 0.312 688 | 0.311 689 | 0.310 690 | 0.309 691 | 0.308 692 | 0.307 693 | 0.306 694 | 0.305 695 | 0.304 696 | 0.303 697 | 0.302 698 | 0.301 699 | 0.300 700 | 0.299 701 | 0.298 702 | 0.297 703 | 0.296 704 | 0.295 705 | 0.294 706 | 0.293 707 | 0.292 708 | 0.291 709 | 0.290 710 | 0.289 711 | 0.288 712 | 0.287 713 | 0.286 714 | 0.285 715 | 0.284 716 | 0.283 717 | 0.282 718 | 0.281 719 | 0.280 720 | 0.279 721 | 0.278 722 | 0.277 723 | 0.276 724 | 0.275 725 | 0.274 726 | 0.273 727 | 0.272 728 | 0.271 729 | 0.270 730 | 0.269 731 | 0.268 732 | 0.267 733 | 0.266 734 | 0.265 735 | 0.264 736 | 0.263 737 | 0.262 738 | 0.261 739 | 0.260 740 | 0.259 741 | 0.258 742 | 0.257 743 | 0.256 744 | 0.255 745 | 0.254 746 | 0.253 747 | 0.252 748 | 0.251 749 | 0.250 750 | 0.249 751 | 0.248 752 | 0.247 753 | 0.246 754 | 0.245 755 | 0.244 756 | 0.243 757 | 0.242 758 | 0.241 759 | 0.240 760 | 0.239 761 | 0.238 762 | 0.237 763 | 0.236 764 | 0.235 765 | 0.234 766 | 0.233 767 | 0.232 768 | 0.231 769 | 0.230 770 | 0.229 771 | 0.228 772 | 0.227 773 | 0.226 774 | 0.225 775 | 0.224 776 | 0.223 777 | 0.222 778 | 0.221 779 | 0.220 780 | 0.219 781 | 0.218 782 | 0.217 783 | 0.216 784 | 0.215 785 | 0.214 786 | 0.213 787 | 0.212 788 | 0.211 789 | 0.210 790 | 0.209 791 | 0.208 792 | 0.207 793 | 0.206 794 | 0.205 795 | 0.204 796 | 0.203 797 | 0.202 798 | 0.201 799 | 0.200 800 | 0.199 801 | 0.198 802 | 0.197 803 | 0.196 804 | 0.195 805 | 0.194 806 | 0.193 807 | 0.192 808 | 0.191 809 | 0.190 810 | 0.189 811 | 0.188 812 | 0.187 813 | 0.186 814 | 0.185 815 | 0.184 816 | 0.183 817 | 0.182 818 | 0.181 819 | 0.180 820 | 0.179 821 | 0.178 822 | 0.177 823 | 0.176 824 | 0.175 825 | 0.174 826 | 0.173 827 | 0.172 828 | 0.171 829 | 0.170 830 | 0.169 831 | 0.168 832 | 0.167 833 | 0.166 834 | 0.165 835 | 0.164 836 | 0.163 837 | 0.162 838 | 0.161 839 | 0.160 840 | 0.159 841 | 0.158 842 | 0.157 843 | 0.156 844 | 0.155 845 | 0.154 846 | 0.153 847 | 0.152 848 | 0.151 849 | 0.150 850 | 0.149 851 | 0.148 852 | 0.147 853 | 0.146 854 | 0.145 855 | 0.144 856 | 0.143 857 | 0.142 858 | 0.141 859 | 0.140 860 | 0.139 861 | 0.138 862 | 0.137 863 | 0.136 864 | 0.135 865 | 0.134 866 | 0.133 867 | 0.132 868 | 0.131 869 | 0.130 870 | 0.129 871 | 0.128 872 | 0.127 873 | 0.126 874 | 0.125 875 | 0.124 876 | 0.123 877 | 0.122 878 | 0.121 879 | 0.120 880 | 0.119 881 | 0.118 882 | 0.117 883 | 0.116 884 | 0.115 885 | 0.114 886 | 0.113 887 | 0.112 888 | 0.111 889 | 0.110 890 | 0.109 891 | 0.108 892 | 0.107 893 | 0.106 894 | 0.105 895 | 0.104 896 | 0.103 897 | 0.102 898 | 0.101 899 | 0.100 900 | 0.099 901 | 0.098 902 | 0.097 903 | 0.096 904 | 0.095 905 | 0.094 906 | 0.093 907 | 0.092 908 | 0.091 909 | 0.090 910 | 0.089 911 | 0.088 912 | 0.087 913 | 0.086 914 | 0.085 915 | 0.084 916 | 0.083 917 | 0.082 918 | 0.081 919 | 0.080 920 | 
0.079 921 | 0.078 922 | 0.077 923 | 0.076 924 | 0.075 925 | 0.074 926 | 0.073 927 | 0.072 928 | 0.071 929 | 0.070 930 | 0.069 931 | 0.068 932 | 0.067 933 | 0.066 934 | 0.065 935 | 0.064 936 | 0.063 937 | 0.062 938 | 0.061 939 | 0.060 940 | 0.059 941 | 0.058 942 | 0.057 943 | 0.056 944 | 0.055 945 | 0.054 946 | 0.053 947 | 0.052 948 | 0.051 949 | 0.050 950 | 0.049 951 | 0.048 952 | 0.047 953 | 0.046 954 | 0.045 955 | 0.044 956 | 0.043 957 | 0.042 958 | 0.041 959 | 0.040 960 | 0.039 961 | 0.038 962 | 0.037 963 | 0.036 964 | 0.035 965 | 0.034 966 | 0.033 967 | 0.032 968 | 0.031 969 | 0.030 970 | 0.029 971 | 0.028 972 | 0.027 973 | 0.026 974 | 0.025 975 | 0.024 976 | 0.023 977 | 0.022 978 | 0.021 979 | 0.020 980 | 0.019 981 | 0.018 982 | 0.017 983 | 0.016 984 | 0.015 985 | 0.014 986 | 0.013 987 | 0.012 988 | 0.011 989 | 0.010 990 | 0.009 991 | 0.008 992 | 0.007 993 | 0.006 994 | 0.005 995 | 0.004 996 | 0.003 997 | 0.002 998 | 0.001 999 | 0.000 1000 | -------------------------------------------------------------------------------- /vectorized_metrics/vectorized_metrics.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import shutil 4 | import warnings 5 | from pathlib import Path 6 | from typing import Callable, List, Tuple, Union 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from scipy import stats 11 | from tqdm import tqdm 12 | 13 | from .logger import set_logger 14 | from .parsers import parse_prediction, parse_reference, parse_thresholds 15 | 16 | 17 | def ignore_numpy_warning(func: Callable) -> Callable: 18 | def wrapper(*arguments): 19 | with warnings.catch_warnings(): 20 | warnings.filterwarnings('ignore') 21 | return func(*arguments) 22 | 23 | return wrapper 24 | 25 | 26 | def binary_clf_curve(y_true: np.ndarray, y_score: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 27 | """Calculate true and false positives per binary classification threshold. 28 | 29 | :param y_true: True labels of binary classifications 30 | :type y_true: np.ndarray 31 | :param y_score: Estimated probabilities or decision function 32 | :type y_score: np.ndarray 33 | :return: 34 | - fps: A count of false positives, at index i being the number of negative samples assigned a 35 | score >= thresholds[i]. The total number of negative samples is equal to fps[-1] (thus true negatives 36 | are given by fps[-1] - fps); 37 | - tps: An increasing count of true positives, at index i being the number of positive samples assigned a 38 | score >= thresholds[i]. The total number of positive samples is equal to tps[-1] (thus false negatives 39 | are given by tps[-1] - tps); 40 | - thresholds: Decreasing unique score values 41 | :rtype: Tuple[np.ndarray, np.ndarray, np.ndarray] 42 | """ 43 | logging.debug("calculating binary clf curve") 44 | pos_label = 1.0 45 | 46 | # make y_true a boolean vector 47 | y_true = (y_true == pos_label) 48 | 49 | # sort scores and corresponding truth values 50 | desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] 51 | y_score = y_score[desc_score_indices] 52 | y_true = y_true[desc_score_indices] 53 | 54 | # y_score typically has many tied values. Here we extract the indices associated with the distinct values. 55 | # We also concatenate a value for the end of the curve. 
56 |     distinct_value_indices = np.where(np.diff(y_score))[0]
57 |     threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
58 | 
59 |     # accumulate the true positives with decreasing threshold
60 |     tps = np.cumsum(y_true, dtype=np.float64)[threshold_idxs]
61 |     fps = 1 + threshold_idxs - tps
62 |     thr = y_score[threshold_idxs]
63 | 
64 |     logging.debug('number of scores: {}'.format(len(y_score)))
65 |     logging.debug('number of finite distinct scores: {}'.format(len(set(thr[~np.isnan(thr)]))))
66 |     logging.debug("fps: {}..., tps: {}...".format(fps[:5], tps[:5]))
67 |     return fps, tps, thr
68 | 
69 | 
70 | def roc(fps: np.ndarray, tps: np.ndarray, thresholds: np.ndarray, drop_intermediates: bool = False) -> np.ndarray:
71 |     """Compute Receiver operating characteristic (ROC).
72 | 
73 |     :param fps: cumulative count of false positives, ordered by decreasing score threshold
74 |     :type fps: np.ndarray
75 |     :param tps: cumulative count of true positives, ordered by decreasing score threshold
76 |     :type tps: np.ndarray
77 |     :param thresholds: decreasing score thresholds at which `fps` and `tps` were computed
78 |         (as returned by `binary_clf_curve`)
79 |     :type thresholds: np.ndarray
80 |     :param drop_intermediates: Whether to drop some suboptimal thresholds which would not appear on a plotted ROC
81 |         curve. This is useful in order to create lighter ROC curves.
82 |     :type drop_intermediates: bool
83 |     :return:
84 |         - fpr: Increasing false positive rates such that element i is the false positive rate of predictions
85 |             with score >= thresholds[i];
86 |         - tpr: Increasing true positive rates such that element i is the true positive rate of predictions
87 |             with score >= thresholds[i];
88 |         - thresholds: Decreasing thresholds on the decision function used to compute fpr and tpr. `thresholds[0]`
89 |             represents no instances being predicted and is arbitrarily set to `max(thresholds) + 1`.
90 | 
91 |     :rtype: np.ndarray of shape (3, n_thresholds), stacking fpr, tpr and thresholds as rows
92 |     """
93 |     logging.debug("calculating roc: {} {} {}".format(fps[:5], tps[:5], thresholds[:5]))
94 |     if drop_intermediates is True and len(fps) > 2:
95 |         optimal_idxs = np.where(np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True])[0]
96 |         fps = fps[optimal_idxs]
97 |         tps = tps[optimal_idxs]
98 |         thresholds = thresholds[optimal_idxs]
99 | 
100 |     # Add an extra threshold to make sure that the curve starts at (0, 0)
101 |     tps = np.r_[0, tps]
102 |     fps = np.r_[0, fps]
103 |     thresholds = np.r_[thresholds[0] + 1, thresholds]
104 | 
105 |     if fps[-1] <= 0:
106 |         fpr = np.repeat(np.nan, fps.shape)
107 |     else:
108 |         fpr = fps / fps[-1]
109 | 
110 |     if tps[-1] <= 0:
111 |         tpr = np.repeat(np.nan, tps.shape)
112 |     else:
113 |         tpr = tps / tps[-1]
114 | 
115 |     return np.array([fpr, tpr, thresholds], dtype=np.float64).round(3)
116 | 
117 | 
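# Illustrative usage sketch (inputs are made up, not part of the original module):
# tracing binary_clf_curve and roc on a tiny vector shows the expected shapes.
# Ties in y_score (the two 0.8 values) collapse into a single threshold.
#
# >>> y_true = np.array([1., 1., 0., 1., 0.])
# >>> y_score = np.array([0.9, 0.8, 0.8, 0.5, 0.3])
# >>> fps, tps, thr = binary_clf_curve(y_true, y_score)
# >>> fps, tps, thr
# (array([0., 1., 1., 2.]), array([1., 2., 3., 3.]), array([0.9, 0.8, 0.5, 0.3]))
# >>> roc(fps, tps, thr)  # rows: fpr, tpr, thresholds; note the prepended (0, 0) point
# array([[0.   , 0.   , 0.5  , 0.5  , 1.   ],
#        [0.   , 0.333, 0.667, 1.   , 1.   ],
#        [1.9  , 0.9  , 0.8  , 0.5  , 0.3  ]])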
118 | @ignore_numpy_warning
119 | def pr(fps: np.ndarray, tps: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
120 |     """Compute precision-recall pairs for different probability thresholds
121 | 
122 |     The first precision and recall values are 1. and 0. respectively and do not
123 |     have a corresponding threshold. This ensures that the graph starts on the
124 |     y axis.
125 | 
126 |     :param fps: cumulative count of false positives, ordered by decreasing score threshold
127 |     :type fps: np.ndarray
128 |     :param tps: cumulative count of true positives, ordered by decreasing score threshold
129 |     :type tps: np.ndarray
130 |     :param thresholds: decreasing score thresholds at which `fps` and `tps` were computed. An extra
131 |         threshold, at which no instances are predicted, is prepended to the output as `thresholds[0] + 1`
132 |     :type thresholds: np.ndarray
133 |     :return:
134 |         - precision : precision values such that element i is the precision of predictions with
135 |             score >= thresholds[i]; the first element is 1.
136 |         - recall : increasing recall values such that element i is the recall of predictions with
137 |             score >= thresholds[i]; the first element is 0.
138 |         - thresholds : decreasing thresholds at which precision and recall were computed.
139 |     """
140 | 
141 |     logging.debug("calculating precision recall curve")
142 |     precision = tps / (tps + fps)
143 |     precision[np.isnan(precision)] = 0
144 |     recall = tps / tps[-1]
145 |     recall[np.isnan(recall)] = 0
146 | 
147 |     logging.debug('ppv: {}; rec: {}'.format(precision[:4], recall[:4]))
148 |     return np.array([np.r_[1, precision], np.r_[0, recall], np.r_[thresholds[0] + 1, thresholds]], dtype=np.float64) \
149 |         .round(3)
150 | 
151 | 
152 | def confmat(fps: np.ndarray, tps: np.ndarray) -> np.ndarray:
153 |     """Compute confusion matrix for different probability thresholds
154 | 
155 |     Confusion matrix `[[tn fp] [fn tp]]` for the binary case with labels [0,1] is computed for each threshold.
156 |     Computation starts from the count of fp (`fps` param) and tp (`tps` param) for each threshold. For each
157 |     threshold t in a series of decreasing thresholds, $tn_t$ is calculated as $n - fp_t$, where $n$ is the total
158 |     number of negative labels, given by the last element of `fps` (likewise, $fn_t = p - tp_t$ with $p = tps[-1]$).
159 | 
160 |     :param fps: cumulative count of false positives, ordered by decreasing score threshold
161 |     :type fps: np.ndarray
162 |     :param tps: cumulative count of true positives, ordered by decreasing score threshold
163 |     :type tps: np.ndarray
164 |     :return: array of shape (4, len(fps)) with the counts of TNs, FPs, FNs and TPs for each pair of values
165 |         (FP, TP) in the `fps`, `tps` params.
166 |     """
167 |     logging.debug("calculating confusion matrix")
168 |     # true negatives: total negatives (fps[-1]) minus false positives
169 |     tns = fps[-1] - fps
170 |     # false negatives: total positives (tps[-1]) minus true positives
171 |     fns = tps[-1] - tps
172 |     # tn, fp, fn, tp
173 |     return np.array([tns, fps, fns, tps], dtype=np.float64)
174 | 
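# Illustrative sketch (continuing the toy fps/tps example above, not part of the
# original module): one confusion matrix column per threshold.
# >>> confmat(np.array([0., 1., 1., 2.]), np.array([1., 2., 3., 3.]))
# array([[2., 1., 1., 0.],    # tn
#        [0., 1., 1., 2.],    # fp
#        [2., 1., 0., 0.],    # fn
#        [1., 2., 3., 3.]])   # tp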
175 | 
176 | # TODO: this cannot work since once in a dataframe ref and pred will always have the same length.
177 | #  Missing labels will be replaced by nans and those nans will be indistinguishable from masked regions in reference
178 | #  Nans in predictions should instead be safe to check
179 | def find_length_mismatches(p: pd.DataFrame) -> List[str]:
180 |     """Compare lengths of targets in reference and prediction
181 | 
182 |     Log a warning for each target whose reference and prediction lengths differ,
183 |     then return the list of ids of the targets with inconsistent lengths.
184 | 
185 |     :param p: aligned reference and prediction. It is expected in the format:
186 | 
187 |         |          |   | ref    | ref | predname | predname |
188 |         |----------|---|--------|-----|----------|----------|
189 |         |          |   | states | seq | states   | scores   |
190 |         | target 1 | 1 | 1      | M   | 1        | 0.789    |
191 |         | target 1 | 2 | 1      | S   | 0        | 0.456    |
192 | 
193 |     :type p: pd.DataFrame
194 |     :return: list of target ids where the length of the reference is different from the length of the prediction
195 |     """
196 |     inconsistent_targets = []
197 |     lbl = p.columns.get_level_values(0).unique()[1]
198 |     for tgt, tgt_aligned in p.groupby(level=0):
199 |         ps = tgt_aligned[(lbl, "states")].values
200 |         rs = tgt_aligned[("ref", "states")].values
201 | 
202 |         # if np.any(np.isnan(ps)) and not np.all(np.isnan(ps)):
203 |         if len(ps) < len(rs):
204 |             inconsistent_targets.append(tgt)
205 |             logging.warning("prediction is missing some residues; {} excluded".format(tgt))
206 |         # if np.any(np.isnan(rs)):
207 |         elif len(ps) > len(rs):
208 |             inconsistent_targets.append(tgt)
209 |             logging.warning("prediction is longer than reference; {} excluded".format(tgt))
210 | 
211 |     return inconsistent_targets
212 | 
213 | 
214 | def align_reference_prediction(ref: dict, pred: dict, drop_missing: bool = True) -> Tuple[pd.DataFrame, List]:
215 |     # merge reference and prediction dicts and cast to a pandas DataFrame
216 |     aln_pred = pd.DataFrame({**ref, **pred})
217 |     predname = aln_pred.columns.get_level_values(0)[-1]
218 | 
219 |     logging.debug("aligned reference and prediction; {}".format(predname))
220 |     # check for length mismatch between reference and prediction
221 |     wrong_len_preds = find_length_mismatches(aln_pred)
222 |     # remove targets with length mismatch
223 |     aln_pred = aln_pred.loc[~aln_pred.index.get_level_values(0).isin(wrong_len_preds)]
224 | 
225 |     # remove rows with nan (now it's only possible if all residues are missing)
226 |     isnan = aln_pred.isna().all()
227 |     isnan = isnan[isnan == 1].index.tolist()
228 |     if isnan:
229 |         for p in isnan:
230 |             aln_pred[p] = aln_pred[(p[0], 'states')]
231 | 
232 |     if drop_missing is True:
233 |         aln_pred = aln_pred.dropna(axis=0)
234 | 
235 |     return aln_pred, wrong_len_preds
236 | 
237 | 
238 | def balanced_accuracy(nd_cmat: np.ndarray) -> np.ndarray:
239 |     c = nd_cmat.T.reshape(nd_cmat.shape[1], 2, 2)
240 |     with np.errstate(divide='ignore', invalid='ignore'):
241 |         per_class = np.diagonal(c, axis1=1, axis2=2) / c.sum(axis=2)
242 |     score = np.nanmean(per_class, axis=1)
243 |     return score
244 | 
245 | 
246 | def fbeta(precision: np.ndarray, recall: np.ndarray, beta: Union[float, int] = 1) -> np.ndarray:
247 |     beta2 = beta ** 2
248 |     denom = beta2 * precision + recall
249 |     denom[denom == 0.] = 1  # avoid division by 0
250 |     fbeta = ((1 + beta2) * precision * recall) / denom
251 |     logging.debug("f_{}: denom: {}; score: {}".format(beta, denom[:4], fbeta[:4]))
252 |     return fbeta
253 | 
254 | 
255 | def negative_predictive_value(tn, fn):
256 |     denom = tn + fn
257 |     return np.divide(tn, denom, out=np.zeros_like(tn).astype(float), where=denom != 0)
258 | 
259 | 
260 | def matt_cc(tn, fp, fn, tp):
261 |     numer = (tp * tn - fp * fn)
262 |     denom = (np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
263 |     mcc = np.divide(numer, denom, out=np.zeros_like(numer).astype(float), where=denom != 0)
264 |     logging.debug("mcc: num: {}; denom: {}; mcc: {}".format(numer[:4], denom[:4], mcc[:4]))
265 |     return mcc
266 | 
267 | 
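# Illustrative numeric check (values assumed, not part of the original module):
# for a single confusion matrix with tn=50, fp=10, fn=5, tp=35,
# mcc = (35*50 - 10*5) / sqrt((35+10)*(35+5)*(50+10)*(50+5))
#     = 1700 / sqrt(5940000) ~ 0.698
# >>> matt_cc(np.array([50.]), np.array([10.]), np.array([5.]), np.array([35.]))
# array([0.69752...])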
268 | def auc(x, y):
269 |     """ Compute Area Under the Curve (AUC) using the trapezoidal rule.
270 | 
271 |     :param x: x coordinates. These must be either monotonically increasing or monotonically decreasing
272 |     :type x: np.ndarray
273 |     :param y: y coordinates
274 |     :type y: np.ndarray
275 |     :return: area under the curve
276 |     :rtype: float
277 |     """
278 |     logging.debug("calculating auc")
279 | 
280 |     if x.shape[0] < 2:
281 |         logging.warning('At least 2 points are needed to compute area under curve, but x.shape = %s' % x.shape)
282 |         area = np.nan
283 |     else:
284 |         direction = 1
285 |         dx = np.diff(x)
286 |         if np.any(dx < 0):
287 |             if np.all(dx <= 0):
288 |                 direction = -1
289 |             else:
290 |                 logging.error("x argument of auc is neither monotonically increasing nor decreasing; returning nan")
291 |                 return np.nan
292 | 
293 |         area = direction * np.trapezoid(y, x)  # np.trapz was removed in NumPy 2.0
294 |         if isinstance(area, np.memmap):
295 |             # Reductions such as .sum used internally in np.trapezoid do not return a
296 |             # scalar by default for numpy.memmap instances contrary to
297 |             # regular numpy.ndarray instances.
298 |             area = area.dtype.type(area)
299 |     return area
300 | 
301 | 
302 | def get_metrics(roc_curve, pr_curve, cmats: np.ndarray) -> dict:
303 |     # TODO: check if it's really necessary to split.squeeze cmats
304 |     # unpack per-threshold confusion matrices
305 |     if cmats.shape == (4, 2):
306 |         cmats = np.squeeze(np.split(cmats, 4, 0))
307 |     tn, fp, fn, tp = cmats
308 | 
309 |     # remove first element (it's artificially added in pr func)
310 |     ppv = pr_curve[0][1:]  # precision
311 |     tpr = pr_curve[1][1:]  # sensitivity / recall
312 | 
313 |     # remove first element (they're calculated from an artificially added threshold in roc func)
314 |     fpr = roc_curve[0][1:]  # fall-out
315 |     tnr = 1 - fpr  # specificity / selectivity
316 |     fnr = 1 - tpr  # miss rate
317 | 
318 |     # compute other metrics
319 |     bacc = balanced_accuracy(cmats)
320 |     f1 = fbeta(ppv, tpr)
321 |     f2 = fbeta(ppv, tpr, beta=2)
322 |     f05 = fbeta(ppv, tpr, beta=.5)
323 |     mcc = matt_cc(tn, fp, fn, tp)
324 |     npv = negative_predictive_value(tn, fn)
325 |     fom = 1 - npv  # false omission rate ('for' is a reserved keyword)
326 |     inf = tpr + tnr - 1  # bookmaker informedness
327 |     mk = ppv + npv - 1  # markedness
328 |     csi = tp / (tp + fn + fp)  # critical success index / threat score (denom is never 0 when the reference has positives)
329 | 
330 |     return dict(npv=npv, ppv=ppv, tpr=tpr, tnr=tnr, fpr=fpr, fnr=fnr, fom=fom, csi=csi,
331 |                 bac=bacc, f1s=f1, f2s=f2, f05=f05, mcc=mcc, inf=inf, mk=mk)
332 | 
333 | 
334 | def get_default_threshold(thresholds, predname, pred):
335 |     default_thr = None
336 | 
337 |     if thresholds is not None:
338 |         default_thr = thresholds.get(predname)
339 | 
340 |     if default_thr is None:
341 |         default_thr = calculate_default_threshold(pred)
342 | 
343 |     return default_thr
344 | 
345 | 
346 | def calculate_default_threshold(pred: np.ndarray) -> float:
347 |     # the default threshold is the minimum score among residues predicted as positive
348 |     all_pos = pred[pred[:, 0] == 1]
349 |     if len(all_pos) == 0:
350 |         logging.debug('no positive predictions')
351 |         thr = np.nan
352 |     else:
353 |         thr = all_pos[:, 1].min()
354 |     logging.debug('default threshold: {}'.format(thr))
355 |     return thr
356 | 
357 | 
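# Illustrative sketch (assumed input, not part of the original module): predictions
# arrive as an (n, 2) array of [state, score] rows; the default threshold is the
# smallest score among rows labeled positive.
# >>> calculate_default_threshold(np.array([[0, 0.2], [1, 0.6], [1, 0.8]]))
# 0.6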
358 | def calc_curves_and_metrics(ytrue, yscore):
359 |     logging.debug("calculating curves and metrics")
360 |     logging.debug("positive labels in ref: {}; sum of prediction scores: {}".format(ytrue.sum(), yscore.sum()))
361 |     logging.debug("negative labels in ref: {}".format(len(ytrue) - ytrue.sum()))
362 |     fps, tps, thr = binary_clf_curve(ytrue, yscore)
363 |     roc_curve = roc(fps, tps, thr)
364 |     pr_curve = pr(fps, tps, thr)
365 |     cmat = confmat(fps, tps)
366 |     metrics = get_metrics(roc_curve, pr_curve, cmat)
367 |     return roc_curve, pr_curve, cmat, metrics
368 | 
369 | 
370 | def bootstrap_reference_and_prediction(ytrue, yscore, n=100):
371 |     for idx in (np.random.choice(len(ytrue), size=len(ytrue)) for _ in range(n)):
372 |         ref = ytrue[idx]
373 |         pred = yscore[idx]
374 |         yield calc_curves_and_metrics(ref, pred)
375 | 
376 | 
377 | def confidence_interval(series, interval=0.95):
378 |     # TODO: I don't like that this function returns a pd.Series but it is necessary to have pd.DataFrame
379 |     #  as result of an apply
380 |     mean = series.mean()
381 |     n = series.count()
382 |     test_stat = stats.t.ppf((interval + 1) / 2, n - 1)  # two-sided t quantile with n - 1 degrees of freedom
383 |     norm_test_stat = (test_stat * series.std()) / (n ** 0.5)
384 |     lower_bound = mean - norm_test_stat
385 |     upper_bound = mean + norm_test_stat
386 |     return pd.Series(dict(lo=lower_bound, hi=upper_bound))
387 | 
388 | 
389 | def summary_metrics(roc_curve, pr_curve):
390 |     logging.debug("calculating summary metrics")
391 |     ppv, tpr, _ = pr_curve
392 |     auc_roc = auc(*roc_curve[:-1])
393 |     auc_pr = auc(tpr, ppv)
394 |     logging.debug("calculating average precision score")
395 |     aps = -np.sum(np.diff(tpr[::-1]) * ppv[::-1][:-1])
396 |     logging.debug("building summary metrics dict")
397 |     return dict(aucroc=np.round(auc_roc, 3), aucpr=np.round(auc_pr, 3), aps=np.round(aps, 3))
398 | 
399 | 
400 | def dataset_curves_and_metrics(ytrue, yscore, predname):
401 |     logging.info("calculating dataset curves and metrics")
402 |     roc_curve, pr_curve, cmat, metrics = calc_curves_and_metrics(ytrue, yscore)
403 |     smry_metrics = summary_metrics(roc_curve, pr_curve)
404 | 
405 |     indexes, values = zip(*metrics.items())
406 |     logging.debug("metrics to dataframe")
407 |     metrics = pd.DataFrame(values,
408 |                            columns=roc_curve[2][1:],
409 |                            index=pd.MultiIndex.from_product([[predname], indexes])).round(3)
410 | 
411 |     logging.debug("roc to dataframe")
412 |     roc_df = pd.DataFrame(roc_curve[:-1].T,
413 |                           columns=pd.MultiIndex.from_product([[predname], [smry_metrics["aucroc"]], ["fpr", "tpr"]],
414 |                                                              names=["predictor", "auc", "metric"]),
415 |                           index=roc_curve[-1].round(3))
416 | 
417 |     logging.debug("pr curve to dataframe")
418 |     pr_df = pd.DataFrame(pr_curve[1::-1].T,
419 |                          columns=pd.MultiIndex.from_product(
420 |                              [[predname], [smry_metrics["aucpr"]], [smry_metrics["aps"]], ["tpr", "ppv"]],
421 |                              names=["predictor", "auc", "aps", "metric"]),
422 |                          index=pr_curve[-1].round(3))
423 | 
424 |     logging.debug("confusion matrix to dataframe")
425 |     cmat = pd.DataFrame(zip(*cmat),
426 |                         columns=pd.MultiIndex.from_product([[predname], ["tn", "fp", "fn", "tp"]]),
427 |                         index=roc_curve[-1][1:].round(3)).astype(int)
428 | 
429 |     # logging.debug("dataset metrics done")
430 |     return roc_df, pr_df, cmat, metrics, smry_metrics
431 | 
432 | 
433 | def bootstrap_curves_and_metrics(aln_refpred, predname, n):
434 |     logging.info("Bootstrapping {} times with replacement".format(n))
435 |     bootstrap_metrics = {}
436 | 
437 |     for i, data_bts in enumerate(bootstrap_reference_and_prediction(aln_refpred[('ref', 'states')].values,
438 |                                                                     aln_refpred[(predname, 'scores')].values, n=n)):
439 |         roc_bts, pr_bts, cmat_bts, metrics_bts = data_bts
440 | 
441 |         bts_d = {(i, m): dict(np.stack([roc_bts[2][1:], metrics_bts[m]], axis=1)) for m in metrics_bts}
442 |         bootstrap_metrics = {**bootstrap_metrics, **bts_d}
443 |     # stack bootstrap metrics into a dataframe: one row per (iteration, metric)
444 |     bootstrap_metrics = pd.DataFrame(bootstrap_metrics).round(3).T
445 | 
446 |     logging.debug("bootstrapping done")
447 |     return bootstrap_metrics
448 | 
449 | 
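# Illustrative usage sketch (assumed data, approximate output): confidence_interval
# is designed to be used through DataFrame.apply, yielding a lo/hi pair per column.
# >>> bts = pd.DataFrame({"bac": [0.80, 0.82, 0.79, 0.85], "f1s": [0.70, 0.74, 0.71, 0.69]})
# >>> bts.apply(confidence_interval)  # 95% t-interval of the bootstrap mean, per metric
#        bac     f1s
# lo  ~0.773  ~0.676
# hi  ~0.857  ~0.744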
450 | def target_curves_and_metrics(aln_refpred, predname):
451 |     logging.info("calculating target curves and metrics")
452 | 
453 |     target_metrics = {}
454 |     logging.debug("number of targets: {}".format(len(aln_refpred.index.get_level_values(0).unique())))
455 |     logging.debug("number of predictors: {}".format(len(aln_refpred.columns) / 2))
456 |     for tgt, tgt_scores in aln_refpred.groupby(level=0):
457 |         logging.debug("{} : {}...".format(tgt, tgt_scores[predname]["scores"].values[:4]))
458 |         roc_tgt, pr_tgt, cmat_tgt, metrics_tgt = calc_curves_and_metrics(tgt_scores[('ref', 'states')].values,
459 |                                                                          tgt_scores[(predname, 'scores')].values)
460 |         # save in a data-structure easily convertible to pd.DataFrame
461 |         tgt_d = {(tgt, m): dict(np.stack([roc_tgt[2][1:], metrics_tgt[m]], axis=1)) for m in metrics_tgt}
462 |         # update metrics dict
463 |         target_metrics = {**target_metrics, **tgt_d}
464 | 
465 |     logging.debug("converting target metrics dict to dataframe")
466 |     logging.debug("number of (target, metric) rows: {}".format(len(target_metrics)))
467 |     # deprecated
468 |     # target_metrics = pd.DataFrame(target_metrics).round(3) \
469 |     #     .sort_index(ascending=False) \
470 |     #     .fillna(method='ffill') \
471 |     #     .fillna(method='backfill').T
472 |     target_metrics = pd.DataFrame(target_metrics).round(3).sort_index(ascending=False).ffill().bfill().T
473 |     logging.debug("target metrics and curves done")
474 |     return target_metrics
475 | 
476 | 
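# Illustrative sketch (hypothetical names, not part of the original module): the
# aligned frame consumed above uses a two-level column index (predictor, field) and
# a (target, position) row index, mirroring the table in find_length_mismatches.
# >>> cols = pd.MultiIndex.from_tuples([("ref", "states"), ("ref", "seq"),
# ...                                   ("P1", "states"), ("P1", "scores")])
# >>> rows = pd.MultiIndex.from_tuples([("tgt1", 1), ("tgt1", 2), ("tgt1", 3)])
# >>> aln = pd.DataFrame([[1, "M", 1, 0.789], [1, "S", 0, 0.456], [0, "K", 0, 0.1]],
# ...                    index=rows, columns=cols)
# >>> target_curves_and_metrics(aln, "P1")  # one row per (target, metric), thresholds as columns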
477 | def bvaluation(reference: Path, predictions: list, outpath=".", dataset=True, target=False, bootstrap=False,
478 |                run_tag="analysis", threshold_file=None, normalize=False, accs_to_read=None):
479 |     outpath = Path(outpath)
480 |     outpath.mkdir(parents=True, exist_ok=True)
481 |     refname = reference.stem
482 |     ref_obj, accs = parse_reference(str(reference.resolve(strict=True)), accs_to_read=accs_to_read)  # resolve raises an error if the file doesn't exist
483 |     provided_thr = parse_thresholds(Path(threshold_file)) if threshold_file is not None else None
484 | 
485 |     metrics_to_write = ["f1s", "default"]
486 | 
487 |     roc_curves = []
488 |     pr_curves = []
489 |     cmats = []
490 |     all_preds = {}
491 |     thresholds = {}
492 |     cm_data = {}
493 |     dts_data = {}
494 |     tgt_data = {}
495 |     bts_data = {}
496 |     ci_data = {}
497 | 
498 |     bar = tqdm(predictions, desc="Benchmarking predictions")
499 | 
500 |     for prediction in bar:
501 |         bar.set_description("Benchmarking {}".format(Path(prediction).stem))
502 |         predname = Path(prediction).stem
503 |         pred_obj = parse_prediction(prediction, accs, predname, normalize=normalize)  # returns dict
504 |         aln_ref_pred, wrong_tgt = align_reference_prediction(ref_obj, pred_obj)  # remove targets w/ errors
505 | 
506 |         if aln_ref_pred.empty:
507 |             logging.error('Reference-prediction alignment resulted in an empty dataframe. This is usually due to '
508 |                           'a mismatch between reference and prediction accessions. Skipping {}'.format(predname))
509 |             continue
510 | 
511 |         all_preds.update(pred_obj)  # collect predictions; they are aligned with the reference later
512 | 
513 |         logging.info("number of targets: {}".format(len(aln_ref_pred.groupby(level=0).count())))
514 |         logging.info("number of labels: {}".format(len(aln_ref_pred)))
515 | 
516 |         roc_curve, pr_curve, cmat, dataset_metrics, smry_metrics = dataset_curves_and_metrics(
517 |             aln_ref_pred[('ref', 'states')].values,
518 |             aln_ref_pred[(predname, 'scores')].values,
519 |             predname)
520 |         np.savetxt(outpath / '{}.rawscores.distribution.txt'.format(predname),
521 |                    aln_ref_pred[(predname, 'scores')].values, fmt='%.3f')
522 |         np.savetxt(outpath / '{}.thresholds.distribution.txt'.format(predname),
523 |                    roc_curve.index.values, fmt='%.3f')
524 | 
525 |         if dataset is True:
526 |             dataset_metrics.to_csv(outpath / ".".join([refname, run_tag, predname, "dataset", "metrics", "csv"]))
527 |             roc_curves.append(roc_curve)
528 |             pr_curves.append(pr_curve)
529 |             cmats.append(cmat)
530 | 
531 |         if bootstrap is True:
532 |             bootstrap_metrics = bootstrap_curves_and_metrics(aln_ref_pred, predname, 100)
533 |             bootstrap_metrics.to_csv(outpath / ".".join([refname, run_tag, predname, "bootstrap", "metrics", "csv"]))
534 | 
535 |         if target is True:
536 |             target_metrics = target_curves_and_metrics(aln_ref_pred, predname)
537 |             target_metrics.to_csv(outpath / ".".join([refname, run_tag, predname, "target", "metrics", "csv"]))
538 | 
539 |         # {