├── model-training
│   ├── fix_filenames.py
│   ├── conservation_hmm.groovy
│   └── model-training.sh
├── large-scale-predictions
│   ├── PDBe-predictions
│   │   ├── compute-conservation
│   │   │   ├── compute-conservation.py
│   │   │   └── conservation_hmm_wrapper.py
│   │   ├── unpack_homs.py
│   │   ├── PDBe-predictions.sh
│   │   └── generate_chunks.py
│   ├── AlphaFold-predictions
│   │   ├── proteome_IDs
│   │   ├── AlphaFold-predictions.sh
│   │   └── generate_chunks.py
│   └── Swiss-Prot-predictions
│       ├── generate_chunks.py
│       └── Swiss-Prot-predictions.sh
├── README.md
├── logo
│   ├── logo-ion.svg
│   ├── logo-ligand.svg
│   ├── logo-path.svg
│   ├── logo-ion-path.svg
│   ├── logo-dna.svg
│   ├── logo-ligand-path.svg
│   └── logo.svg
└── model-training-updated
    └── model-training-updated.sh

/model-training/fix_filenames.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from os import listdir
4 | from shutil import copy
5 | 
6 | for filename in listdir():
7 |     if " " in filename:
8 |         copy(filename, filename.replace(" ", "A"))
9 | 
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/compute-conservation/compute-conservation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from sys import argv
4 | 
5 | import conservation_hmm_wrapper
6 | 
7 | conservation_hmm_wrapper.run_conservation_hmm_wrapper(*argv[1:])
8 | 
--------------------------------------------------------------------------------
/model-training/conservation_hmm.groovy:
--------------------------------------------------------------------------------
1 | import cz.siret.prank.program.params.Params
2 | 
3 | /**
4 |  * P2Rank configuration for use with the new, HMMER-based conservation pipeline.
5 | */ 6 | (params as Params).with { 7 | 8 | //model = "conservation_hmm.model" 9 | 10 | features = ["chem","volsite","protrusion","bfactor","conservation"] 11 | 12 | load_conservation = true 13 | 14 | } 15 | -------------------------------------------------------------------------------- /large-scale-predictions/AlphaFold-predictions/proteome_IDs: -------------------------------------------------------------------------------- 1 | UP000000437_7955_DANRE 2 | UP000000559_237561_CANAL 3 | UP000000589_10090_MOUSE 4 | UP000000625_83333_ECOLI 5 | UP000000803_7227_DROME 6 | UP000000805_243232_METJA 7 | UP000001450_36329_PLAF7 8 | UP000001584_83332_MYCTU 9 | UP000001940_6239_CAEEL 10 | UP000002195_44689_DICDI 11 | UP000002296_353153_TRYCC 12 | UP000002311_559292_YEAST 13 | UP000002485_284812_SCHPO 14 | UP000002494_10116_RAT 15 | UP000005640_9606_HUMAN 16 | UP000006548_3702_ARATH 17 | UP000007305_4577_MAIZE 18 | UP000008153_5671_LEIIN 19 | UP000008816_93061_STAA8 20 | UP000008827_3847_SOYBN 21 | UP000059680_39947_ORYSJ 22 | -------------------------------------------------------------------------------- /large-scale-predictions/PDBe-predictions/unpack_homs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from glob import glob 4 | from os import mkdir 5 | from shutil import copy 6 | 7 | conservation_files = glob('chunks/chunk_*/conservation/*.conservation') 8 | 9 | fasta_file_to_pdb_chain_ids = dict() 10 | 11 | with open('chunks.log') as f: 12 | for line in f: 13 | fasta_file, pdb_chain_ids = line.strip().split() 14 | fasta_file_to_pdb_chain_ids[fasta_file] = pdb_chain_ids.split(';') 15 | 16 | mkdir('homs') 17 | 18 | for conservation_file in conservation_files: 19 | for pdb_chain_id in fasta_file_to_pdb_chain_ids[conservation_file.replace('/conservation', '').replace('.conservation', '')]: 20 | copy(conservation_file, 'homs/{}.hom'.format(pdb_chain_id)) 21 | -------------------------------------------------------------------------------- /large-scale-predictions/PDBe-predictions/compute-conservation/conservation_hmm_wrapper.py: -------------------------------------------------------------------------------- 1 | from conservation_hmm.conservation_hmm import run_conservation_hmm 2 | from conservation_hmm.examples.mask_ic_file import mask_ic_file 3 | 4 | 5 | def run_conservation_hmm_wrapper( 6 | fasta_file, 7 | database_file, 8 | working_directory, 9 | target_file, 10 | max_seqs=1000, 11 | max_freqgap=0.5, 12 | mask_string="-1000.0", 13 | ): 14 | weighted_msa_file = run_conservation_hmm( 15 | fasta_file=fasta_file, 16 | database_file=database_file, 17 | working_directory=working_directory, 18 | target_file=target_file + ".unmasked", 19 | max_seqs=max_seqs, 20 | ) 21 | mask_ic_file( 22 | ic_file=target_file + ".unmasked", 23 | freqgap_file=target_file + ".unmasked.freqgap", 24 | target_file=target_file, 25 | max_freqgap=max_freqgap, 26 | mask_string=mask_string, 27 | ) 28 | return weighted_msa_file 29 | -------------------------------------------------------------------------------- /large-scale-predictions/Swiss-Prot-predictions/generate_chunks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from glob import glob 4 | from os import mkdir 5 | 6 | CHUNK_SIZE = 100 7 | 8 | input_fasta_files = glob('fastas/*.fasta') 9 | 10 | sequence_to_ids = dict() 11 | 12 | for input_fasta_file in input_fasta_files: 13 | with open(input_fasta_file) as f: 14 | next(f) 15 | 
sequence = next(f).strip() 16 | if sequence in sequence_to_ids: 17 | sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6]) 18 | else: 19 | sequence_to_ids[sequence] = [] 20 | sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6]) 21 | 22 | sequences = sorted(sequence_to_ids) 23 | sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE)) 24 | 25 | mkdir('chunks/') 26 | for chunk_number, chunk_sequences in enumerate(sequence_chunks): 27 | mkdir('chunks/chunk_{}/'.format(chunk_number)) 28 | for sequence_number, sequence in enumerate(chunk_sequences): 29 | with open('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), mode='w') as f: 30 | print('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), ';'.join(sequence_to_ids[sequence]), sep='\t') 31 | f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n') 32 | -------------------------------------------------------------------------------- /large-scale-predictions/PDBe-predictions/PDBe-predictions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up P2Rank 4 | cp ../p2rank_2.4-beta.3.tar.gz . 5 | tar xzf p2rank_2.4-beta.3.tar.gz 6 | rm p2rank_2.4-beta.3.tar.gz 7 | cd p2rank_2.4-beta.3/ 8 | sed -i 's/2048m/16G/' prank 9 | rm -r config/ models/ 10 | cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ . 11 | cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ . 12 | 13 | # Download the PDB archive (see https://www.wwpdb.org/ftp/pdb-ftp-sites) 14 | mkdir datasets 15 | cd datasets/ 16 | rsync -rlpt -v -z --delete \ 17 | rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/pdb/ \ 18 | ./pdb > download-pdb.log 19 | 20 | # Extract sequences in FASTA format 21 | for f in $(grep .ent.gz download-pdb.log) 22 | do 23 | echo pdb/${f} >> pdb.ds 24 | done 25 | cd ../ 26 | ./prank analyze fasta-masked datasets/pdb.ds -o datasets/fastas/ &> datasets/fastas.log 27 | 28 | # Find unique sequences and split them into chunks for parallel processing 29 | cd datasets/ 30 | ../../generate_chunks.py > chunks.log 31 | 32 | # Compute conservation scores for individual chunk sequences 33 | # ... 34 | 35 | # Unpack the conservation score files 36 | ../../unpack_homs.py 37 | 38 | # Run the predictions 39 | mkdir predictions 40 | cd ../ 41 | for configuration in conservation_hmm default 42 | do 43 | ./prank eval-predict datasets/pdb.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log 44 | done 45 | -------------------------------------------------------------------------------- /large-scale-predictions/Swiss-Prot-predictions/Swiss-Prot-predictions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up P2Rank 4 | cp ../p2rank_2.4-beta.3.tar.gz . 5 | tar xzf p2rank_2.4-beta.3.tar.gz 6 | rm p2rank_2.4-beta.3.tar.gz 7 | cd p2rank_2.4-beta.3/ 8 | sed -i 's/2048m/16G/' prank 9 | rm -r config/ models/ 10 | cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ . 11 | cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ . 
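# (The setup above swaps the stock config/ and models/ directories for those
# from the model-training P2Rank installation and raises the memory limit in
# the prank launcher script from 2048m to 16G.)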
12 | 13 | # Download the predicted structures 14 | mkdir datasets 15 | cd datasets/ 16 | mkdir swissprot 17 | cd swissprot/ 18 | wget -q https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar 19 | tar xf swissprot_pdb_v2.tar 20 | rm swissprot_pdb_v2.tar 21 | cd ../ 22 | for f in swissprot/*.pdb.gz 23 | do 24 | echo ${f} >> swissprot.ds 25 | done 26 | 27 | # Extract sequences in FASTA format 28 | cd ../ 29 | ./prank analyze fasta-masked datasets/swissprot.ds -o datasets/fastas/ &> datasets/fastas.log 30 | 31 | # Find unique sequences and split them into chunks for parallel processing 32 | cd datasets/ 33 | ../../generate_chunks.py > chunks.log 34 | 35 | # Compute conservation scores for individual chunk sequences 36 | # ... 37 | 38 | # Unpack the conservation score files 39 | ../../../PDBe-predictions/unpack_homs.py 40 | 41 | # Run the predictions 42 | mkdir predictions 43 | cd ../ 44 | for configuration in alphafold_conservation_hmm alphafold 45 | do 46 | ./prank eval-predict datasets/swissprot.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log 47 | done 48 | -------------------------------------------------------------------------------- /large-scale-predictions/AlphaFold-predictions/AlphaFold-predictions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Set up P2Rank 4 | cp ../p2rank_2.4-beta.3.tar.gz . 5 | tar xzf p2rank_2.4-beta.3.tar.gz 6 | rm p2rank_2.4-beta.3.tar.gz 7 | cd p2rank_2.4-beta.3/ 8 | sed -i 's/2048m/16G/' prank 9 | rm -r config/ models/ 10 | cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ . 11 | cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ . 12 | 13 | # Download the predicted structures 14 | mkdir datasets 15 | cd datasets/ 16 | for proteome_ID in $(cat ../../proteome_IDs) 17 | do 18 | mkdir ${proteome_ID} 19 | cd ${proteome_ID}/ 20 | wget -q http://ftp.ebi.ac.uk/pub/databases/alphafold/${proteome_ID}.tar 21 | tar xf ${proteome_ID}.tar 22 | rm ${proteome_ID}.tar 23 | cd ../ 24 | for f in ${proteome_ID}/*.pdb.gz 25 | do 26 | echo ${f} >> alphafold.ds 27 | done 28 | done 29 | 30 | # Extract sequences in FASTA format 31 | cd ../ 32 | ./prank analyze fasta-masked datasets/alphafold.ds -o datasets/fastas/ &> datasets/fastas.log 33 | 34 | # Find unique sequences and split them into chunks for parallel processing 35 | cd datasets/ 36 | ../../generate_chunks.py > chunks.log 37 | 38 | # Compute conservation scores for individual chunk sequences 39 | # ... 
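# The conservation-scoring step is elided ("# ...") here, just as in the PDBe
# and Swiss-Prot scripts. A minimal sketch of what it might look like, modelled
# on the compute-conservation.py invocation in model-training-updated.sh; the
# database path is a placeholder, and the serial loop stands in for whatever
# parallel scheduling over chunks was actually used:
for chunk in chunks/chunk_*/
do
    mkdir -p "${chunk}"conservation/
    for fasta_file in "${chunk}"sequence_*.fasta
    do
        mkdir tmp_dir
        ../../../PDBe-predictions/compute-conservation/compute-conservation.py \
            "${fasta_file}" /path/to/uniref50.fasta tmp_dir/ \
            "${chunk}"conservation/"$(basename "${fasta_file}")".conservation
        rm -r tmp_dir/
    done
done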
40 | 41 | # Unpack the conservation score files 42 | ../../../PDBe-predictions/unpack_homs.py 43 | 44 | # Run the predictions 45 | mkdir predictions 46 | cd ../ 47 | for configuration in alphafold_conservation_hmm alphafold 48 | do 49 | ./prank eval-predict datasets/alphafold.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log 50 | done 51 | -------------------------------------------------------------------------------- /large-scale-predictions/PDBe-predictions/generate_chunks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from glob import glob 4 | from os import mkdir 5 | from shutil import copy 6 | 7 | CHUNK_SIZE = 100 8 | 9 | 10 | def generate_sequence_to_legacy_fasta_file_mapping(): 11 | sequence_to_legacy_fasta_file_mapping = dict() 12 | legacy_fasta_files = glob('../../../../P2Rank/PDBe-predictions/p2rank_2.3.1/datasets/chunks/chunk_*/*.fasta') 13 | for legacy_fasta_file in legacy_fasta_files: 14 | with open(legacy_fasta_file) as f: 15 | next(f) 16 | sequence = next(f).strip() 17 | sequence_to_legacy_fasta_file_mapping[sequence] = legacy_fasta_file 18 | return sequence_to_legacy_fasta_file_mapping 19 | 20 | 21 | sequence_to_legacy_fasta_file_mapping = generate_sequence_to_legacy_fasta_file_mapping() 22 | 23 | input_fasta_files = glob('fastas/*.fasta') 24 | 25 | sequence_to_ids = dict() 26 | 27 | for input_fasta_file in input_fasta_files: 28 | with open(input_fasta_file) as f: 29 | next(f) 30 | sequence = next(f).strip() 31 | if sequence in sequence_to_ids: 32 | sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6]) 33 | else: 34 | sequence_to_ids[sequence] = [] 35 | sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6]) 36 | 37 | sequences = sorted(sequence_to_ids) 38 | sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE)) 39 | 40 | mkdir('chunks/') 41 | for chunk_number, chunk_sequences in enumerate(sequence_chunks): 42 | mkdir('chunks/chunk_{}/'.format(chunk_number)) 43 | mkdir('chunks/chunk_{}/conservation/'.format(chunk_number)) 44 | for sequence_number, sequence in enumerate(chunk_sequences): 45 | with open('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), mode='w') as f: 46 | print('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), ';'.join(sequence_to_ids[sequence]), sep='\t') 47 | f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n') 48 | if sequence in sequence_to_legacy_fasta_file_mapping: 49 | copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation'.format(chunk_number, sequence_number)) 50 | copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation.unmasked', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation.unmasked'.format(chunk_number, sequence_number)) 51 | copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation.unmasked.freqgap', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation.unmasked.freqgap'.format(chunk_number, sequence_number)) 52 | -------------------------------------------------------------------------------- /large-scale-predictions/AlphaFold-predictions/generate_chunks.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from glob import glob
4 | from os import mkdir
5 | from shutil import copy
6 | 
7 | CHUNK_SIZE = 100
8 | 
9 | 
10 | def generate_sequence_to_legacy_fasta_file_mapping():
11 |     sequence_to_legacy_fasta_file_mapping = dict()
12 |     legacy_fasta_files = glob('../../../../P2Rank/AlphaFold-predictions/p2rank_2.3.1/datasets/chunks/chunk_*/*.fasta')
13 |     for legacy_fasta_file in legacy_fasta_files:
14 |         with open(legacy_fasta_file) as f:
15 |             next(f)
16 |             sequence = next(f).strip()
17 |         sequence_to_legacy_fasta_file_mapping[sequence] = legacy_fasta_file
18 |     return sequence_to_legacy_fasta_file_mapping
19 | 
20 | 
21 | sequence_to_legacy_fasta_file_mapping = generate_sequence_to_legacy_fasta_file_mapping()
22 | 
23 | input_fasta_files = glob('fastas/*.fasta')
24 | 
25 | sequence_to_ids = dict()
26 | 
27 | for input_fasta_file in input_fasta_files:
28 |     with open(input_fasta_file) as f:
29 |         next(f)
30 |         sequence = next(f).strip()
31 |     if sequence in sequence_to_ids:
32 |         sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6])
33 |     else:
34 |         sequence_to_ids[sequence] = []
35 |         sequence_to_ids[sequence].append(input_fasta_file.split('/')[1][:-6])
36 | 
37 | sequences = sorted(sequence_to_ids)
38 | sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE))
39 | 
40 | mkdir('chunks/')
41 | for chunk_number, chunk_sequences in enumerate(sequence_chunks):
42 |     mkdir('chunks/chunk_{}/'.format(chunk_number))
43 |     mkdir('chunks/chunk_{}/conservation/'.format(chunk_number))
44 |     for sequence_number, sequence in enumerate(chunk_sequences):
45 |         with open('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), mode='w') as f:
46 |             print('chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number), ';'.join(sequence_to_ids[sequence]), sep='\t')
47 |             f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n')
48 |         if sequence in sequence_to_legacy_fasta_file_mapping:
49 |             copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation'.format(chunk_number, sequence_number))
50 |             copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation.unmasked', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation.unmasked'.format(chunk_number, sequence_number))
51 |             copy(sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_') + '.conservation.unmasked.freqgap', 'chunks/chunk_{}/conservation/sequence_{}.fasta.conservation.unmasked.freqgap'.format(chunk_number, sequence_number))
52 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # P2Rank framework
2 | 
3 | P2Rank is a ligand binding site prediction tool that uses machine learning to identify sites on the surface of an input 3D protein structure capable of binding an unspecified small molecule. The P2Rank framework is a loosely coupled set of several components with P2Rank at its core.
4 | 
5 | The purpose of this repository is to serve as the central entry point to the project, containing links to the individual projects, including references to documentation, datasets, etc.
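For a first impression, the command line app can be run like this (illustrative paths, based on P2Rank's own usage examples):

```shell
# Predict binding sites for a single PDB file
./prank predict -f test_data/1fbl.pdb

# Predict binding sites for all structures listed in a dataset file
./prank predict test.ds -o output_dir/
```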
6 | 
7 | ## P2Rank applications
8 | - [Command line app](https://github.com/rdk/p2rank) enabling users to run high-throughput analyses
9 | - [Web app](https://p2rank.cz) supporting online prediction of binding sites and their visual inspection, including export of the results to [PyMOL](https://pymol.org/)
10 | 
11 | ## P2Rank modules
12 | 
13 | - [P2Rank code repository](https://github.com/rdk/p2rank) - the main application, which also serves as the backend of the web app
14 | - [PrankWeb code repository](https://github.com/cusbg/prankweb) - code for the web frontend
15 | - [Old conservation pipeline](https://github.com/cusbg/sequence-conservation) - the pipeline previously used to compute conservation scores, which serve as one of the P2Rank features. Since PrankWeb 3, it has been replaced by the HMM-based conservation pipeline available in the [PrankWeb repo](https://github.com/cusbg/prankweb/tree/main/conservation)
16 | - [PDBe-KB integration](https://github.com/cusbg/p2rank-pdbe-kb) - code used to share predictions with [PDBe-KB](https://www.ebi.ac.uk/pdbe/pdbe-kb)
17 | 
18 | ## Documentation
19 | 
20 | - [Wiki](https://github.com/cusbg/p2rank-framework/wiki) in this repository
21 | - [P2Rank tutorials](https://github.com/rdk/p2rank/tree/develop/misc/tutorials) covering more advanced topics (such as hyperparameter optimization) related to the P2Rank backend (some information may overlap with the documentation in this repository)
22 | 
23 | ## Datasets
24 | - protein-ligand
25 |   - https://github.com/rdk/p2rank-datasets
26 | - protein-DNA
27 |   - https://github.com/cusbg/p2rank-data-dna
28 | 
29 | ## Publications
30 | - Lukáš Polák, Petr Škoda, Kamila Riedlová, Radoslav Krivák, Marian Novotný and David Hoksza. [PrankWeb 4: a modular web server for protein–ligand binding site prediction and downstream analysis](https://doi.org/10.1093/nar/gkaf421). Nucleic Acids Research. May 2025
31 | - Dávid Jakubec, Petr Škoda, Radoslav Krivák, Marian Novotný and David Hoksza. [PrankWeb 3: accelerated ligand-binding site predictions for experimental and modelled protein structures](https://doi.org/10.1093/nar/gkac389). Nucleic Acids Research. May 2022
32 | - Lukáš Jendele, Radoslav Krivák, Petr Škoda, Marian Novotný and David Hoksza. [PrankWeb: a web server for ligand binding site prediction and visualization](https://academic.oup.com/nar/article/47/W1/W345/5494740?login=true). Nucleic Acids Research. May 2019
33 | - Radoslav Krivák and David Hoksza. [P2Rank: machine learning based tool for rapid and accurate prediction of ligand binding sites from protein structure](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0285-8). Journal of Cheminformatics. Aug 2018
34 | 
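The large-scale prediction scripts in this repository feed precomputed conservation scores to P2Rank through a configuration file and the -conservation_dirs option; a representative invocation, adapted from PDBe-predictions.sh in this repository:

```shell
./prank eval-predict datasets/pdb.ds -c conservation_hmm.groovy \
    -conservation_dirs homs/ -o datasets/predictions/conservation_hmm/ -threads 16
```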
--------------------------------------------------------------------------------
/logo/logo-ion.svg:
--------------------------------------------------------------------------------
[SVG logo; only the text elements "P2RANK" and "ION" survive extraction, vector markup omitted]
--------------------------------------------------------------------------------
/model-training/model-training.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Download and set up P2Rank
4 | wget https://github.com/rdk/p2rank/releases/download/2.3/p2rank_2.3.tar.gz
5 | tar xzf p2rank_2.3.tar.gz
6 | rm p2rank_2.3.tar.gz
7 | cd p2rank_2.3/
8 | sed -i 's/2048m/8G/' prank
9 | 
10 | # Download p2rank-datasets
11 | git clone https://github.com/rdk/p2rank-datasets.git
12 | 
13 | # Set up conservation calculations (see conservation_hmm README for requirements)
14 | mkdir conservation
15 | cd conservation/
16 | wget https://raw.githubusercontent.com/cusbg/prankweb/master/conservation/conservation_hmm/conservation_hmm.py
17 | chmod +x conservation_hmm.py
18 | mkdir databases fastas homs
19 | cd databases/
20 | wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
21 | gunzip uniprot_sprot.fasta.gz
22 | wget https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz
23 | gunzip uniref50.fasta.gz
24 | wget https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
25 | cd ../
26 | 
27 | # Prepare fasta files
28 | cd ../
29 | for dataset in chen11 coach420 holo4k joined
30 | do
31 |     ./prank analyze fasta-masked p2rank-datasets/${dataset}.ds -o conservation/fastas/${dataset} &> conservation/fastas/${dataset}.log
32 | done
33 | 
34 | # Calculate conservations
35 | cd conservation/
36 | for dataset in chen11 coach420 holo4k joined
37 | do
38 |     for database in uniprot_sprot uniref50
39 |     do
40 |         mkdir -p homs/${dataset}/${database}
41 |         for fasta_file in fastas/${dataset}/*.fasta
42 |         do
43 |             cp "${fasta_file}" no_spaces.fasta
44 |             mkdir tmp_dir
45 |             ./conservation_hmm.py no_spaces.fasta databases/${database}.fasta tmp_dir/ homs/${dataset}/${database}/"$(basename "${fasta_file}")".hom --max_seqs 1000 &> homs/${dataset}/${database}/"$(basename "${fasta_file}")".log
46 |             rm no_spaces.fasta
47 |             rm -r tmp_dir/
48 |         done
49 |     done
50 | done
51 | 
52 | # Prepare masked conservation files
53 | wget https://raw.githubusercontent.com/cusbg/prankweb/master/conservation/conservation_hmm/examples/mask_ic_file.py
54 | chmod +x mask_ic_file.py
55 | cd homs/
56 | for dataset in chen11 coach420 holo4k joined
57 | do
58 |     cd ${dataset}/
59 |     for database in uniprot_sprot uniref50
60 |     do
61 |         cd ${database}/
62 |         for max_freqgap in 30 50 70 90
63 |         do
64 |             mkdir -p masked_${max_freqgap}
65 |             for hom_file in *.hom
66 |             do
67 |                 ../../../mask_ic_file.py "${hom_file}" "${hom_file}".freqgap masked_${max_freqgap}/"$(basename "${hom_file}" .hom)".masked_${max_freqgap}.hom 0.${max_freqgap} -1000.0
68 |             done
69 |         done
70 |         cd ../
71 |     done
72 |     cd ../
73 | done
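# (Each .hom file above holds per-residue information content scores, and
# conservation_hmm.py writes a matching .freqgap file with per-position gap
# frequencies from the underlying MSA. mask_ic_file.py replaces the score with
# the mask string -1000.0 at positions whose gap frequency is above the given
# max_freqgap threshold, here 0.30, 0.50, 0.70 and 0.90.)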
74 | 
75 | # Fix filenames containing spaces
76 | for dataset in joined
77 | do
78 |     cd ${dataset}/
79 |     for database in uniprot_sprot uniref50
80 |     do
81 |         cd ${database}/
82 |         for max_freqgap in 30 50 70 90
83 |         do
84 |             cd masked_${max_freqgap}/
85 |             cp ../../../../../../fix_filenames.py .
86 |             ./fix_filenames.py
87 |             rm fix_filenames.py
88 |             cd ../
89 |         done
90 |         cd ../
91 |     done
92 |     cd ../
93 | done
94 | 
95 | # Prepare new models
96 | cd ../../
97 | cp ../conservation_hmm.groovy config/
98 | for database in uniprot_sprot uniref50
99 | do
100 |     for max_freqgap in 30 50 70 90
101 |     do
102 |         ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c conservation_hmm.groovy -conservation_dirs "(../conservation/homs/chen11/${database}/masked_${max_freqgap}, ../conservation/homs/joined/${database}/masked_${max_freqgap})" -delete_models false -loop 10 -o new_models/${database}/masked_${max_freqgap} -threads 4
103 |         # ./prank traineval -t p2rank-datasets/chen11-fpocket.ds -e p2rank-datasets/joined.ds -c conservation_hmm.groovy -conservation_dirs "(../conservation/homs/chen11/${database}/masked_${max_freqgap}, ../conservation/homs/joined/${database}/masked_${max_freqgap})" -delete_models false -loop 10 -o new_models/${database}/masked_${max_freqgap} -threads 4
104 |     done
105 | done
106 | 
107 | # Evaluate the new models
108 | for dataset in coach420 holo4k
109 | do
110 |     for database in uniprot_sprot uniref50
111 |     do
112 |         for max_freqgap in 30 50 70 90
113 |         do
114 |             for seed in $(seq 42 1 51)
115 |             do
116 |                 ./prank eval-predict p2rank-datasets/${dataset}.ds -c conservation_hmm.groovy -conservation_dirs ../conservation/homs/${dataset}/${database}/masked_${max_freqgap} -m new_models/${database}/masked_${max_freqgap}/runs/seed.${seed}/FastRandomForest.model -o new_models_evaluation/${dataset}/${database}/masked_${max_freqgap}/runs/seed.${seed} -threads 4
117 |             done
118 |         done
119 |     done
120 | done
121 | 
122 | # Rename the selected model
123 | cp new_models/uniref50/masked_50/runs/seed.45/FastRandomForest.model new_models/uniref50/masked_50/runs/seed.45/conservation_hmm.model
124 | 
--------------------------------------------------------------------------------
/logo/logo-ligand.svg:
--------------------------------------------------------------------------------
[SVG logo; only the text elements "P2RANK" and "ligand" survive extraction, vector markup omitted]
--------------------------------------------------------------------------------
/model-training-updated/model-training-updated.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Set up P2Rank
4 | cp ../p2rank_2.4-beta.3.tar.gz .
5 | tar xzf p2rank_2.4-beta.3.tar.gz 6 | rm p2rank_2.4-beta.3.tar.gz 7 | cd p2rank_2.4-beta.3/ 8 | sed -i 's/2048m/16G/' prank 9 | 10 | # Download p2rank-datasets 11 | git clone https://github.com/rdk/p2rank-datasets.git &> p2rank-datasets.log 12 | 13 | # Set up conservation calculations 14 | mkdir conservation 15 | cd conservation/ 16 | mkdir fastas homs 17 | cd ../ 18 | 19 | # Prepare fasta files 20 | for dataset in chen11 coach420 holo4k joined 21 | do 22 | ./prank analyze fasta-masked p2rank-datasets/${dataset}.ds -o conservation/fastas/${dataset} &> conservation/fastas/${dataset}.log 23 | done 24 | 25 | # Calculate conservations 26 | cd conservation/ 27 | for dataset in chen11 coach420 holo4k joined 28 | do 29 | for database in uniref50 30 | do 31 | mkdir -p homs/${dataset}/${database} 32 | for fasta_file in fastas/${dataset}/*.fasta 33 | do 34 | cp "${fasta_file}" no_spaces.fasta 35 | mkdir tmp_dir 36 | ../../../PDBe-predictions/compute-conservation/compute-conservation.py no_spaces.fasta ../../../PDBe-predictions/compute-conservation/${database}.fasta tmp_dir/ homs/${dataset}/${database}/"$(basename "${fasta_file}")".hom &> homs/${dataset}/${database}/"$(basename "${fasta_file}")".log 37 | rm no_spaces.fasta 38 | rm -r tmp_dir/ 39 | done 40 | done 41 | done 42 | 43 | # Prepare new models 44 | cd ../ 45 | for configuration in alphafold_conservation_hmm alphafold conservation_hmm 46 | do 47 | rm -f models/${configuration}.model 48 | rm -f models/score/${configuration}_*.json 49 | rm -f models/score/residue/${configuration}_*.json 50 | sed 's# model#\/\/ model#' config/${configuration}.groovy | sed 's# zscoretp#\/\/ zscoretp#' | sed 's# probatp#\/\/ probatp#' > config/${configuration}_training.groovy 51 | for database in uniref50 52 | do 53 | ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c ${configuration}_training.groovy -conservation_dirs "(../conservation/homs/chen11/${database}, ../conservation/homs/joined/${database})" -delete_models false -loop 20 -o new_models/${configuration}/${database} -threads 16 54 | done 55 | done 56 | 57 | for configuration in default 58 | do 59 | for database in uniref50 60 | do 61 | ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c ${configuration}.groovy -delete_models false -loop 20 -o new_models/${configuration}/${database} -threads 16 62 | done 63 | rm -f models/${configuration}.model 64 | # rm -f models/score/${configuration}_*.json 65 | # rm -f models/score/residue/p2rank_${configuration}_*.json 66 | done 67 | 68 | cp new_models/alphafold_conservation_hmm/uniref50/runs/seed.46/FastRandomForest.model models/alphafold_conservation_hmm.model 69 | cp new_models/alphafold/uniref50/runs/seed.49/FastRandomForest.model models/alphafold.model 70 | cp new_models/conservation_hmm/uniref50/runs/seed.45/FastRandomForest.model models/conservation_hmm.model 71 | cp new_models/default/uniref50/runs/seed.58/FastRandomForest.model models/default.model 72 | 73 | cp config/default.groovy config/default_training.groovy 74 | mkdir transformers 75 | for configuration in alphafold_conservation_hmm alphafold conservation_hmm default 76 | do 77 | ./prank eval-predict p2rank-datasets/holo4k.ds -c ${configuration}_training.groovy -conservation_dirs ../conservation/homs/holo4k/uniref50 -m ${configuration}.model -o transformers/${configuration} -threads 16 -train_score_transformers "(ProbabilityScoreTransformer, ZscoreTpTransformer)" -train_score_transformers_for_residues true -visualizations false &> 
transformers/${configuration}.log
78 |     cp transformers/${configuration}/score/ProbabilityScoreTransformer.json models/score/${configuration}_ProbabilityScoreTransformer.json
79 |     cp transformers/${configuration}/score/ZscoreTpTransformer.json models/score/${configuration}_ZscoreTpTransformer.json
80 |     cp transformers/${configuration}/residue-score/ProbabilityScoreTransformer.json models/score/residue/${configuration}_ProbabilityScoreTransformer.json
81 |     cp transformers/${configuration}/residue-score/ZscoreTpTransformer.json models/score/residue/${configuration}_ZscoreTpTransformer.json
82 | done
83 | 
84 | sed -i 's/default_probatp.json/default_ProbabilityScoreTransformer.json/' config/default.groovy
85 | sed -i 's/default_zscoretp.json/default_ZscoreTpTransformer.json/' config/default.groovy
86 | sed -i 's/p2rank_default_proba.json/default_ProbabilityScoreTransformer.json/' config/default.groovy
87 | sed -i 's/p2rank_default_zscore.json/default_ZscoreTpTransformer.json/' config/default.groovy
88 | 
89 | # Evaluate the new models
90 | for configuration in alphafold_conservation_hmm alphafold conservation_hmm default
91 | do
92 |     for dataset in coach420 holo4k
93 |     do
94 |         ./prank eval-predict p2rank-datasets/${dataset}.ds -c ${configuration}.groovy -conservation_dirs ../conservation/homs/${dataset}/uniref50 -o new_models_evaluation/${configuration}/${dataset} -threads 16
95 |     done
96 | done
97 | 
--------------------------------------------------------------------------------
/logo/logo-path.svg:
--------------------------------------------------------------------------------
[SVG logo with the lettering converted to paths; no text content survives extraction, vector markup omitted]
--------------------------------------------------------------------------------
/logo/logo-ion-path.svg:
--------------------------------------------------------------------------------
[SVG "ION" logo variant with the lettering converted to paths; no text content survives extraction, vector markup omitted]
--------------------------------------------------------------------------------
/logo/logo-dna.svg:
--------------------------------------------------------------------------------
[SVG logo; only the text elements "P2RANK" and "DNA" survive extraction, vector markup omitted]
--------------------------------------------------------------------------------
/logo/logo-ligand-path.svg:
--------------------------------------------------------------------------------
[SVG "ligand" logo variant with the lettering converted to paths; no text content survives extraction, vector markup omitted]
--------------------------------------------------------------------------------
/logo/logo.svg:
--------------------------------------------------------------------------------
[SVG sheet of logo variants; only the repeated "P2RANK" text elements and the variant numbers 1-11 survive extraction, vector markup omitted]
--------------------------------------------------------------------------------