├── model-training
│   ├── fix_filenames.py
│   ├── conservation_hmm.groovy
│   └── model-training.sh
├── large-scale-predictions
│   ├── PDBe-predictions
│   │   ├── compute-conservation
│   │   │   ├── compute-conservation.py
│   │   │   └── conservation_hmm_wrapper.py
│   │   ├── unpack_homs.py
│   │   ├── PDBe-predictions.sh
│   │   └── generate_chunks.py
│   ├── AlphaFold-predictions
│   │   ├── proteome_IDs
│   │   ├── AlphaFold-predictions.sh
│   │   └── generate_chunks.py
│   └── Swiss-Prot-predictions
│       ├── generate_chunks.py
│       └── Swiss-Prot-predictions.sh
├── README.md
├── logo
│   ├── logo-ion.svg
│   ├── logo-ligand.svg
│   ├── logo-path.svg
│   ├── logo-ion-path.svg
│   ├── logo-dna.svg
│   ├── logo-ligand-path.svg
│   └── logo.svg
└── model-training-updated
    └── model-training-updated.sh
/model-training/fix_filenames.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from os import listdir
from shutil import copy

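# For every file in the working directory whose name contains a space, make a
# copy whose name has each space replaced by "A" (e.g. "foo bar.hom" becomes
# "fooAbar.hom"); the original file is left in place.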
for filename in listdir():
    if " " in filename:
        copy(filename, filename.replace(" ", "A"))
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/compute-conservation/compute-conservation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from sys import argv

import conservation_hmm_wrapper

conservation_hmm_wrapper.run_conservation_hmm_wrapper(*argv[1:])
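
# Thin CLI around run_conservation_hmm_wrapper(); command-line arguments are
# forwarded positionally. A hypothetical invocation (the file names are
# placeholders, not part of the repository):
#
#   ./compute-conservation.py query.fasta uniref50.fasta tmp_dir/ query.fasta.hom
#
# The four arguments map to fasta_file, database_file, working_directory and
# target_file of run_conservation_hmm_wrapper.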
--------------------------------------------------------------------------------
/model-training/conservation_hmm.groovy:
--------------------------------------------------------------------------------
import cz.siret.prank.program.params.Params

/**
 * P2Rank configuration for use with the new, HMMER-based conservation pipeline.
 */
(params as Params).with {

    //model = "conservation_hmm.model"

    features = ["chem","volsite","protrusion","bfactor","conservation"]

    load_conservation = true

}
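
// This configuration is passed to the prank launcher via the -c flag, e.g.
// (cf. model-training.sh; dataset and conservation paths vary per run):
//   ./prank traineval -t <train.ds> -e <eval.ds> -c conservation_hmm.groovy -conservation_dirs <dirs>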
--------------------------------------------------------------------------------
/large-scale-predictions/AlphaFold-predictions/proteome_IDs:
--------------------------------------------------------------------------------
UP000000437_7955_DANRE
UP000000559_237561_CANAL
UP000000589_10090_MOUSE
UP000000625_83333_ECOLI
UP000000803_7227_DROME
UP000000805_243232_METJA
UP000001450_36329_PLAF7
UP000001584_83332_MYCTU
UP000001940_6239_CAEEL
UP000002195_44689_DICDI
UP000002296_353153_TRYCC
UP000002311_559292_YEAST
UP000002485_284812_SCHPO
UP000002494_10116_RAT
UP000005640_9606_HUMAN
UP000006548_3702_ARATH
UP000007305_4577_MAIZE
UP000008153_5671_LEIIN
UP000008816_93061_STAA8
UP000008827_3847_SOYBN
UP000059680_39947_ORYSJ
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/unpack_homs.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from glob import glob
from os import mkdir
from shutil import copy

conservation_files = glob('chunks/chunk_*/conservation/*.conservation')

# Map each chunk FASTA file to the PDB chain IDs that share its sequence, as
# recorded in chunks.log by generate_chunks.py.
fasta_file_to_pdb_chain_ids = dict()

with open('chunks.log') as f:
    for line in f:
        fasta_file, pdb_chain_ids = line.strip().split()
        fasta_file_to_pdb_chain_ids[fasta_file] = pdb_chain_ids.split(';')

mkdir('homs')

for conservation_file in conservation_files:
    # Recover the chunk FASTA file name from the conservation file path.
    fasta_file = conservation_file.replace('/conservation', '').replace('.conservation', '')
    for pdb_chain_id in fasta_file_to_pdb_chain_ids[fasta_file]:
        copy(conservation_file, 'homs/{}.hom'.format(pdb_chain_id))
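
# Example: given a chunks.log line (tab-separated; the chain IDs here are
# hypothetical)
#
#   chunks/chunk_0/sequence_0.fasta	1abcA;2defB
#
# the file chunks/chunk_0/conservation/sequence_0.fasta.conservation is copied
# to both homs/1abcA.hom and homs/2defB.hom.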
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/compute-conservation/conservation_hmm_wrapper.py:
--------------------------------------------------------------------------------
from conservation_hmm.conservation_hmm import run_conservation_hmm
from conservation_hmm.examples.mask_ic_file import mask_ic_file


def run_conservation_hmm_wrapper(
    fasta_file,
    database_file,
    working_directory,
    target_file,
    max_seqs=1000,
    max_freqgap=0.5,
    mask_string="-1000.0",
):
    # Compute per-residue conservation (information content) first, ...
    weighted_msa_file = run_conservation_hmm(
        fasta_file=fasta_file,
        database_file=database_file,
        working_directory=working_directory,
        target_file=target_file + ".unmasked",
        max_seqs=max_seqs,
    )
    # ... then mask positions whose gap frequency exceeds max_freqgap.
    mask_ic_file(
        ic_file=target_file + ".unmasked",
        freqgap_file=target_file + ".unmasked.freqgap",
        target_file=target_file,
        max_freqgap=max_freqgap,
        mask_string=mask_string,
    )
    return weighted_msa_file
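
# Example (file names illustrative): calling
#
#   run_conservation_hmm_wrapper("query.fasta", "uniref50.fasta", "tmp/", "query.hom")
#
# first writes query.hom.unmasked (and its accompanying
# query.hom.unmasked.freqgap), then writes the final query.hom with
# high-gap-frequency positions replaced by mask_string.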
--------------------------------------------------------------------------------
/large-scale-predictions/Swiss-Prot-predictions/generate_chunks.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from glob import glob
from os import mkdir

CHUNK_SIZE = 100

input_fasta_files = glob('fastas/*.fasta')

# Group structure IDs by (identical) sequence so that conservation is computed
# only once per unique sequence.
sequence_to_ids = dict()

for input_fasta_file in input_fasta_files:
    with open(input_fasta_file) as f:
        next(f)  # skip the header line
        sequence = next(f).strip()
    sequence_to_ids.setdefault(sequence, []).append(input_fasta_file.split('/')[1][:-6])

sequences = sorted(sequence_to_ids)
sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE))

mkdir('chunks/')
for chunk_number, chunk_sequences in enumerate(sequence_chunks):
    mkdir('chunks/chunk_{}/'.format(chunk_number))
    for sequence_number, sequence in enumerate(chunk_sequences):
        chunk_fasta_file = 'chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number)
        with open(chunk_fasta_file, mode='w') as f:
            # Log the chunk file together with the IDs it covers (stdout is
            # redirected to chunks.log by the calling script).
            print(chunk_fasta_file, ';'.join(sequence_to_ids[sequence]), sep='\t')
            f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n')
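
# Resulting layout (illustrative): chunks/chunk_0/sequence_0.fasta ...
# chunks/chunk_0/sequence_99.fasta, chunks/chunk_1/sequence_0.fasta, and so on.
# Each FASTA header joins all IDs sharing the sequence with ';', and chunks.log
# maps each chunk FASTA file back to those IDs.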
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/PDBe-predictions.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set up P2Rank
cp ../p2rank_2.4-beta.3.tar.gz .
tar xzf p2rank_2.4-beta.3.tar.gz
rm p2rank_2.4-beta.3.tar.gz
cd p2rank_2.4-beta.3/
sed -i 's/2048m/16G/' prank
rm -r config/ models/
cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ .
cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ .

# Download the PDB archive (see https://www.wwpdb.org/ftp/pdb-ftp-sites)
mkdir datasets
cd datasets/
rsync -rlpt -v -z --delete \
    rsync.ebi.ac.uk::pub/databases/pdb/data/structures/divided/pdb/ \
    ./pdb > download-pdb.log

# Extract sequences in FASTA format
for f in $(grep '\.ent\.gz' download-pdb.log)
do
    echo pdb/${f} >> pdb.ds
done
cd ../
./prank analyze fasta-masked datasets/pdb.ds -o datasets/fastas/ &> datasets/fastas.log

# Find unique sequences and split them into chunks for parallel processing
cd datasets/
../../generate_chunks.py > chunks.log

# Compute conservation scores for individual chunk sequences
# ...
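# A hedged serial sketch of this step (the chunks were processed in parallel
# and the exact commands are not part of this repository;
# /path/to/uniref50.fasta is an assumed database location):
#
#   for fasta in chunks/chunk_*/sequence_*.fasta
#   do
#       mkdir tmp_dir
#       ../../compute-conservation/compute-conservation.py "${fasta}" /path/to/uniref50.fasta tmp_dir/ "$(dirname "${fasta}")/conservation/$(basename "${fasta}").conservation"
#       rm -r tmp_dir/
#   done
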
# Unpack the conservation score files
../../unpack_homs.py

# Run the predictions
mkdir predictions
cd ../
for configuration in conservation_hmm default
do
    ./prank eval-predict datasets/pdb.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log
done
--------------------------------------------------------------------------------
/large-scale-predictions/Swiss-Prot-predictions/Swiss-Prot-predictions.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set up P2Rank
cp ../p2rank_2.4-beta.3.tar.gz .
tar xzf p2rank_2.4-beta.3.tar.gz
rm p2rank_2.4-beta.3.tar.gz
cd p2rank_2.4-beta.3/
sed -i 's/2048m/16G/' prank
rm -r config/ models/
cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ .
cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ .

# Download the predicted structures
mkdir datasets
cd datasets/
mkdir swissprot
cd swissprot/
wget -q https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/swissprot_pdb_v2.tar
tar xf swissprot_pdb_v2.tar
rm swissprot_pdb_v2.tar
cd ../
for f in swissprot/*.pdb.gz
do
    echo ${f} >> swissprot.ds
done

# Extract sequences in FASTA format
cd ../
./prank analyze fasta-masked datasets/swissprot.ds -o datasets/fastas/ &> datasets/fastas.log

# Find unique sequences and split them into chunks for parallel processing
cd datasets/
../../generate_chunks.py > chunks.log

# Compute conservation scores for individual chunk sequences
# ...
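# (This step mirrors the commented sketch in PDBe-predictions.sh, with the
# wrapper invoked as ../../../PDBe-predictions/compute-conservation/compute-conservation.py
# and each chunk's conservation/ directory created beforehand.)
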
# Unpack the conservation score files
../../../PDBe-predictions/unpack_homs.py

# Run the predictions
mkdir predictions
cd ../
for configuration in alphafold_conservation_hmm alphafold
do
    ./prank eval-predict datasets/swissprot.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log
done
--------------------------------------------------------------------------------
/large-scale-predictions/AlphaFold-predictions/AlphaFold-predictions.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set up P2Rank
cp ../p2rank_2.4-beta.3.tar.gz .
tar xzf p2rank_2.4-beta.3.tar.gz
rm p2rank_2.4-beta.3.tar.gz
cd p2rank_2.4-beta.3/
sed -i 's/2048m/16G/' prank
rm -r config/ models/
cp --recursive ../../model-training/p2rank_2.4-beta.3/config/ .
cp --recursive ../../model-training/p2rank_2.4-beta.3/models/ .

# Download the predicted structures
mkdir datasets
cd datasets/
for proteome_ID in $(cat ../../proteome_IDs)
do
    mkdir ${proteome_ID}
    cd ${proteome_ID}/
    wget -q http://ftp.ebi.ac.uk/pub/databases/alphafold/${proteome_ID}.tar
    tar xf ${proteome_ID}.tar
    rm ${proteome_ID}.tar
    cd ../
    for f in ${proteome_ID}/*.pdb.gz
    do
        echo ${f} >> alphafold.ds
    done
done

# Extract sequences in FASTA format
cd ../
./prank analyze fasta-masked datasets/alphafold.ds -o datasets/fastas/ &> datasets/fastas.log

# Find unique sequences and split them into chunks for parallel processing
cd datasets/
../../generate_chunks.py > chunks.log

# Compute conservation scores for individual chunk sequences
# ...
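# (This step mirrors the commented sketch in PDBe-predictions.sh; here the
# wrapper would be invoked as ../../../PDBe-predictions/compute-conservation/compute-conservation.py.)
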
# Unpack the conservation score files
../../../PDBe-predictions/unpack_homs.py

# Run the predictions
mkdir predictions
cd ../
for configuration in alphafold_conservation_hmm alphafold
do
    ./prank eval-predict datasets/alphafold.ds -c ${configuration}.groovy -conservation_dirs homs/ -o datasets/predictions/${configuration}/ -threads 16 &> datasets/predictions/${configuration}.log
done
--------------------------------------------------------------------------------
/large-scale-predictions/PDBe-predictions/generate_chunks.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from glob import glob
from os import mkdir
from shutil import copy

CHUNK_SIZE = 100


def generate_sequence_to_legacy_fasta_file_mapping():
    # Index the chunk FASTA files from the previous (p2rank 2.3.1) run by
    # sequence so that their conservation scores can be reused below.
    sequence_to_legacy_fasta_file_mapping = dict()
    legacy_fasta_files = glob('../../../../P2Rank/PDBe-predictions/p2rank_2.3.1/datasets/chunks/chunk_*/*.fasta')
    for legacy_fasta_file in legacy_fasta_files:
        with open(legacy_fasta_file) as f:
            next(f)  # skip the header line
            sequence = next(f).strip()
        sequence_to_legacy_fasta_file_mapping[sequence] = legacy_fasta_file
    return sequence_to_legacy_fasta_file_mapping


sequence_to_legacy_fasta_file_mapping = generate_sequence_to_legacy_fasta_file_mapping()

input_fasta_files = glob('fastas/*.fasta')

# Group structure IDs by (identical) sequence so that conservation is computed
# only once per unique sequence.
sequence_to_ids = dict()

for input_fasta_file in input_fasta_files:
    with open(input_fasta_file) as f:
        next(f)  # skip the header line
        sequence = next(f).strip()
    sequence_to_ids.setdefault(sequence, []).append(input_fasta_file.split('/')[1][:-6])

sequences = sorted(sequence_to_ids)
sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE))

mkdir('chunks/')
for chunk_number, chunk_sequences in enumerate(sequence_chunks):
    mkdir('chunks/chunk_{}/'.format(chunk_number))
    mkdir('chunks/chunk_{}/conservation/'.format(chunk_number))
    for sequence_number, sequence in enumerate(chunk_sequences):
        chunk_fasta_file = 'chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number)
        with open(chunk_fasta_file, mode='w') as f:
            # Log the chunk file together with the IDs it covers (stdout is
            # redirected to chunks.log by the calling script).
            print(chunk_fasta_file, ';'.join(sequence_to_ids[sequence]), sep='\t')
            f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n')
        if sequence in sequence_to_legacy_fasta_file_mapping:
            # Reuse the legacy conservation files instead of recomputing them.
            legacy_prefix = sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_')
            for suffix in ('.conservation', '.conservation.unmasked', '.conservation.unmasked.freqgap'):
                copy(legacy_prefix + suffix,
                     'chunks/chunk_{}/conservation/sequence_{}.fasta{}'.format(chunk_number, sequence_number, suffix))
--------------------------------------------------------------------------------
/large-scale-predictions/AlphaFold-predictions/generate_chunks.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from glob import glob
from os import mkdir
from shutil import copy

CHUNK_SIZE = 100


def generate_sequence_to_legacy_fasta_file_mapping():
    # Index the chunk FASTA files from the previous (p2rank 2.3.1) run by
    # sequence so that their conservation scores can be reused below.
    sequence_to_legacy_fasta_file_mapping = dict()
    legacy_fasta_files = glob('../../../../P2Rank/AlphaFold-predictions/p2rank_2.3.1/datasets/chunks/chunk_*/*.fasta')
    for legacy_fasta_file in legacy_fasta_files:
        with open(legacy_fasta_file) as f:
            next(f)  # skip the header line
            sequence = next(f).strip()
        sequence_to_legacy_fasta_file_mapping[sequence] = legacy_fasta_file
    return sequence_to_legacy_fasta_file_mapping


sequence_to_legacy_fasta_file_mapping = generate_sequence_to_legacy_fasta_file_mapping()

input_fasta_files = glob('fastas/*.fasta')

# Group structure IDs by (identical) sequence so that conservation is computed
# only once per unique sequence.
sequence_to_ids = dict()

for input_fasta_file in input_fasta_files:
    with open(input_fasta_file) as f:
        next(f)  # skip the header line
        sequence = next(f).strip()
    sequence_to_ids.setdefault(sequence, []).append(input_fasta_file.split('/')[1][:-6])

sequences = sorted(sequence_to_ids)
sequence_chunks = (sequences[i:i + CHUNK_SIZE] for i in range(0, len(sequences), CHUNK_SIZE))

mkdir('chunks/')
for chunk_number, chunk_sequences in enumerate(sequence_chunks):
    mkdir('chunks/chunk_{}/'.format(chunk_number))
    mkdir('chunks/chunk_{}/conservation/'.format(chunk_number))
    for sequence_number, sequence in enumerate(chunk_sequences):
        chunk_fasta_file = 'chunks/chunk_{}/sequence_{}.fasta'.format(chunk_number, sequence_number)
        with open(chunk_fasta_file, mode='w') as f:
            # Log the chunk file together with the IDs it covers (stdout is
            # redirected to chunks.log by the calling script).
            print(chunk_fasta_file, ';'.join(sequence_to_ids[sequence]), sep='\t')
            f.write('>' + ';'.join(sequence_to_ids[sequence]) + '\n' + sequence + '\n')
        if sequence in sequence_to_legacy_fasta_file_mapping:
            # Reuse the legacy conservation files instead of recomputing them.
            legacy_prefix = sequence_to_legacy_fasta_file_mapping[sequence].replace('/sequence_', '/conservation/sequence_')
            for suffix in ('.conservation', '.conservation.unmasked', '.conservation.unmasked.freqgap'):
                copy(legacy_prefix + suffix,
                     'chunks/chunk_{}/conservation/sequence_{}.fasta{}'.format(chunk_number, sequence_number, suffix))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# P2Rank framework

P2Rank is a ligand binding site prediction tool that uses machine learning to identify sites on the surface of an input 3D protein structure capable of binding an unspecified small molecule. The P2Rank framework loosely couples several components, with P2Rank at its core.

The purpose of this repository is to serve as the central entry point to the project, containing links to the individual sub-projects, including references to documentation, datasets, etc.

## P2Rank applications
- [Command line app](https://github.com/rdk/p2rank) enabling users to run high-throughput analyses; see the example below
- [Web app](https://p2rank.cz) supporting online binding site detection and visual inspection of the results, including export of the results to [PyMOL](https://pymol.org/)
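
For example, predicting the binding sites of a single structure with the command line app looks as follows (a minimal sketch; `protein.pdb` is a placeholder for your input file):

```shell
./prank predict -f protein.pdb
```
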
## P2Rank modules

- [P2Rank code repository](https://github.com/rdk/p2rank) - the main app, which also serves as the backend of the web app
- [PrankWeb code repository](https://github.com/cusbg/prankweb) - code for the web frontend
- [Old conservation pipeline](https://github.com/cusbg/sequence-conservation) - the pipeline formerly used to compute conservation, which serves as one of the P2Rank features; in PrankWeb 3 it was replaced by the HMM-based conservation pipeline available in the [PrankWeb repo](https://github.com/cusbg/prankweb/tree/main/conservation)
- [PDBe-KB integration](https://github.com/cusbg/p2rank-pdbe-kb) - code used to share predictions with [PDBe-KB](https://www.ebi.ac.uk/pdbe/pdbe-kb)

## Documentation

- [Wiki](https://github.com/cusbg/p2rank-framework/wiki) in this repository
- [P2Rank tutorials](https://github.com/rdk/p2rank/tree/develop/misc/tutorials) covering more advanced topics related to the P2Rank backend, such as hyperparameter optimization (some information may overlap with the docs available in this repo)

## Datasets
- protein-ligand
  - https://github.com/rdk/p2rank-datasets
- protein-DNA
  - https://github.com/cusbg/p2rank-data-dna

## Publications
- Lukáš Polák, Petr Škoda, Kamila Riedlová, Radoslav Krivák, Marian Novotný and David Hoksza. [PrankWeb 4: a modular web server for protein–ligand binding site prediction and downstream analysis](https://doi.org/10.1093/nar/gkaf421). Nucleic Acids Research. May 2025
- Dávid Jakubec, Petr Škoda, Radoslav Krivák, Marian Novotný and David Hoksza. [PrankWeb 3: accelerated ligand-binding site predictions for experimental and modelled protein structures](https://doi.org/10.1093/nar/gkac389). Nucleic Acids Research. May 2022
- Lukáš Jendele, Radoslav Krivák, Petr Škoda, Marian Novotný and David Hoksza. [PrankWeb: a web server for ligand binding site prediction and visualization](https://academic.oup.com/nar/article/47/W1/W345/5494740). Nucleic Acids Research. May 2019
- Radoslav Krivák and David Hoksza. [P2Rank: machine learning based tool for rapid and accurate prediction of ligand binding sites from protein structure](https://doi.org/10.1186/s13321-018-0285-8). Journal of Cheminformatics. Aug 2018
--------------------------------------------------------------------------------
/logo/logo-ion.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/model-training/model-training.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Download and set up P2Rank
wget https://github.com/rdk/p2rank/releases/download/2.3/p2rank_2.3.tar.gz
tar xzf p2rank_2.3.tar.gz
rm p2rank_2.3.tar.gz
cd p2rank_2.3/
sed -i 's/2048m/8G/' prank

# Download p2rank-datasets
git clone https://github.com/rdk/p2rank-datasets.git

# Set up conservation calculations (see conservation_hmm README for requirements)
mkdir conservation
cd conservation/
wget https://raw.githubusercontent.com/cusbg/prankweb/master/conservation/conservation_hmm/conservation_hmm.py
chmod +x conservation_hmm.py
mkdir databases fastas homs
cd databases/
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
gunzip uniprot_sprot.fasta.gz
wget https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz
gunzip uniref50.fasta.gz
wget https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
cd ../

# Prepare fasta files
cd ../
for dataset in chen11 coach420 holo4k joined
do
    ./prank analyze fasta-masked p2rank-datasets/${dataset}.ds -o conservation/fastas/${dataset} &> conservation/fastas/${dataset}.log
done

# Calculate conservations
cd conservation/
for dataset in chen11 coach420 holo4k joined
do
    for database in uniprot_sprot uniref50
    do
        mkdir -p homs/${dataset}/${database}
        for fasta_file in fastas/${dataset}/*.fasta
        do
            # Work on a fixed-name copy to avoid spaces in file names
            cp "${fasta_file}" no_spaces.fasta
            mkdir tmp_dir
            ./conservation_hmm.py no_spaces.fasta databases/${database}.fasta tmp_dir/ homs/${dataset}/${database}/"$(basename "${fasta_file}")".hom --max_seqs 1000 &> homs/${dataset}/${database}/"$(basename "${fasta_file}")".log
            rm no_spaces.fasta
            rm -r tmp_dir/
        done
    done
done

# Prepare masked conservation files
wget https://raw.githubusercontent.com/cusbg/prankweb/master/conservation/conservation_hmm/examples/mask_ic_file.py
chmod +x mask_ic_file.py
cd homs/
for dataset in chen11 coach420 holo4k joined
do
    cd ${dataset}/
    for database in uniprot_sprot uniref50
    do
        cd ${database}/
        for max_freqgap in 30 50 70 90
        do
            mkdir -p masked_${max_freqgap}
            for hom_file in *.hom
            do
                # 0.${max_freqgap} turns the percentage into a fraction (e.g. 50 -> 0.50)
                ../../../mask_ic_file.py "${hom_file}" "${hom_file}".freqgap masked_${max_freqgap}/"$(basename "${hom_file}" .hom)".masked_${max_freqgap}.hom 0.${max_freqgap} -1000.0
            done
        done
        cd ../
    done
    cd ../
done

# Fix filenames containing spaces
for dataset in joined
do
    cd ${dataset}/
    for database in uniprot_sprot uniref50
    do
        cd ${database}/
        for max_freqgap in 30 50 70 90
        do
            cd masked_${max_freqgap}/
            cp ../../../../../../fix_filenames.py .
            ./fix_filenames.py
            rm fix_filenames.py
            cd ../
        done
        cd ../
    done
    cd ../
done

# Prepare new models
cd ../../
cp ../conservation_hmm.groovy config/
for database in uniprot_sprot uniref50
do
    for max_freqgap in 30 50 70 90
    do
        ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c conservation_hmm.groovy -conservation_dirs "(../conservation/homs/chen11/${database}/masked_${max_freqgap}, ../conservation/homs/joined/${database}/masked_${max_freqgap})" -delete_models false -loop 10 -o new_models/${database}/masked_${max_freqgap} -threads 4
        # ./prank traineval -t p2rank-datasets/chen11-fpocket.ds -e p2rank-datasets/joined.ds -c conservation_hmm.groovy -conservation_dirs "(../conservation/homs/chen11/${database}/masked_${max_freqgap}, ../conservation/homs/joined/${database}/masked_${max_freqgap})" -delete_models false -loop 10 -o new_models/${database}/masked_${max_freqgap} -threads 4
    done
done

# Evaluate the new models
for dataset in coach420 holo4k
do
    for database in uniprot_sprot uniref50
    do
        for max_freqgap in 30 50 70 90
        do
            for seed in $(seq 42 1 51)
            do
                ./prank eval-predict p2rank-datasets/${dataset}.ds -c conservation_hmm.groovy -conservation_dirs ../conservation/homs/${dataset}/${database}/masked_${max_freqgap} -m new_models/${database}/masked_${max_freqgap}/runs/seed.${seed}/FastRandomForest.model -o new_models_evaluation/${dataset}/${database}/masked_${max_freqgap}/runs/seed.${seed} -threads 4
            done
        done
    done
done

# Rename the selected model
cp new_models/uniref50/masked_50/runs/seed.45/FastRandomForest.model new_models/uniref50/masked_50/runs/seed.45/conservation_hmm.model
--------------------------------------------------------------------------------
/logo/logo-ligand.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/model-training-updated/model-training-updated.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Set up P2Rank
cp ../p2rank_2.4-beta.3.tar.gz .
tar xzf p2rank_2.4-beta.3.tar.gz
rm p2rank_2.4-beta.3.tar.gz
cd p2rank_2.4-beta.3/
sed -i 's/2048m/16G/' prank

# Download p2rank-datasets
git clone https://github.com/rdk/p2rank-datasets.git &> p2rank-datasets.log

# Set up conservation calculations
mkdir conservation
cd conservation/
mkdir fastas homs
cd ../

# Prepare fasta files
for dataset in chen11 coach420 holo4k joined
do
    ./prank analyze fasta-masked p2rank-datasets/${dataset}.ds -o conservation/fastas/${dataset} &> conservation/fastas/${dataset}.log
done

# Calculate conservations
cd conservation/
for dataset in chen11 coach420 holo4k joined
do
    for database in uniref50
    do
        mkdir -p homs/${dataset}/${database}
        for fasta_file in fastas/${dataset}/*.fasta
        do
            # Work on a fixed-name copy to avoid spaces in file names
            cp "${fasta_file}" no_spaces.fasta
            mkdir tmp_dir
            ../../../PDBe-predictions/compute-conservation/compute-conservation.py no_spaces.fasta ../../../PDBe-predictions/compute-conservation/${database}.fasta tmp_dir/ homs/${dataset}/${database}/"$(basename "${fasta_file}")".hom &> homs/${dataset}/${database}/"$(basename "${fasta_file}")".log
            rm no_spaces.fasta
            rm -r tmp_dir/
        done
    done
done

# Prepare new models
cd ../
for configuration in alphafold_conservation_hmm alphafold conservation_hmm
do
    rm -f models/${configuration}.model
    rm -f models/score/${configuration}_*.json
    rm -f models/score/residue/${configuration}_*.json
    # Comment out the pre-trained model and score-transformer paths to obtain a training configuration
    sed 's# model#// model#' config/${configuration}.groovy | sed 's# zscoretp#// zscoretp#' | sed 's# probatp#// probatp#' > config/${configuration}_training.groovy
    for database in uniref50
    do
        ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c ${configuration}_training.groovy -conservation_dirs "(../conservation/homs/chen11/${database}, ../conservation/homs/joined/${database})" -delete_models false -loop 20 -o new_models/${configuration}/${database} -threads 16
    done
done

for configuration in default
do
    for database in uniref50
    do
        ./prank traineval -t p2rank-datasets/chen11.ds -e p2rank-datasets/joined.ds -c ${configuration}.groovy -delete_models false -loop 20 -o new_models/${configuration}/${database} -threads 16
    done
    rm -f models/${configuration}.model
    # rm -f models/score/${configuration}_*.json
    # rm -f models/score/residue/p2rank_${configuration}_*.json
done

cp new_models/alphafold_conservation_hmm/uniref50/runs/seed.46/FastRandomForest.model models/alphafold_conservation_hmm.model
cp new_models/alphafold/uniref50/runs/seed.49/FastRandomForest.model models/alphafold.model
cp new_models/conservation_hmm/uniref50/runs/seed.45/FastRandomForest.model models/conservation_hmm.model
cp new_models/default/uniref50/runs/seed.58/FastRandomForest.model models/default.model

cp config/default.groovy config/default_training.groovy
mkdir transformers
for configuration in alphafold_conservation_hmm alphafold conservation_hmm default
do
    ./prank eval-predict p2rank-datasets/holo4k.ds -c ${configuration}_training.groovy -conservation_dirs ../conservation/homs/holo4k/uniref50 -m ${configuration}.model -o transformers/${configuration} -threads 16 -train_score_transformers "(ProbabilityScoreTransformer, ZscoreTpTransformer)" -train_score_transformers_for_residues true -visualizations false &> transformers/${configuration}.log
    cp transformers/${configuration}/score/ProbabilityScoreTransformer.json models/score/${configuration}_ProbabilityScoreTransformer.json
    cp transformers/${configuration}/score/ZscoreTpTransformer.json models/score/${configuration}_ZscoreTpTransformer.json
    cp transformers/${configuration}/residue-score/ProbabilityScoreTransformer.json models/score/residue/${configuration}_ProbabilityScoreTransformer.json
    cp transformers/${configuration}/residue-score/ZscoreTpTransformer.json models/score/residue/${configuration}_ZscoreTpTransformer.json
done

sed -i 's/default_probatp.json/default_ProbabilityScoreTransformer.json/' config/default.groovy
sed -i 's/default_zscoretp.json/default_ZscoreTpTransformer.json/' config/default.groovy
sed -i 's/p2rank_default_proba.json/default_ProbabilityScoreTransformer.json/' config/default.groovy
sed -i 's/p2rank_default_zscore.json/default_ZscoreTpTransformer.json/' config/default.groovy

# Evaluate the new models
for configuration in alphafold_conservation_hmm alphafold conservation_hmm default
do
    for dataset in coach420 holo4k
    do
        ./prank eval-predict p2rank-datasets/${dataset}.ds -c ${configuration}.groovy -conservation_dirs ../conservation/homs/${dataset}/uniref50 -o new_models_evaluation/${configuration}/${dataset} -threads 16
    done
done
--------------------------------------------------------------------------------
/logo/logo-path.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/logo/logo-ion-path.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/logo/logo-dna.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/logo/logo-ligand-path.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------
/logo/logo.svg:
--------------------------------------------------------------------------------
(SVG markup omitted from this listing)
--------------------------------------------------------------------------------