├── .idea
├── .gitignore
├── vcs.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
└── PDBClean-0.0.2.iml
├── images
└── FlowChart.png
├── Notebooks
├── images
│ ├── JalviewTIMB_2.png
│ ├── TIMJalview0_2.png
│ └── TIM_PyMOL_CatalyticResidues.png
├── CheckProject_CheckCreateDelete.ipynb
├── Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb
├── Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb
└── Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb
├── src
├── __init__.py
├── listutils.py
├── pdbclean_io.py
├── pdbutils.py
├── cleanutils.py
├── alignmentutils.py
└── pdbcleanresiduestandardizationutils.py
├── LICENSE
├── CONTRIBUTING.md
├── scripts
├── PDBClean_ResidueStandardization_CIF.py
├── PDBClean_ChainStandardization_CIF.py
└── PDBClean_MolID_CIF.py
├── CODE_OF_CONDUCT.md
├── setup.py
├── environment_M1.yml
├── environment.yml
└── README.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/images/FlowChart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/images/FlowChart.png
--------------------------------------------------------------------------------
/Notebooks/images/JalviewTIMB_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/JalviewTIMB_2.png
--------------------------------------------------------------------------------
/Notebooks/images/TIMJalview0_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/TIMJalview0_2.png
--------------------------------------------------------------------------------
/Notebooks/images/TIM_PyMOL_CatalyticResidues.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/TIM_PyMOL_CatalyticResidues.png
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/PDBClean-0.0.2.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # PDBClean base dir
2 |
3 | import logging
4 | import glob
5 | import os
6 | import sys
7 |
8 |
9 | # list all the files included in PDBClean
10 | __all__ = [os.path.basename(f)[:-3] for f in glob.glob(os.path.dirname(__file__) + "/*.py") if not f.endswith('__init__.py')]
11 |
12 | # set up the logger
13 | logger = logging.getLogger(__name__)
14 | logger.setLevel(logging.INFO)
15 |
16 | sh = logging.StreamHandler(sys.stdout)
17 | formatter = logging.Formatter(fmt='%(asctime)s - %(message)s', datefmt="%H:%M:%S")
18 | sh.setFormatter(formatter)
19 |
20 | logger.addHandler(sh)
21 | logger.propagate = False
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Michael Levitt's Lab at Stanford University
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/listutils.py:
--------------------------------------------------------------------------------
1 | #
2 | import os
3 | #
def remove_file_defined_chain_from_list(list):
    """
    Removes specified chain IDs from list based on user's file input.

    The user is prompted for the name of a text file containing one chain ID
    per line; every ID found in that file is removed from the chain ID list.
    If the file does not exist, a message is printed and the list is returned
    unchanged.

    Parameters:
    -----------
    list : list
        Contains all the chain IDs from CIF(s)

    Returns:
    --------
    list : list
        Updated list without the chain IDs read from the file
    """
    remove_chid = []
    print(" Enter the file name containing the list of chain IDs you want removed from Standard Sequences.")
    user_input = input('File: ')
    if os.path.isfile(user_input):
        # Context manager guarantees the file handle is closed
        # (the original left it open).
        with open(user_input) as chain_file:
            for line in chain_file:
                remove_chid.append(line.strip())
    else:
        print("File does not exist.")
    return remove_chid_from_list(list, remove_chid)
32 |
def remove_user_defined_chain_from_list(list):
    """
    Removes the chain ID from list based off of user's input of chain ID

    The user is prompted repeatedly for chain IDs to remove; entering the
    sentinel DONE ends the prompt loop.

    Parameter:
    ----------
    list : list
        Contains all the chain IDs from CIF(s)

    Returns:
    --------
    list : list
        Updated list without the chain IDs from user's input
    """
    remove_chid = []
    print(" Enter chain IDs of the chains you want removed. When done, enter DONE.")
    while True:
        user_input = input('Chain ID: ')
        if user_input == "DONE":
            # Bug fix: the original appended the sentinel "DONE" to the
            # removal list, so a chain actually named DONE would have been
            # removed as a side effect.
            break
        remove_chid.append(user_input)
    return remove_chid_from_list(list, remove_chid)
57 |
def remove_chid_from_list(list, remove_list):
    """
    Remove each chain ID in `remove_list` from `list` (in place).

    Only the first occurrence of each ID is dropped; IDs not present in
    `list` are silently ignored.

    Parameters:
    -----------
    list : list
        Chain IDs to filter (mutated in place).
    remove_list : list
        Chain IDs to remove.

    Returns:
    --------
    list : list
        The same list object, without the removed chain IDs.
    """
    for chain_id in remove_list:
        try:
            list.remove(chain_id)
        except ValueError:
            # Requested ID not present - nothing to do.
            pass
    return list
78 |
def show_list(list):
    """
    Print every element of `list` on its own line.

    Parameters:
    -----------
    list : list
        Items to display.

    Returns:
    --------
    None
    """
    for item in list:
        print(item)
94 |
95 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute to PDBCleanV2
2 |
3 | Hello! Thank you for helping us improve our project!
4 |
5 | Please read and follow our guidelines to ensure a positive experience for contributors and maintainers.
6 |
7 | ## :page_with_curl: Code of Conduct
8 |
9 | Before you start, review our [Code of Conduct](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/CODE_OF_CONDUCT.md). Bad behavior will not be tolerated.
10 |
11 | ## :incoming_envelope: Opening an Issue
12 |
13 | Sign in to your GitHub account and create a **New Issue** in [GitHub issues](https://github.com/fatipardo/PDBClean-0.0.2/issues)
14 |
15 | Before you create a new entry, please check if the issue has already been reported in [GitHub](https://github.com/fatipardo/PDBClean-0.0.2/issues). And review our [documentation](https://github.com/fatipardo/PDBClean-0.0.2/tree/master/Notebooks).
16 |
17 | Make sure to select an appropriate label for the issue before you submit it (e.g. bug, enhancement, etc).
18 |
19 | ### :bug: Bug reports
20 |
21 | Please write a bug report that you would like to receive.
22 |
23 | - Include the operating system and processor on which the code is being run on.
24 | - Include the location of where the issue arose.
25 | - If you get an error message, include it in the issue.
26 | - Use [GitHub-flavored Markdown](https://help.github.com/en/github/writing-on-github/basic-writing-and-formatting-syntax). Especially put code blocks and console outputs in backticks (```). This improves readability.
27 |
28 | > **If an issue already exists.**
29 | Comment on the existing issue to add more information or leave a reaction on their issue. This helps us become aware of which issue is being faced more commonly and allows us to prioritize which issues to solve first.
30 |
31 | ### :art: Feature request
32 |
33 | - Make sure to select the enhancement label when submitting an issue.
34 | - Be precise about the proposed outcome of the feature and how it relates to existing features. Include implementation details if possible.
35 | - Do not open a duplicate feature request. Search for existing feature requests first. If you find your feature (or one very similar) previously requested, comment on that issue.
36 |
37 | ## :construction: Do you want to help fix an issue?
38 |
39 | - Comment "take" to the issue you want to fix and we will assign it to you.
40 | - When you submit a pull request, add the GitHub issue number in the title.
41 | - **Only submit a pull request to issues that have been assigned to you**
42 |
43 |
44 | ## :purple_heart: Credits
45 |
46 | Written by [@fatipardo](https://github.com/fatipardo) and [@gdkwxn](https://github.com/gdkwxn).
47 |
48 | Many of the ideas and prose for the statements in this document were based on or inspired by the [contributing page](https://github.com/jessesquires/.github/blob/main/CONTRIBUTING.md) written by [@jessesquires](https://github.com/jessesquires)
49 |
--------------------------------------------------------------------------------
/scripts/PDBClean_ResidueStandardization_CIF.py:
--------------------------------------------------------------------------------
1 | #!/Users/fatima/anaconda3/envs/PDBCleanV2/bin/python
2 | # coding: utf-8
3 |
4 | from __future__ import print_function
5 | from __future__ import division
6 | import sys, glob
7 | from PDBClean import pdbcleanresiduestandardizationutils as resstd
8 |
9 | ########################
10 | # READ INPUT ARGUMENTS #
11 | ########################
12 | n_arg = len(sys.argv)
13 | if(n_arg<3):
14 | print('Usage error: {0} '.format(sys.argv[0]))
15 | sys.exit()
16 | source_dir=sys.argv[1]
17 | target_dir=sys.argv[2]
18 |
19 |
20 | #############################################
21 | # READ PDB FILES AND DEFINE STRUCTURE LISTS #
22 | #############################################
23 | filelist=glob.glob(source_dir+'/*.cif')
24 | Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list = resstd.pdb_to_structurelists(filelist)
25 |
26 |
27 | ############################################
28 | # INTERACTIVE RESIDUE STANDARDIZATION MENU #
29 | ############################################
30 | input_menu = ""
31 | input_menu_check = ""
32 |
33 | while(input_menu != "QUIT"):
34 | print("PDBClean Residue Number Standardization Menu",
35 | " After checking all structures are loaded, select option 1 to proceed:",
36 | " 1) Proceed to multiple alignment menu",
37 | sep="\n")
38 | if(input_menu_check == "1"):
39 | print(" 2) View conversion template")
40 | print(" 3) Perform residue number standardization")
41 | print(" 4) Save conversion template")
42 | print(" OR Type QUIT to exit")
43 | input_menu = input('Option Number: ')
44 | if (input_menu == "1"):
45 | Structure_Sequences_Aligned, Structure_ConversionTemplate, chid_list, input_menu_check = resstd.perform_multiple_alignment(Structure_Sequences,
46 | ChID_ResiNum_Vector,
47 | structid_list,
48 | chid_list,
49 | input_menu_check)
50 | elif (input_menu == "2" and input_menu_check == "1"):
51 | resstd.show_conversiontemplate(Structure_ConversionTemplate)
52 | #elif (input_menu == "3" and input_menu_check == "1"):
53 | # resstd.conversiontemplate_to_pdb(filelist, Structure_ConversionTemplate, target_dir=target_dir)
54 | elif (input_menu == "4" and input_menu_check == "1"):
55 | resstd.write_and_show_conversiontemplate(Structure_ConversionTemplate,target_dir,True)
56 | elif (input_menu == "3" and input_menu_check == "1"):
57 | resstd.conversiontemplate_to_pdb_FAPA(filelist, Structure_ConversionTemplate, target_dir=target_dir)
58 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate. Enforcement of these policies may include warnings and/or banning.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 |
60 | ## Attribution
61 |
62 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
63 | version 2.0, available at
64 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
65 |
66 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
67 | enforcement ladder](https://github.com/mozilla/diversity).
68 |
69 | [homepage]: https://www.contributor-covenant.org
70 |
71 | For answers to common questions about this code of conduct, see the FAQ at
72 | https://www.contributor-covenant.org/faq. Translations are available at
73 | https://www.contributor-covenant.org/translations.
74 |
--------------------------------------------------------------------------------
/scripts/PDBClean_ChainStandardization_CIF.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8
#
# Interactive command-line menu that standardizes chain IDs across an
# ensemble of CIF structures: reads every *.cif in <source_dir> and writes
# the standardized files to <target_dir>.
from __future__ import print_function
from __future__ import division
import sys, glob
from PDBClean import pdbcleanchainstandardizationutils as chainstd

########################
# READ INPUT ARGUMENTS #
########################
n_arg = len(sys.argv)
if(n_arg<3):
    # NOTE(review): the usage message looks truncated - the expected
    # arguments (<source_dir> <target_dir>) appear to be missing; confirm.
    print('Usage error: {0} '.format(sys.argv[0]))
    sys.exit()
source_dir=sys.argv[1]
target_dir=sys.argv[2]


#############################################
# READ PDB FILES AND DEFINE STRUCTURE LISTS #
#############################################

filelist=glob.glob(source_dir+'/*.cif')
# Per-structure chain sequences plus the structure/chain IDs used to index them.
Structure_Sequences, structid_list, chid_list = chainstd.pdb_to_structurelists(filelist)
Standard_Sequences = {}


############################################
# INTERACTIVE ChainID STANDARDIZATION MENU #
############################################
input_menu = ""
# Set to "1" once Standard_Sequences has been populated (options 1/2);
# gates the inspect/standardize options (3/4) below.
input_menu_check_1 = ""

while(input_menu != "QUIT"):
    print("PDBClean ChainID Standardization Menu",
          " Select one of the following options to proceed:",
          " 1) Select Standard Sequences from a chosen input structure",
          " 2) Generate Standard Sequences based on all the input structures",
          sep="\n")
    if(input_menu_check_1 == "1"):
        print(" 3) Inspect/Edit Standard Sequences",
              " 4) Perform Standardization of Chain IDs",
              sep="\n")
    input_menu = input('Option Number: ')
    if (input_menu == "1"):
        # Use one user-chosen reference structure's chains as the standard.
        Standard_Sequences, input_menu_check_1 = chainstd.select_standard_seq_from_reference(Structure_Sequences,
                                                                                            Standard_Sequences,
                                                                                            structid_list,
                                                                                            input_menu_check_1)
    elif (input_menu == "2"):
        # Build a consensus standard from all input structures.
        Standard_Sequences, input_menu_check_1 = chainstd.create_standard_seq_from_consensus(Structure_Sequences,
                                                                                            Standard_Sequences,
                                                                                            chid_list,
                                                                                            input_menu_check_1)
        print("These are the standard sequences:")
        print(Standard_Sequences)
    elif (input_menu == "3" and input_menu_check_1 == "1"):
        chainstd.review_standard_seq(Structure_Sequences, Standard_Sequences)

    elif (input_menu == "4" and input_menu_check_1=="1"):
        # Align every structure to the standard, write results to disk,
        # then leave the menu.
        chainstd.align_to_std_seq_and_save_to_disk(Structure_Sequences,
                                                   Standard_Sequences,
                                                   structid_list,
                                                   filelist,
                                                   target_dir=target_dir)
        print("Done!")
        input_menu = "QUIT"
69 |
--------------------------------------------------------------------------------
/src/pdbclean_io.py:
--------------------------------------------------------------------------------
1 | import sys, os, shutil, datetime
2 | #
3 |
def check_project(projdir=None, level='top', action='create', verbose=True):
    """
    Manages the project directory by creating, cleaning, or deleting directories.

    Parameters:
    -----------
    projdir : str, optional
        The path to the project directory. If None, a message will display asking to provide the path.
    level : str, optional
        Specifies the directory level. Default is 'top', meaning the project directory itself.
        You can specify a subdirectory within the project directory.
    action : str, optional
        The action to perform on the directory. Options are:
        - 'create': Create the directory if it doesn't already exist.
        - 'clean': Remove all files in the directory, leaving it empty.
        - 'delete': Deletes the directory and everything within it.
        Any other value prints an error message and does nothing.
    verbose : bool, optional
        If True, prints informative messages about the actions being performed. Default is True.

    Returns:
    --------
    None
    """
    if projdir is None:
        print("Please provide a project directory path")
        return
    dirname = projdir
    if level != 'top':
        dirname = dirname + '/' + level
    if action == 'create':
        create_dir(dirname, verbose=verbose)
    elif action == 'clean':
        clean_dir(dirname, verbose=verbose)
    elif action == 'delete':
        delete_dir(dirname, verbose=verbose)
    else:
        # Surface typos instead of silently doing nothing (the original
        # ignored unknown actions without any feedback).
        print("Unknown action '{0}': expected 'create', 'clean' or 'delete'".format(action))
40 |
def create_dir(dirpath, verbose=True):
    """
    Creates a directory if it does not exist, and writes a creation timestamp in 'info.txt'.

    Parameters:
    -----------
    dirpath : str
        The path of the directory to create.
    verbose : bool, optional
        If True, prints informative messages about the action taken. Default is True.

    Returns:
    --------
    None
    """

    if not os.path.exists(dirpath):
        os.mkdir(dirpath)
        # Always stamp the new directory; the original only wrote info.txt
        # when verbose was True, although verbose is documented as a
        # printing flag only. The context manager closes the file handle
        # (the original left it open).
        now = datetime.datetime.now()
        with open(dirpath + '/info.txt', 'w') as f:
            f.write('directory created on {0}'.format(now))
    else:
        if verbose:
            print('{0} already exists, with content:'.format(dirpath))
            print(os.listdir(dirpath))
68 |
def clean_dir(dirpath, verbose=True):
    """
    Delete every regular file directly inside `dirpath`, leaving it empty.

    Subdirectories (and their contents) are left untouched. Does nothing
    when `dirpath` does not exist.

    Parameters:
    -----------
    dirpath : str
        The path of the directory to clean.
    verbose : bool, optional
        If True, a message is printed regarding the action taken. Default is True.

    Returns:
    --------
    None
    """

    if not os.path.exists(dirpath):
        return
    if verbose:
        print('Cleaning {0}...'.format(dirpath))
    for entry in os.listdir(dirpath):
        if os.path.isfile(os.path.join(dirpath, entry)):
            os.remove(dirpath + '/' + entry)
91 |
def delete_dir(dirpath, verbose=True):
    """
    Recursively delete `dirpath` and everything inside it.

    A no-op when `dirpath` does not exist.

    Parameters:
    -----------
    dirpath : str
        The path of the directory to delete.
    verbose : bool, optional
        If True, a message is printed regarding the action taken. Default is True.

    Returns:
    --------
    None
    """

    if not os.path.exists(dirpath):
        return
    shutil.rmtree(dirpath)
    if verbose:
        print('Deleting {0}...'.format(dirpath))
112 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | setup.py: Install PDBClean
3 | """
4 |
5 | import os
6 | import sys
7 | import re
8 | import subprocess
9 | from os.path import join as pjoin
10 | from glob import glob
11 |
12 | from distutils.extension import Extension
13 | from distutils.core import setup
14 |
15 | from Cython.Distutils import build_ext
16 | import numpy
17 |
18 | # ------------------------------------------------------------------------------
19 | # HEADER
20 | #
21 |
# Version and build-configuration flags.
VERSION = "0.0.2"
ISRELEASED = False      # flip to True when cutting a release
DISABLE_CUDA = True     # CUDA extensions are not built
__author__ = "Levitt Lab, Stanford"
__version__ = VERSION

# Keyword arguments ultimately passed to distutils' setup() at the bottom
# of this file; the INSTALL section below adds packages/scripts entries.
metadata = {
    'name': 'PDBClean',
    'version': VERSION,
    'author': __author__,
    'author_email': 'fpardo@stanford.edu',
    'license': 'MIT',
    'url': 'https://github.com/fatipardo/PDBClean-0.0.2',
    'download_url': 'https://github.com/fatipardo/PDBClean-0.0.2',
    'platforms': ['Linux', 'OSX'],
    'description': "PDB curation tools",
    'long_description': """PDBClean offers curation tools for structural ensemble deposited in the Protein Data Bank."""}
39 |
40 | # ------------------------------------------------------------------------------
41 | # HELPER FUNCTIONS -- path finding, git, python version, readthedocs
42 | #
43 |
class bcolors:
    """ANSI escape sequences for colored terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


def print_warning(string):
    """Print *string* to stdout highlighted in the warning color."""
    print(bcolors.WARNING + string + bcolors.ENDC)
55 |
56 |
def find_in_path(name, path):
    """Return the absolute path of *name* in the first matching directory of
    the search *path* (os.pathsep-separated), or None if not found."""
    # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
    for directory in path.split(os.pathsep):
        candidate = pjoin(directory, name)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None
65 |
66 |
def get_numpy_include():
    """
    Obtain the numpy include directory. This logic works across numpy versions.
    """
    # Modern numpy exposes get_include(); very old releases used
    # get_numpy_include() instead.
    getter = getattr(numpy, 'get_include', None)
    if getter is None:
        getter = numpy.get_numpy_include
    return getter()
76 |
77 |
def git_version():
    """
    Return the git revision as a string ("Unknown" when git is unavailable).
    Copied from numpy setup.py
    """

    def _run_git(cmd):
        # Minimal, locale-neutral environment so git output is stable;
        # LANGUAGE is used on win32.
        env = {k: os.environ[k] for k in ('SYSTEMROOT', 'PATH') if os.environ.get(k) is not None}
        env.update(LANGUAGE='C', LANG='C', LC_ALL='C')
        return subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0]

    try:
        revision = _run_git(['git', 'rev-parse', 'HEAD']).strip().decode('ascii')
    except OSError:
        revision = "Unknown"

    return revision
105 |
106 | # -----------------------------------------------------------------------------
107 | # INSTALL
108 |
# Package layout: the Python package 'PDBClean' lives in the src/ directory.
metadata['packages'] = ['PDBClean']
metadata['package_dir'] = {'PDBClean' : 'src'}
metadata['ext_modules'] = []
# Install every file under scripts/ except dunder files (e.g. __init__.py).
metadata['scripts'] = [s for s in glob('scripts/*') if not s.endswith('__.py')]
#metadata['data_files'] = [('reference', glob('./reference/*'))]
#metadata['cmdclass'] = {'build_ext': custom_build_ext}
115 |
116 | # ------------------------------------------------------------------------------
117 | #
118 | # Finally, print a warning at the *end* of the build if something fails
119 | #
120 |
def print_warnings():
    """Print warnings collected during the build (currently just a blank line)."""
    print("\n")

if __name__ == '__main__':
    setup(**metadata) # ** will unpack dictionary 'metadata' providing the values as arguments
    print_warnings()
127 |
--------------------------------------------------------------------------------
/environment_M1.yml:
--------------------------------------------------------------------------------
1 | name: PDBCleanV2
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | dependencies:
6 | - appnope=0.1.3
7 | - argon2-cffi=21.3.0
8 | - argon2-cffi-bindings=21.2.0
9 | - asttokens=2.0.5
10 | - attrs=21.4.0
11 | - backcall=0.2.0
12 | - backports=1.0
13 | - backports.functools_lru_cache=1.6.4
14 | - beautifulsoup4=4.11.1
15 | - biopython=1.83
16 | - bleach=5.0.1
17 | - brotli=1.0.9
18 | - brotli-bin=1.0.9
19 | - bzip2=1.0.8
20 | - ca-certificates=2023.5.7
21 | - certifi=2023.5.7
22 | - cffi=1.15.1
23 | - cvxopt=1.3.0
24 | - cycler=0.11.0
25 | - cython=0.29.30
26 | - debugpy=1.6.0
27 | - decorator=5.1.1
28 | - defusedxml=0.7.1
29 | - dsdp=5.8
30 | - entrypoints=0.4
31 | - executing=0.8.3
32 | - fftw=3.3.10
33 | - flit-core=3.7.1
34 | - fonttools=4.34.4
35 | - freetype=2.10.4
36 | - gettext=0.19.8.1
37 | - giflib=5.2.1
38 | - glib=2.72.1
39 | - glib-tools=2.72.1
40 | - glpk=4.65
41 | - gmp=6.2.1
42 | - gsl=2.7
43 | - gst-plugins-base=1.20.3
44 | - gstreamer=1.20.3
45 | - icu=70.1
46 | - importlib-metadata=4.11.4
47 | - importlib_resources=5.8.0
48 | - ipykernel=6.15.1
49 | - ipython=8.4.0
50 | - ipython_genutils=0.2.0
51 | - ipywidgets=7.7.1
52 | - jedi=0.18.1
53 | - jinja2=3.1.2
54 | - jpeg=9e
55 | - jsonschema=4.7.2
56 | - jupyter=1.0.0
57 | - jupyter_client=7.3.4
58 | - jupyter_console=6.4.4
59 | - jupyter_core=4.10.0
60 | - jupyterlab_pygments=0.2.2
61 | - jupyterlab_widgets=1.1.1
62 | - kiwisolver=1.4.3
63 | - krb5=1.19.3
64 | - lcms2=2.12
65 | - lerc=3.0
66 | - libblas=3.9.0
67 | - libbrotlicommon=1.0.9
68 | - libbrotlidec=1.0.9
69 | - libbrotlienc=1.0.9
70 | - libcblas=3.9.0
71 | - libclang=14.0.6
72 | - libclang13=14.0.6
73 | - libcxx=14.0.6
74 | - libdeflate=1.12
75 | - libedit=3.1.20191231
76 | - libffi=3.4.2
77 | - libgfortran=5.0.0
78 | - libgfortran5
79 | - libglib=2.72.1
80 | - libiconv=1.16
81 | - liblapack=3.9.0
82 | - libllvm14=14.0.6
83 | - libogg=1.3.4
84 | - libopenblas=0.3.20
85 | - libopus=1.3.1
86 | - libpng=1.6.37
87 | - libpq=14.4
88 | - libsodium=1.0.18
89 | - libtiff=4.4.0
90 | - libvorbis=1.3.7
91 | - libwebp=1.2.2
92 | - libwebp-base=1.2.2
93 | - libxcb=1.13
94 | - libzlib=1.2.12
95 | - llvm-openmp=14.0.4
96 | - lz4-c=1.9.3
97 | - markupsafe=2.1.1
98 | - matplotlib=3.5.2
99 | - matplotlib-base=3.5.2
100 | - matplotlib-inline=0.1.3
101 | - metis=5.1.0
102 | - mistune=0.8.4
103 | - mmseqs2=14.7e284
104 | - mpfr=4.1.0
105 | - munkres=1.1.4
106 | - mysql-common=8.0.29
107 | - mysql-libs=8.0.29
108 | - nbclient=0.6.6
109 | - nbconvert=6.5.0
110 | - nbconvert-core=6.5.0
111 | - nbconvert-pandoc=6.5.0
112 | - nbformat=5.4.0
113 | - ncurses=6.3
114 | - nest-asyncio=1.5.5
115 | - notebook=6.4.12
116 | - nspr=4.32
117 | - nss=3.78
118 | - numpy=1.23.1
119 | - openjpeg=2.4.0
120 | - openssl=1.1.1u
121 | - packaging=21.3
122 | - pandas=2.0.0
123 | - pandoc
124 | - pandocfilters=1.5.0
125 | - parso=0.8.3
126 | - patsy=0.5.3
127 | - pcre=8.45
128 | - pexpect=4.8.0
129 | - pickleshare=0.7.5
130 | - pillow=9.2.0
131 | - ply=3.11
132 | - prometheus_client=0.14.1
133 | - prompt-toolkit=3.0.30
134 | - prompt_toolkit=3.0.30
135 | - psutil=5.9.1
136 | - pthread-stubs=0.4
137 | - ptyprocess=0.7.0
138 | - pure_eval=0.2.2
139 | - pycparser=2.21
140 | - pygments=2.12.0
141 | - pyparsing=3.0.9
142 | - pyqt=5.15.7
143 | - pyrsistent=0.18.1
144 | - python=3.10.5
145 | - python-dateutil=2.8.2
146 | - python-fastjsonschema=2.15.3
147 | - python-tzdata=2023.3
148 | - python_abi=3.10
149 | - pytz=2023.3
150 | - pyzmq=23.2.0
151 | - qt-main=5.15.4
152 | - qtconsole=5.3.1
153 | - qtconsole-base=5.3.1
154 | - qtpy=2.1.0
155 | - readline=8.1.2
156 | - scipy=1.8.1
157 | - seaborn=0.12.2
158 | - seaborn-base=0.12.2
159 | - send2trash=1.8.0
160 | - setuptools=63.1.0
161 | - sip=6.6.2
162 | - six=1.16.0
163 | - soupsieve=2.3.1
164 | - sqlite=3.39.0
165 | - stack_data=0.3.0
166 | - statsmodels=0.14.0
167 | - suitesparse=5.10.1
168 | - tbb=2021.5.0
169 | - terminado=0.15.0
170 | - tinycss2=1.1.1
171 | - tk=8.6.12
172 | - toml=0.10.2
173 | - tornado=6.2
174 | - traitlets=5.3.0
175 | - typing_extensions=4.6.3
176 | - tzdata=2022a
177 | - unicodedata2=14.0.0
178 | - wcwidth=0.2.5
179 | - webencodings=0.5.1
180 | - wheel=0.37.1
181 | - widgetsnbextension=3.6.1
182 | - xorg-libxau=1.0.9
183 | - xorg-libxdmcp=1.1.3
184 | - xz=5.2.5
185 | - zeromq=4.3.4
186 | - zipp=3.8.0
187 | - zlib=1.2.12
188 | - zstd=1.5.2
189 | - pip:
190 | - matching==1.4
191 | - pip==22.1.2
192 | - pyqt5-sip==12.11.0
193 | prefix: ~/opt/anaconda3/envs/PDBCleanV2
194 |
--------------------------------------------------------------------------------
/src/pdbutils.py:
--------------------------------------------------------------------------------
1 | #
2 | import os
3 | import shutil
4 | import re
5 | import numpy as np
6 | from urllib.request import urlopen
7 | from contextlib import closing, suppress
8 | #
def download_pdb_from_metadata(metadata, projdir=None):
    """
    Download the PDB files referenced by a list of metadata lines (fasta
    description lines) and save them under <projdir>/raw_bank.

    Parameters:
    -----------
    metadata : list of str
        A list of metadata strings, from which PDB IDs will be extracted
        (characters 1-4 of each entry; index 0 is the fasta '>' marker).
    projdir : str, optional
        The path to the project directory where the PDB files will be saved.
        If None, a message is printed and nothing is downloaded.

    Returns:
    --------
    None
    """
    if projdir is None:
        print("Please provide a project directory ...")
    else:
        download_dir = os.path.join(projdir, 'raw_bank')
        # makedirs(..., exist_ok=True) tolerates a pre-existing directory and
        # creates missing parents, unlike the exists()+mkdir() pattern which
        # is race-prone and fails when projdir itself does not exist yet.
        os.makedirs(download_dir, exist_ok=True)
        for pdbid in get_idset_from_metadata(metadata):
            download_pdb_from_id(pdbid, download_dir=download_dir)
34 |
def download_pdb_from_id(pdbid, pdbformat='.cif', download_dir=None):
    """
    Download a single PDB entry from the RCSB and save it in the given
    directory.

    Parameters:
    -----------
    pdbid : str
        The PDB ID of the file that will be downloaded.
    pdbformat : str, optional
        The format (file extension) of the file to download. Default '.cif'.
    download_dir : str, optional
        The directory where the downloaded file will be saved. If None, a
        message is printed and nothing is downloaded.

    Returns:
    --------
    None
    """
    download_url = 'https://files.rcsb.org/download/'
    if download_dir is None:
        print("Please provide a directory where to store downloaded files...")
    else:
        # os.path.join is robust to a trailing slash in download_dir.
        # RCSB download URLs use upper-case IDs; the local file name keeps
        # the ID exactly as the caller supplied it.
        target = os.path.join(download_dir, pdbid + pdbformat)
        source = download_url + pdbid.upper() + pdbformat
        download_from_url(source, target)
59 |
def get_idset_from_metadata(metadata):
    """
    Extract the unique PDB IDs found in a list of metadata strings.

    Parameters:
    -----------
    metadata : list of str
        Metadata strings (fasta description lines), each beginning with a
        marker character followed by a 4-character PDB ID.

    Returns:
    --------
    list of str
        Sorted list of the unique PDB IDs extracted from the metadata.
    """
    # Characters 1-4 of each entry hold the PDB ID (index 0 is the '>').
    return sorted({entry[1:5] for entry in metadata})
78 |
79 | #
def retrieve_sequence_from_PDB(keyword, mode='sequence', update=True, seqfile=None):
    """
    Retrieve sequences or metadata from a PDB sequence file based on a
    keyword match.

    Parameters:
    ---------
    keyword : str
        The keyword to search for (case-insensitive regular expression).
    mode : str, optional
        Whether to match the keyword against the 'sequence' lines or the
        'metadata' (description) lines. Default is 'sequence'.
    update : bool, optional
        If True, the sequence file is (re)downloaded before searching.
        Default is True.
    seqfile : str, optional
        Path to the sequence file. If None and update is True, the file is
        downloaded to the default location chosen by retrieve_seqfile.

    Returns:
    --------
    sequence : numpy.ndarray
        Sequences that match the keyword.
    metadata : numpy.ndarray
        Metadata (fasta description lines) associated with the matches.
    """
    if update:
        # Bug fix: os.remove(None) raises TypeError, which the
        # suppress(FileNotFoundError) guard does not catch; only try to
        # remove a stale file when an explicit path was given.
        if seqfile is not None:
            with suppress(FileNotFoundError):
                os.remove(seqfile)  # remove existing seqfile if any
        seqfile = retrieve_seqfile(seqfile=seqfile)
    metadata = []
    sequence = []
    # The file alternates description lines ('>...') and sequence lines.
    with open(seqfile) as f:
        nextline = False   # set when the NEXT line should be captured as a sequence
        prevline = '#'     # the line seen on the previous iteration
        for line in f:
            if nextline:
                sequence.append(line)
                nextline = False
            else:
                hit = re.findall(keyword, line, flags=re.I)
                if hit:
                    if(mode == 'sequence'):
                        # Keyword matched a sequence line: keep it together
                        # with the description line that preceded it.
                        metadata.append(prevline)
                        sequence.append(line)
                    elif(mode == 'metadata'):
                        # Keyword matched a description line: keep it and
                        # flag the following line as its sequence.
                        metadata.append(line)
                        nextline = True
            prevline = line
    return np.atleast_1d(sequence), np.atleast_1d(metadata)
126 | #
def retrieve_seqfile(seqfile=None):
    """
    Download the PDB sequence file from the official RCSB site.

    Parameters:
    -----------
    seqfile : str, optional
        Path where the sequence file will be saved. If None, the file is
        saved as 'seqfile.txt'.

    Returns:
    --------
    seqfile : str
        Path of the downloaded sequence file.
    """
    # The old ftp://ftp.wwpdb.org address was retired (FAPA changed the
    # address to HTTPS, Feb 2025).
    sequrl = 'https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt'
    target = 'seqfile.txt' if seqfile is None else seqfile
    download_from_url(sequrl, target)
    return target
147 | #
def download_from_url(source, target):
    """
    Download a file from a URL and save it to a target location.

    Parameters:
    -----------
    source : str
        The URL of the file to be downloaded.
    target : str
        The path where the downloaded file will be saved.

    Returns:
    --------
    None
    """
    # Stream the response straight to disk; both handles are closed on exit.
    with closing(urlopen(source)) as response, open(target, 'wb') as out:
        shutil.copyfileobj(response, out)
    print('wrote {0} from {1}'.format(target, source))
167 |
168 |
--------------------------------------------------------------------------------
/Notebooks/CheckProject_CheckCreateDelete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "0667c937",
6 | "metadata": {},
7 | "source": [
8 | "# How to use 'Check Project'\n",
9 | "\n",
10 | "In this Notebook we will demonstrate how to use 'pdbclean_io.check_project'.\n",
11 | "This function helps you check if a directory exists, create it, or delete it. \n",
12 | "\n",
13 | "Note: We are running this tutorial after finishing steps 1 and 2."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "id": "cbe7e596",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from PDBClean import pdbclean_io"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "id": "82634f8e",
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "# Path to project directory\n",
34 | "PROJDIR=\"./TIM\""
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "id": "c47d581a",
41 | "metadata": {},
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "ls: ./TIM: No such file or directory\r\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "# Let's check first what the project directory contains\n",
53 | "!ls $PROJDIR"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 4,
59 | "id": "8c46605c",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Action 'create' will create a directory\n",
64 | "# With Option 'level' you can name the new directory\n",
65 | "# The new directory contains text file 'info.txt' with the date when the directory was created\n",
66 | "\n",
67 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 5,
73 | "id": "6af82697",
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "\u001b[34mclean_bank\u001b[m\u001b[m \u001b[34mraw_bank\u001b[m\u001b[m \u001b[34msimple_bank\u001b[m\u001b[m\r\n",
81 | "info.txt seqres.txt \u001b[34mstandard_MolID_bank\u001b[m\u001b[m\r\n"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "# Run the next 2 cells to verify that the directory was created, and that it contains the info.txt file.\n",
87 | "!ls $PROJDIR"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 6,
93 | "id": "4504b8bd",
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "directory created on 2022-08-23 23:41:44.214555"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "!cat $PROJDIR/standard_MolID_bank/info.txt"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "id": "ee1cd175",
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Cleaning ./TIM/standard_MolID_bank...\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "# action 'clean' will remove all files inside the directory 'level'\n",
124 | "pdbclean_io.check_project(projdir=PROJDIR, action='clean', level='standard_MolID_bank')"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 8,
130 | "id": "e275c246",
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "# Run this cell to verify that the directory we just created is now empty\n",
135 | "!ls $PROJDIR/standard_MolID_bank/"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 9,
141 | "id": "4015d784",
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Deleting ./TIM/standard_MolID_bank...\n"
149 | ]
150 | }
151 | ],
152 | "source": [
153 | "# action 'delete' will delete the 'level' directory\n",
154 | "pdbclean_io.check_project(projdir=PROJDIR, action='delete', level='standard_MolID_bank')"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 10,
160 | "id": "9964d932",
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "\u001b[34mclean_bank\u001b[m\u001b[m info.txt \u001b[34mraw_bank\u001b[m\u001b[m seqres.txt \u001b[34msimple_bank\u001b[m\u001b[m\r\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "# Run this cell to verify that the directory has been removed\n",
173 | "!ls $PROJDIR/"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 11,
179 | "id": "18f26559",
180 | "metadata": {},
181 | "outputs": [],
182 | "source": [
183 | "# Now, let's create the directory again\n",
184 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 12,
190 | "id": "82722c14",
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "./TIM/standard_MolID_bank already exists, with content:\n",
198 | "['info.txt']\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "# Notice that if we run the same command twice, or if the directory already exists,\n",
204 | "# the contents of the directory will be printed to screen. \n",
205 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "c6305b7b",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": []
215 | }
216 | ],
217 | "metadata": {
218 | "kernelspec": {
219 | "display_name": "PDBCleanV2",
220 | "language": "python",
221 | "name": "PDBCleanV2"
222 | },
223 | "language_info": {
224 | "codemirror_mode": {
225 | "name": "ipython",
226 | "version": 3
227 | },
228 | "file_extension": ".py",
229 | "mimetype": "text/x-python",
230 | "name": "python",
231 | "nbconvert_exporter": "python",
232 | "pygments_lexer": "ipython3",
233 | "version": "3.10.4"
234 | }
235 | },
236 | "nbformat": 4,
237 | "nbformat_minor": 5
238 | }
239 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: PDBCleanV2
2 | channels:
3 | - bioconda
4 | - conda-forge
5 | dependencies:
6 | - appnope=0.1.3=pyhd8ed1ab_0
7 | - argon2-cffi=21.3.0=pyhd8ed1ab_0
8 | - argon2-cffi-bindings=21.2.0=py310h1961e1f_2
9 | - asttokens=2.0.5=pyhd8ed1ab_0
10 | - attrs=21.4.0=pyhd8ed1ab_0
11 | - backcall=0.2.0=pyh9f0ad1d_0
12 | - backports=1.0=py_2
13 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
14 | - beautifulsoup4=4.11.1=pyha770c72_0
15 | - biopython=1.83
16 | - bleach=5.0.1=pyhd8ed1ab_0
17 | - brotli=1.0.9=h5eb16cf_7
18 | - brotli-bin=1.0.9=h5eb16cf_7
19 | - bzip2=1.0.8=h0d85af4_4
20 | - ca-certificates=2023.5.7=h8857fd0_0
21 | - certifi=2023.5.7=pyhd8ed1ab_0
22 | - cffi=1.15.1=py310h96bbf6e_0
23 | - cvxopt=1.3.0=py310hed5b562_1
24 | - cycler=0.11.0=pyhd8ed1ab_0
25 | - cython=0.29.30=py310hd4537e4_0
26 | - debugpy=1.6.0=py310h9d931ec_0
27 | - decorator=5.1.1=pyhd8ed1ab_0
28 | - defusedxml=0.7.1=pyhd8ed1ab_0
29 | - dsdp=5.8=h6e329d1_1203
30 | - entrypoints=0.4=pyhd8ed1ab_0
31 | - executing=0.8.3=pyhd8ed1ab_0
32 | - fftw=3.3.10=nompi_hf082fe4_102
33 | - flit-core=3.7.1=pyhd8ed1ab_0
34 | - fonttools=4.34.4=py310h6c45266_0
35 | - freetype=2.10.4=h4cff582_1
36 | - gettext=0.19.8.1=hd1a6beb_1008
37 | - giflib=5.2.1=hbcb3906_2
38 | - glib=2.72.1=h2292cb8_0
39 | - glib-tools=2.72.1=h2292cb8_0
40 | - glpk=4.65=h0f52abe_1004
41 | - gmp=6.2.1=h2e338ed_0
42 | - gsl=2.7=h93259b0_0
43 | - gst-plugins-base=1.20.3=hda0ba4b_0
44 | - gstreamer=1.20.3=hdc08c3f_0
45 | - icu=70.1=h96cf925_0
46 | - importlib-metadata=4.11.4=py310h2ec42d9_0
47 | - importlib_resources=5.8.0=pyhd8ed1ab_0
48 | - ipykernel=6.15.1=pyh736e0ef_0
49 | - ipython=8.4.0=py310h2ec42d9_0
50 | - ipython_genutils=0.2.0=py_1
51 | - ipywidgets=7.7.1=pyhd8ed1ab_0
52 | - jedi=0.18.1=py310h2ec42d9_1
53 | - jinja2=3.1.2=pyhd8ed1ab_1
54 | - jpeg=9e=hac89ed1_2
55 | - jsonschema=4.7.2=pyhd8ed1ab_0
56 | - jupyter=1.0.0=py310h2ec42d9_7
57 | - jupyter_client=7.3.4=pyhd8ed1ab_0
58 | - jupyter_console=6.4.4=pyhd8ed1ab_0
59 | - jupyter_core=4.10.0=py310h2ec42d9_0
60 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0
61 | - jupyterlab_widgets=1.1.1=pyhd8ed1ab_0
62 | - kiwisolver=1.4.3=py310habb735a_0
63 | - krb5=1.19.3=hb49756b_0
64 | - lcms2=2.12=h577c468_0
65 | - lerc=3.0=he49afe7_0
66 | - libblas=3.9.0=15_osx64_openblas
67 | - libbrotlicommon=1.0.9=h5eb16cf_7
68 | - libbrotlidec=1.0.9=h5eb16cf_7
69 | - libbrotlienc=1.0.9=h5eb16cf_7
70 | - libcblas=3.9.0=15_osx64_openblas
71 | - libclang=14.0.6=default_h55ffa42_0
72 | - libclang13=14.0.6=default_hb5731bd_0
73 | - libcxx=14.0.6=hce7ea42_0
74 | - libdeflate=1.12=hac89ed1_0
75 | - libedit=3.1.20191231=h0678c8f_2
76 | - libffi=3.4.2=h0d85af4_5
77 | - libgfortran=5.0.0=9_3_0_h6c81a4c_23
78 | - libgfortran5=9.3.0=h6c81a4c_23
79 | - libglib=2.72.1=hfbcb929_0
80 | - libiconv=1.16=haf1e3a3_0
81 | - liblapack=3.9.0=15_osx64_openblas
82 | - libllvm14=14.0.6=h41df66c_0
83 | - libogg=1.3.4=h35c211d_1
84 | - libopenblas=0.3.20=openmp_hb3cd9ec_0
85 | - libopus=1.3.1=hc929b4f_1
86 | - libpng=1.6.37=h5a3d3bf_3
87 | - libpq=14.4=hf6bb32a_0
88 | - libsodium=1.0.18=hbcb3906_1
89 | - libtiff=4.4.0=h9847915_1
90 | - libvorbis=1.3.7=h046ec9c_0
91 | - libwebp=1.2.2=h28dabe5_0
92 | - libwebp-base=1.2.2=h0d85af4_1
93 | - libxcb=1.13=h0d85af4_1004
94 | - libzlib=1.2.12=hfe4f2af_2
95 | - llvm-openmp=14.0.4=ha654fa7_0
96 | - lz4-c=1.9.3=he49afe7_1
97 | - markupsafe=2.1.1=py310h1961e1f_1
98 | - matplotlib=3.5.2=py310h2ec42d9_0
99 | - matplotlib-base=3.5.2=py310h4510841_0
100 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0
101 | - metis=5.1.0=h2e338ed_1006
102 | - mistune=0.8.4=py310he24745e_1005
103 | - mpfr=4.1.0=h0f52abe_1
104 | - munkres=1.1.4=pyh9f0ad1d_0
105 | - muscle=5.1=hb339e23_1
106 | - mysql-common=8.0.29=h924029e_1
107 | - mysql-libs=8.0.29=h3cab752_1
108 | - nbclient=0.6.6=pyhd8ed1ab_0
109 | - nbconvert=6.5.0=pyhd8ed1ab_0
110 | - nbconvert-core=6.5.0=pyhd8ed1ab_0
111 | - nbconvert-pandoc=6.5.0=pyhd8ed1ab_0
112 | - nbformat=5.4.0=pyhd8ed1ab_0
113 | - ncurses=6.3=h96cf925_1
114 | - nest-asyncio=1.5.5=pyhd8ed1ab_0
115 | - notebook=6.4.12=pyha770c72_0
116 | - nspr=4.32=hcd9eead_1
117 | - nss=3.78=ha8197d3_0
118 | - numpy=1.23.1=py310ha3f357c_0
119 | - openjpeg=2.4.0=h6e7aa92_1
120 | - openssl=1.1.1u=h8a1eda9_0
121 | - packaging=21.3=pyhd8ed1ab_0
122 | - pandas=2.0.0=py310hecf8f37_0
123 | - pandoc=2.18=h694c41f_0
124 | - pandocfilters=1.5.0=pyhd8ed1ab_0
125 | - parso=0.8.3=pyhd8ed1ab_0
126 | - patsy=0.5.3=pyhd8ed1ab_0
127 | - pcre=8.45=he49afe7_0
128 | - pexpect=4.8.0=pyh9f0ad1d_2
129 | - pickleshare=0.7.5=py_1003
130 | - pillow=9.2.0=py310hb3240ae_0
131 | - ply=3.11=py_1
132 | - prometheus_client=0.14.1=pyhd8ed1ab_0
133 | - prompt-toolkit=3.0.30=pyha770c72_0
134 | - prompt_toolkit=3.0.30=hd8ed1ab_0
135 | - psutil=5.9.1=py310h6c45266_0
136 | - pthread-stubs=0.4=hc929b4f_1001
137 | - ptyprocess=0.7.0=pyhd3deb0d_0
138 | - pure_eval=0.2.2=pyhd8ed1ab_0
139 | - pycparser=2.21=pyhd8ed1ab_0
140 | - pygments=2.12.0=pyhd8ed1ab_0
141 | - pyparsing=3.0.9=pyhd8ed1ab_0
142 | - pyqt=5.15.7=py310h57cebac_0
143 | - pyrsistent=0.18.1=py310h1961e1f_1
144 | - python=3.10.5=hdaaf3db_0_cpython
145 | - python-dateutil=2.8.2=pyhd8ed1ab_0
146 | - python-fastjsonschema=2.15.3=pyhd8ed1ab_0
147 | - python-tzdata=2023.3=pyhd8ed1ab_0
148 | - python_abi=3.10=2_cp310
149 | - pytz=2023.3=pyhd8ed1ab_0
150 | - pyzmq=23.2.0=py310h85fb675_0
151 | - qt-main=5.15.4=h938c29d_2
152 | - qtconsole=5.3.1=pyhd8ed1ab_0
153 | - qtconsole-base=5.3.1=pyha770c72_0
154 | - qtpy=2.1.0=pyhd8ed1ab_0
155 | - readline=8.1.2=h3899abd_0
156 | - scipy=1.8.1=py310h1f9c157_0
157 | - seaborn=0.12.2=hd8ed1ab_0
158 | - seaborn-base=0.12.2=pyhd8ed1ab_0
159 | - send2trash=1.8.0=pyhd8ed1ab_0
160 | - setuptools=63.1.0=py310h2ec42d9_0
161 | - sip=6.6.2=py310hd4537e4_0
162 | - six=1.16.0=pyh6c4a22f_0
163 | - soupsieve=2.3.1=pyhd8ed1ab_0
164 | - sqlite=3.39.0=hd9f0692_0
165 | - stack_data=0.3.0=pyhd8ed1ab_0
166 | - statsmodels=0.14.0=py310hc1335a1_1
167 | - suitesparse=5.10.1=h7aff33d_1
168 | - tbb=2021.5.0=hbb4e6a2_1
169 | - terminado=0.15.0=py310h2ec42d9_0
170 | - tinycss2=1.1.1=pyhd8ed1ab_0
171 | - tk=8.6.12=h5dbffcc_0
172 | - toml=0.10.2=pyhd8ed1ab_0
173 | - tornado=6.2=py310h6c45266_0
174 | - traitlets=5.3.0=pyhd8ed1ab_0
175 | - typing_extensions=4.6.3=pyha770c72_0
176 | - tzdata=2022a=h191b570_0
177 | - unicodedata2=14.0.0=py310h1961e1f_1
178 | - wcwidth=0.2.5=pyh9f0ad1d_2
179 | - webencodings=0.5.1=py_1
180 | - wheel=0.37.1=pyhd8ed1ab_0
181 | - widgetsnbextension=3.6.1=pyha770c72_0
182 | - xorg-libxau=1.0.9=h35c211d_0
183 | - xorg-libxdmcp=1.1.3=h35c211d_0
184 | - xz=5.2.5=haf1e3a3_1
185 | - zeromq=4.3.4=he49afe7_1
186 | - zipp=3.8.0=pyhd8ed1ab_0
187 | - zlib=1.2.12=hfe4f2af_2
188 | - zstd=1.5.2=ha9df2e0_2
189 | - pip:
190 | - matching==1.4
191 | - pip==22.1.2
192 | - pyqt5-sip==12.11.0
193 | prefix: ~/opt/anaconda3/envs/PDBCleanV2
194 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PDBCleanV2
2 |
3 | With PDBCleanV2, users can create their own self-consistent structure dataset, enabling more straightforward comparison among structures. The library creates separate files for each biological assembly present in a structure file and standardizes chain names and numbering. Our goal is to provide researchers with a consistent dataset that facilitates their analysis.
4 |
5 | ## Table of contents
6 |
7 | * [PDBCleanV2 workflow and tutorial](#pdbcleanv2-workflow)
8 | * [Other tools](#other-tools)
9 | * [Installation](#installation)
10 | * [PDBClean team](#pdbclean-team)
11 |
12 | ## PDBCleanV2 Workflow
13 |
14 | We have created Jupyter Notebooks that provide a step-by-step guide for creating a curated ensemble of structures using PDBCleanV2.
15 |
16 | 
17 |
### [Step 1. Download structural ensemble from RCSB PDB.](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step1.DownloadStructuralEnsembleFromRCSBPDB.ipynb)
19 |
20 | Download all structures that match the name and sequence of your molecule of interest.
21 |
22 | > **Note:** This notebook sometimes does not display on the Github website, download and open in your browser.
23 |
24 | ### [Step 2. Clean Structures and Create one CIF file per biological assembly.](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb)
25 |
26 | A CIF file may contain multiple biological assemblies within one asymmetric unit. In this step we separate these biological assemblies, and create one CIF file for each one. We also reduce the number of data blocks included in the CIF file.
27 |
28 | ### [Step 3.1. Assign MOLID to the entities found in the CIF files, version 1](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb)
29 |
30 | The script goes over all the CIF files and collects all entities. The user can decide what Mol ID to assign them. In this example, we show the case in which we give a different ID to each entity found.
31 | This step is also important because it lists all the entities that were found in your ensemble, so it allows you to identify if there is a structure that doesn't belong. We show an example of this in this notebook.
32 |
33 | ### [Step 3.2. Assign MOLID to the entities found in the CIF files, version 2](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb)
34 |
35 | Same as Step 3.1, but in our example, we give the same MOL ID to different entities. You may want to do this for example, if you want to give the same MOL ID to all ligands, or water molecules. Doing this will trigger a concatenation menu, which we show how to use.
36 |
37 | ### [Step 3.3. Assign MOLID to the entities found in the CIF files, version 3](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.3AssignMolIDToEntitiesFoundInCIFfiles3.ipynb)
38 |
39 | In this notebook we show how to perform concatenations and conversions by using a conversion file (useful when a structure contains many entities).
40 | We also show an option that allows users to keep a record of the changes introduced in this step (track old chain names, new chain names, entity names and file names).
41 |
42 | ### [Step 4. Chain ID standardization](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step4.ChainIDStandardization.ipynb)
43 |
44 | Step 4 allows us to name each entity with whatever name we want. Step 4 makes sure that the chains that are the same (we do sequence alignment to determine similarity) in different CIF files, have a consistent name. Sometimes entities/chains are mislabeled in deposited structures, this step is recommended to identify any such cases. This step can also be used to identify any possible outliers, by seeing how all chains score compared to our reference.
45 |
46 | We divide the tutorial for this step in two parts. The second part shows how to generate the reference sequences, as well as showing how to load them when running the script. Doing this could also help speed up this step, as it allows to run the script in parallel in batches. This is particularly important when working with large datasets, or with molecules with many chains.
47 |
48 | ### [Step 4.2 Chain ID standardization: generate reference sequences and how to load them](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step4.2.ChainIDStandardization.ipynb)
49 |
50 | In this tutorial, we show how the reference sequence is selected by our script, and show how the user can modify it. It also shows how to load the reference sequences, creating the opportunity for running this step in parallel, in batches, speeding up the whole process.
51 |
52 | ### [Step 5. Residue ID Standardization](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step5.ResidueIDStandardization.ipynb)
53 |
54 | Following step 4, now that we have consistent chain (entity) naming among all structures in the ensemble, we want to make sure that the numbering is also consistent (that the same residue position has the same number in all structures).
55 |
56 | This is also the last step! You have a curated dataset!
57 |
58 |
59 | > **Note:** There are more advanced curation steps and analysis that we will cover in future releases.
60 |
61 | ## Other tools
62 |
63 | [Check project mini tutorial](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/CheckProject_CheckCreateDelete.ipynb). This mini tutorial can be run after doing step 2. `Check_project` checks if a directory has been created, if not it creates the directory and an info.txt file with the creation date.
64 |
65 | [Dataset Summary](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Analysis.SummaryPDBDataset.ipynb).
66 | This notebook can be run after doing step 0. It creates plots that summarize important information from your dataset such as organism of origin, resolution, year, and method used to solve the structure. The notebook also creates a pandas dataframe so users can create their own personalized plots.
67 |
68 | ## Installation
69 |
70 | We recommend installing PDBClean inside a virtual environment. We provide an `environment.yml` with the libraries you will need.
71 | We have tested the installation on MacOS with intel processors.
72 | Ensuring you have the pre-requisites will facilitate the installation process!
73 |
74 | ### Pre-requisites
75 |
76 | - Xcode including Command Line tools
77 | - Anaconda
78 | - Create SSH Key and link it to your GitHub account
79 | - Jupyter notebook
80 | - If running in a Mac with M1/M2/M3 chips, install Muscle v.5.1.0 with homebrew, using the following command:
81 | > `brew install brewsci/bio/muscle`
82 |
83 | ### Installation steps
84 |
85 | 1. Download PDBClean from GitHub and install environment from YML file
86 |
87 | >git clone git@github.com:fatipardo/PDBCleanV2
88 |
89 | >cd PDBCleanV2
90 |
91 | >conda config --remove channels defaults
92 |
93 | If you are installing in a computer with Intel chip, use the following command:
94 |
95 | >conda env create -f environment.yml
96 |
97 | If you are installing in a computer with M1/M2/M3 chip, use the following command instead:
98 |
99 | >conda env create -f environment_M1.yml
100 |
101 | 2. Activate environment and install PDBClean
102 |
103 | >conda activate PDBCleanV2
104 |
105 | >python setup.py install
106 |
107 | 3. Install Jupyter Notebook kernel
108 |
109 | > python -m ipykernel install --user --name PDBCleanV2 --display-name PDBCleanV2
110 |
111 |
112 | 4. Running notebook:
113 |
114 | > cd Notebooks
115 |
116 | > jupyter notebook
117 |
118 | - Open any notebook you would like to run.
119 | - If Jupyter does not recognize the kernel, select ‘PDBCleanV2’ from the drop down menu.
120 |
121 |
122 | ## PDBClean team
123 |
124 | The code in this repository is based on the code found [here](https://test.pypi.org/project/PDBClean/#files).
125 | The code was originally written by Frédéric Poitevin and Nicholas Corsepius.
126 | Fátima Pardo Avila and Liv Weiner created this repository. Paulina Cabral contributed to the code and documentation.
127 | We all worked on this project while being part of the Levitt Lab at Stanford University.
128 |
--------------------------------------------------------------------------------
/scripts/PDBClean_MolID_CIF.py:
--------------------------------------------------------------------------------
1 | #!/Users/fatima/anaconda3/envs/PDBCleanV2/bin/python
2 | # coding: utf-8
3 | #
4 | # ! ! ! master_molID_class_list is very important
5 | # This is the list that contains every file's MolID class
6 | # !! molIDConversion_list is important . . . it contains the objects
7 | # MolIDConversion and is what is going to be updated by the user and evaulated
8 | # to determine when the next step in the program is unlocked
9 | # Create list of MolIDConversion objects using unique_molID_occur_map
10 |
11 | from __future__ import print_function
12 | import sys, glob
13 | from PDBClean import pdbcleanmolidcifutils as molidutils
14 |
15 |
16 | ########################
17 | # READ INPUT ARGUMENTS #
18 | ########################
19 | n_arg = len(sys.argv)
20 | if(n_arg<3):
21 | print('Usage error: {0} '.format(sys.argv[0]))
22 | sys.exit()
23 | source_dir = sys.argv[1]
24 | target_dir = sys.argv[2]
25 |
26 |
27 | #########################################
28 | # READ PDB FILES AND DEFINE MolID LISTS #
29 | #########################################
30 |
31 | filelist=glob.glob(source_dir+'/*.cif')
32 | master_molID_class_list = molidutils.pdb_to_masterlist(filelist)
33 | unique_molID_occur_map = molidutils.CreateMasterUniqueMolIDMap(master_molID_class_list)
34 | molIDConversion_list = molidutils.uniquelist_to_conversionlist(unique_molID_occur_map)
35 | #FAPA MARCH 2024
36 | MolID_to_files_map = molidutils.CreateMasterUniqueMolIDMapWithFileName(master_molID_class_list)
37 | MolID_occur_dict_of_lists = molidutils.CreateMasterUniqueMolIDOccursLIST(master_molID_class_list)
38 | MolID_ChainID_dict_of_lists = molidutils.CreateMasterUniqueMolIDinitialChainIDsLIST(master_molID_class_list)
39 |
40 | #####################################
41 | # INTERACTIVE MOLID CONVERSION MENU #
42 | #####################################
43 | # Goal:
44 | # users complete their molID conversion templates by ensuring that each member of
45 | # molIDConversion_list has status complete = True
46 | input_menu = ""
47 | input_menu_complete = ""
48 | # For use in the next section
49 | concat_menu = ""
50 | final_menu=""
51 |
52 | while(input_menu != "QUIT"):
53 | if (input_menu_complete == "1"):
54 | print("""Congratulations! You have successfully constructed your
55 | conversion templates. You can proceed to the next section
56 | by selection option 7 or, continue to edit your conversion
57 | template through this menu
58 | """)
59 | print("""PDBClean MolID Conversion Build Menu
60 | Select one of the following options to proceed:
61 | 1) Show full conversion
62 | 2) Show only unassigned conversions
63 | 3) Enter input file
64 | 4) Search MolID to add chain ID conversion
65 | 5) Go entry by entry to add chain ID conversion
66 | 6) Remove a chain ID conversion
67 | A) Track changes (original_chain_name:new_chain:entity:file_name)
68 | """)
69 | if (input_menu_complete == "1"):
70 | print(" 7) Continue to next step of curation")
71 | input_menu = input('Option Number: ')
72 | if (input_menu == "1"):
73 | molidutils.show_full_conversion(molIDConversion_list)
74 | elif (input_menu == "2"):
75 | molidutils.show_unassigned_conversion(molIDConversion_list)
76 | elif (input_menu == "3"):
77 | molIDConversion_list = molidutils.add_user_conversion(molIDConversion_list)
78 | elif (input_menu == "4"):
79 | molIDConversion_list = molidutils.edit_conversion_interface(molIDConversion_list, action='add')#FAPA
80 | elif (input_menu == "5"):
81 | molIDConversion_list = molidutils.edit_conversion_manual(molIDConversion_list)
82 | elif (input_menu == "6"):
83 | molIDConversion_list = molidutils.edit_conversion_interface(molIDConversion_list, action='remove')
84 | elif (input_menu == "B"): # SECRET MENU: Print entity:file_name list
85 | molidutils.Print_MolID_To_Files_Map(MolID_to_files_map,target_dir)
86 | elif (input_menu == "C"): # SECRET MENU Print CHAIN-NAME:ENTITY:FILE-NAME
87 | molidutils.show_full_conversion_and_file_list(molIDConversion_list,MolID_to_files_map,target_dir)
88 | elif (input_menu == "D"): # SECRET MENU Print similar to C but print only relevant chain names
89 | molidutils.show_full_conversion_and_file_list_by_number_chains(molIDConversion_list,MolID_to_files_map,MolID_occur_dict_of_lists,target_dir)
90 | elif (input_menu == "A"):
91 | molidutils.TEST_show_full_conversion_and_file_list_by_number_chains(MolID_ChainID_dict_of_lists,molIDConversion_list, MolID_to_files_map,
92 | MolID_occur_dict_of_lists, target_dir)
93 | elif (input_menu == "7"):
94 | if (input_menu_complete == "1"):
95 | master_molID_class_list = molidutils.update_masterlist(master_molID_class_list, molIDConversion_list)
96 | count_problems = molidutils.problem_counter(master_molID_class_list)
97 | if (count_problems == 0):
98 | final_menu = "START"
99 | elif (count_problems != 0):
100 | concat_menu = "START"
101 | input_menu = "QUIT"
102 | input_menu_complete = molidutils.check_complete(molIDConversion_list)
103 |
104 | #########################################
105 | # New menu to finalize without printing #
106 | # concatenation menu #
107 | #########################################
108 |
109 | if (final_menu == "START"):
110 |
111 | count_problems = molidutils.problem_counter(master_molID_class_list)
112 | if (count_problems == 0):
113 | final_menu_complete = "1"
114 |
115 | if (final_menu_complete == "1"):
116 | print(" 6) Finalize Curation")
117 |
118 | final_menu = input('Option Number: ')
119 |
120 | if (final_menu == "6"):
121 | print("Finalizing Curation ...")
122 | molidutils.masterlist_to_pdb(filelist, master_molID_class_list, target_dir=target_dir)
123 | final_menu = "QUIT"
124 | else:
125 | print("Sorry, something went wrong, try again")
126 |
127 |
128 |
129 | ########################################
130 | # INTERACTIVE MOLID CONCATENATION MENU #
131 | ########################################
132 |
133 |
134 | if (concat_menu == "START"):
135 | # Prepare for concatenation step
136 | # We now have to take the information contained in the MolIDConversion objects
137 | # in molIDConversion_list to update the MolID objects in master_molID_class_list
138 | # We then need to mine these updated MolID objects to figure out which ones
139 | # contain concatenated chains. These will be presented to the user in another
140 | # interactive menu section where they can update the planned conversion on
141 | # a file by file basis
142 |
143 | master_molID_class_list = molidutils.update_masterlist(master_molID_class_list, molIDConversion_list)
144 |
145 | concat_menu = ""
146 | concat_menu_complete = ""
147 |
148 | while(concat_menu != "QUIT"):
149 |
150 | count_problems = molidutils.problem_counter(master_molID_class_list)
151 | if (count_problems == 0):
152 | concat_menu_complete = "1"
153 |
154 | if (concat_menu_complete == "1"):
155 | print("""Congratulations! You have successfully constructed your
156 | conversion templates.You can proceed to the next section
157 | by selection option 6 or, continue to edit your conversion
158 | template through this menu
159 | """)
160 | print("""PDBClean Concatenations Menu
161 | -------------------------------
162 | This menu appeared because you have assigned the same chain name to two (or more) entities.
163 | Note that this will concatenate the entities. So you need to either re-assign chain names,
164 | or ACCEPT concatenations.
165 |
166 | Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can
167 | be completed.
168 |
169 | Before you do anything, we suggest to choose option 2, so you know which concatenations have not
170 | been accepted. It will also give you the proper format of the input for option 3.
171 |
172 | If you are sure that all the concatenations are correct. Option 5 will accept all of them. They
173 | will be printed to screen as they are being accepted.
174 |
175 | Select one of the following options to proceed:
176 | 1) Show all conversions
177 | 2) Show only unaccepted concatenations
178 | 3) Search and modify destination chainIDs of proposed concatenations
179 | 4) Accept proposed concatenation one by one
180 | (Repeat this step until finalizing option appears)
181 | 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)
182 | """)
183 | if (concat_menu_complete == "1"):
184 | print(" 6) Finalize Curation")
185 |
186 | concat_menu = input('Option Number: ')
187 |
188 | if (concat_menu == "1"):
189 | molidutils.show_full_conversion(master_molID_class_list, step='concatenation')
190 | elif (concat_menu == "2"):
191 | molidutils.show_unassigned_conversion(master_molID_class_list, step='concatenation')
192 | elif (concat_menu == "3"):
193 | master_molID_class_list = molidutils.edit_concatenation_interface(master_molID_class_list, action='try')[0]
194 | elif (concat_menu == "4"):
195 | unassigned_MolID=molidutils.return_unassigned_conversion(master_molID_class_list, step='concatenation')[0]
196 | print("This is the concatenation you need to accept:")
197 | new_order=None
198 | master_molID_class_list = molidutils.list_accept_concatenations(master_molID_class_list, unassigned_MolID, new_order=new_order, action='accept')[0]
199 | # Note for tomorrow: here we need to create a new function in molidutils, so we can go over all concatenations!
200 | elif (concat_menu == "5"):
201 | allnewchains=molidutils.return_unassigned_conversion(master_molID_class_list, step='concatenation')
202 | for newchain in allnewchains:
203 | new_order=None
204 | master_molID_class_list = molidutils.list_accept_concatenations_auto(master_molID_class_list, newchain, new_order=new_order, action='accept')[0]
205 | count_problems = molidutils.problem_counter(master_molID_class_list)
206 | print(count_problems)
207 | elif (concat_menu == "6"):
208 | print("Finalizing Curation ...")
209 | molidutils.masterlist_to_pdb(filelist, master_molID_class_list, target_dir=target_dir)
210 | concat_menu = "QUIT"
211 |
--------------------------------------------------------------------------------
/src/cleanutils.py:
--------------------------------------------------------------------------------
1 | import os, glob
2 | import re
3 | from Bio.PDB.MMCIF2Dict import MMCIF2Dict
4 |
def process(projdir=None, step='clean', source='raw_bank', target='clean_bank', pdbformat='.cif', verbose=True):
    """
    Process every CIF file in the source directory through one processing
    step and save the results to the target directory.

    Parameters:
    -----------
    projdir : str, optional
        The project directory containing source and target subdirectories.
        If None, `source` and `target` are resolved relative to the current
        working directory.
    step : str, optional
        The processing step applied to each CIF. 'clean' (default) rewrites
        the file keeping only a limited set of data blocks; 'simplify'
        splits it into one file per biological assembly.
    source : str, optional
        The subdirectory holding the input CIF(s). Default 'raw_bank'.
    target : str, optional
        The subdirectory where processed CIF(s) are saved. Default 'clean_bank'.
    pdbformat : str, optional
        The file extension format for CIF(s). The default is '.cif'.
    verbose : bool, optional
        If True (default), progress is printed to the console.

    Returns:
    -----------
    None
    """
    # Fall back to the current directory when no project directory is given
    # (previously source_dir/target_dir were unbound in that case, so the
    # glob below raised NameError despite the documented behavior).
    if projdir is not None:
        source_dir = os.path.join(projdir, source)
        target_dir = os.path.join(projdir, target)
    else:
        source_dir = source
        target_dir = target
    input_list = glob.glob(os.path.join(source_dir, '*' + pdbformat))

    for i, input_cif in enumerate(input_list, start=1):
        cif_name = os.path.basename(input_cif)

        if verbose:
            print('[{0}/{1}]: {2}'.format(i, len(input_list), cif_name))

        output_cif = os.path.join(target_dir, cif_name)

        if step == 'clean':
            # clean_cif appends to its output, so remove any stale file first.
            if os.path.isfile(output_cif):
                os.remove(output_cif)
            clean_cif(input_cif, output_cif)

        elif step == 'simplify':
            # NOTE(review): per-assembly files created by a previous run are
            # not removed here; consider clearing them before re-simplifying.
            simplify_cif(input_cif, output_cif, pdbformat)
55 |
def simplify_cif(oldfile, newfile, pdbformat):
    """
    Separate all biological assemblies in a CIF into separate files.

    Each output file keeps a reduced set of categories (entry id, citation
    authors, citation, resolution, entity descriptions) plus the atom
    records belonging to one biological assembly.

    Parameters:
    -----------
    oldfile : str
        Path to the original CIF file to be simplified.
    newfile : str
        Base path for the simplified output. '+00' (single assembly) or
        '+0<assembly id>' is inserted before the extension, producing one
        file per biological assembly.
    pdbformat : str
        The file format extension used when saving the new CIF (e.g. '.cif').

    Returns:
    -----------
    None
    """
    mmcif_dict = MMCIF2Dict(oldfile)

    # Create map from asym_id to assembly_id.
    # These mmCIF categories can hold either a single string or a list of
    # strings, so normalize both to lists before iterating.
    asym_assembly_map = {}
    assembly_id = mmcif_dict['_pdbx_struct_assembly_gen.assembly_id']

    if not isinstance(assembly_id, list):
        assembly_id_list = [assembly_id]
        asym_id_list = [mmcif_dict['_pdbx_struct_assembly_gen.asym_id_list']]
    else:
        assembly_id_list = assembly_id
        asym_id_list = mmcif_dict['_pdbx_struct_assembly_gen.asym_id_list']

    # Convert each comma-separated asym_id entry into individual chain ids.
    for i in range(len(assembly_id_list)):
        asym_id = re.sub(' ', '', asym_id_list[i].strip())
        for ident in asym_id.split(','):
            asym_assembly_map[ident] = assembly_id_list[i]

    for assembly in assembly_id_list:

        if len(assembly_id_list) == 1:
            newciffilename = str(re.sub(pdbformat, '', newfile)) + "+00"
        else:
            # NOTE(review): assumes assembly ids are single-digit strings;
            # ids >= 10 would produce '+010'-style suffixes — confirm.
            newciffilename = str(re.sub(pdbformat, '', newfile)) + "+0" + str(assembly)

        newciffile = open(newciffilename + pdbformat, 'w')
        newciffile.write("data_" + newciffilename + "\n")

        # Writes entry.id
        newciffile.write("#\n")

        # Changes the list format to str
        # NOTE(review): when '_entry.id' is a list this writes its repr
        # (e.g. "['1ABC']") — confirm this is the intended output format.
        L = str(mmcif_dict['_entry.id'])
        entryid = '_entry.id ' + L
        newciffile.write(entryid + "\n")

        # Write Audit (citation author) category
        newciffile.write("#\n")
        newciffile.write("loop_\n")
        newciffile.write("_citation_author.name\n")

        if '_citation_author.name' in mmcif_dict:
            L = mmcif_dict['_citation_author.name']
        else:
            L = "???"

        if isinstance(L, list):
            for i in L:
                newciffile.write("'" + re.sub("'", "", i) + "'" + "\n")
        else:
            newciffile.write("'" + re.sub("'", "", L) + "'" + "\n")

        # Writes Citation category
        newciffile.write("#" + "\n")
        newciffile.write("loop_" + "\n")
        newciffile.write("_citation.title" + "\n")
        newciffile.write("_citation.year" + "\n")
        newciffile.write("_citation.pdbx_database_id_DOI" + "\n")
        L1 = mmcif_dict['_citation.title']
        L2 = mmcif_dict['_citation.year']
        L3 = mmcif_dict['_citation.pdbx_database_id_DOI']
        if isinstance(L1, list):
            for i in range(len(L1)):
                newciffile.write("'" + re.sub("\n", " ", re.sub("'", "", L1[i])) + "' " + L2[i] + " " + L3[i] + "\n") #FAPA
        else:
            # Bug fix: this branch previously indexed L1[i]/L2[i]/L3[i] with
            # an undefined loop variable; use the scalar values directly.
            newciffile.write("'" + re.sub("\n", " ", re.sub("'", "", L1)) + "' " + L2 + " " + L3 + "\n") #FAPA

        # Writes Resolution category
        newciffile.write("#" + "\n")
        newciffile.write("loop_" + "\n")
        newciffile.write("_exptl.method" + "\n")
        newciffile.write("_exptl.resolution" + "\n")
        # Method and resolution live in different categories depending on
        # the experiment type (X-ray vs cryo-EM); try each in turn.
        if '_exptl.method' in mmcif_dict:
            L1 = mmcif_dict['_exptl.method']
        elif '_refine_hist.pdbx_refine_id' in mmcif_dict:
            L1 = mmcif_dict['_refine_hist.pdbx_refine_id']
        else:
            L1 = mmcif_dict['_refine.pdbx_refine_id']
        if '_refine.ls_d_res_high' in mmcif_dict:
            L2 = mmcif_dict['_refine.ls_d_res_high']
        elif '_em_3d_reconstruction.resolution' in mmcif_dict:
            L2 = mmcif_dict['_em_3d_reconstruction.resolution']
        elif '_refine_hist.d_res_high' in mmcif_dict:
            L2 = mmcif_dict['_refine_hist.d_res_high']
        else:
            L2 = '????'
        if isinstance(L1, list) and isinstance(L2, list):
            for i in range(len(L1)):
                newciffile.write("'" + L1[i] + "' " + L2[i] + " " + "\n")
        elif isinstance(L1, list) and not isinstance(L2, list):
            newciffile.write("'" + L1[0] + "' " + L2 + " " + "\n")
        elif not isinstance(L1, list) and isinstance(L2, list):
            newciffile.write("'" + L1 + "' " + L2[0] + " " + "\n")
        else:
            newciffile.write("'" + L1 + "' " + L2 + " " + "\n")

        # Writes Entity category (descriptions upper-cased, ':' and quotes stripped)
        newciffile.write("#" + "\n")
        newciffile.write("loop_" + "\n")
        newciffile.write("_entity.id" + "\n")
        newciffile.write("_entity.pdbx_description" + "\n")
        L1 = mmcif_dict['_entity.id']
        L2 = mmcif_dict['_entity.pdbx_description']
        for i in range(len(L1)):
            L2[i] = L2[i].upper()
            L2[i] = L2[i].replace(":", "")
            newciffile.write(L1[i] + " '" + L2[i].replace("'", "") + "'\n")

        # Writes the coordinate portion of the file
        newciffile.write("#" + "\n")
        newciffile.write("loop_" + "\n")
        newciffile.write("_atom_site.group_PDB" + "\n")
        newciffile.write("_atom_site.id" + "\n")
        newciffile.write("_atom_site.type_symbol" + "\n")
        newciffile.write("_atom_site.label_atom_id" + "\n")
        newciffile.write("_atom_site.label_alt_id" + "\n")
        newciffile.write("_atom_site.label_comp_id" + "\n")
        newciffile.write("_atom_site.label_asym_id" + "\n")
        newciffile.write("_atom_site.label_entity_id" + "\n")
        newciffile.write("_atom_site.label_seq_id" + "\n")
        newciffile.write("_atom_site.pdbx_PDB_ins_code" + "\n")
        newciffile.write("_atom_site.Cartn_x" + "\n")
        newciffile.write("_atom_site.Cartn_y" + "\n")
        newciffile.write("_atom_site.Cartn_z" + "\n")
        newciffile.write("_atom_site.occupancy" + "\n")
        newciffile.write("_atom_site.B_iso_or_equiv" + "\n")
        newciffile.write("_atom_site.auth_seq_id" + "\n")
        newciffile.write("_atom_site.auth_comp_id" + "\n")
        newciffile.write("_atom_site.auth_asym_id" + "\n")
        newciffile.write("_atom_site.auth_atom_id" + "\n")
        newciffile.write("_atom_site.pdbx_PDB_model_num" + "\n")
        L1 = mmcif_dict['_atom_site.group_PDB']
        L2 = mmcif_dict['_atom_site.id']
        L3 = mmcif_dict['_atom_site.type_symbol']
        L4 = mmcif_dict['_atom_site.label_atom_id']
        L5 = mmcif_dict['_atom_site.label_alt_id']
        L6 = mmcif_dict['_atom_site.label_comp_id']
        L7 = mmcif_dict['_atom_site.label_asym_id']
        L8 = mmcif_dict['_atom_site.label_entity_id']
        L9 = mmcif_dict['_atom_site.label_seq_id']
        L10 = mmcif_dict['_atom_site.pdbx_PDB_ins_code']
        L11 = mmcif_dict['_atom_site.Cartn_x']
        L12 = mmcif_dict['_atom_site.Cartn_y']
        L13 = mmcif_dict['_atom_site.Cartn_z']
        L14 = mmcif_dict['_atom_site.occupancy']
        L15 = mmcif_dict['_atom_site.B_iso_or_equiv']
        L16 = mmcif_dict['_atom_site.auth_seq_id']
        L17 = mmcif_dict['_atom_site.auth_comp_id']
        L18 = mmcif_dict['_atom_site.auth_asym_id']
        L19 = mmcif_dict['_atom_site.auth_atom_id']
        L20 = mmcif_dict['_atom_site.pdbx_PDB_model_num']

        # Only atoms whose label_asym_id belongs to this assembly are kept,
        # so each assembly goes to its own output file.
        # Bug fix: previously the raw '_pdbx_struct_assembly_gen.asym_id_list'
        # value was re-read and indexed with int(assembly)-1, which (a) indexed
        # characters when the value is a plain string (single assembly) and
        # (b) assumed assembly ids are consecutive 1-based integers. Use the
        # normalized list and index by the assembly's position instead.
        assembly_chains = asym_id_list[assembly_id_list.index(assembly)]
        assembly_chains = re.sub(' ', '', assembly_chains.strip()).split(',')

        for i in range(len(L1)):
            if L7[i] in assembly_chains:
                newciffile.write(L1[i] + " " + L2[i] + " " + L3[i] + ' "' + L4[i] + '" ' + L5[i] + " " + L6[i] + " " + L7[i] + " " + L8[i] + " " + L9[i] + " " + L10[i] + " " + L11[i] + " " + L12[i] + " " + L13[i] + " " + L14[i] + " " + L15[i] + " " + L16[i] + " " + L17[i] + " " + L18[i] + ' "' + L19[i] + '" ' + L20[i] + "\n")
        newciffile.write("#" + "\n")
        # Close explicitly so data is flushed before the next assembly
        # (previously the handle was leaked).
        newciffile.close()
243 |
244 |
245 | #
def clean_cif(oldfile, newfile):
    """
    Rewrites CIF, including only a limited set of data blocks.

    Only the categories named in `entry_list` are copied to the new file;
    everything else is dropped. The output is opened in append mode, so any
    pre-existing file at `newfile` should be removed by the caller first
    (process() does this).

    Parameters:
    -----------
    oldfile : str
        The path to the original CIF(s) needed to be cleaned.
    newfile : str
        The path where the cleaned CIF(s) will be written.

    Returns:
    -----------
    None
    """
    # Category names to keep. keylength_list[i] is the number of leading
    # characters of a line compared against entry_list[i] by
    # check_and_write_entry (each equals the length of its entry name).
    entry_list = ['_entry.id',
                  '_atom_site.group_PDB',
                  '_citation_author.name',
                  '_citation.title',
                  '_pdbx_struct_assembly_gen.assembly_id',
                  '_entity.pdbx_description',
                  '_exptl.method',
                  '_em_3d_reconstruction.resolution',
                  '_refine_hist.pdbx_refine_id',
                  '_refine.pdbx_refine_id']
    keylength_list = [9,
                      20,
                      21,
                      15,
                      37,
                      24,
                      13,
                      32,
                      27,
                      22]
    with open(oldfile) as old_file:
        alllines = []    # every line read so far, indexed by linecount
        linecount = 0    # index of the current line
        poundline = 0    # index of the most recent '#' separator line
        flag = 0         # 1 while scanning a data block we want to keep
        for line in old_file:
            alllines.append(line)
            # Copy the very first line (the data_ header) verbatim.
            if linecount == 0:
                with open(newfile, 'a') as new_file:
                    new_file.write(alllines[0])
            # For each kept category: raise the flag when the current line
            # starts a matching block, and flush the whole block
            # (poundline..linecount) to newfile when the closing '#' arrives.
            for entry, keylength in zip(entry_list, keylength_list):
                flag = check_and_write_entry(entry, line, alllines, line[0:keylength], flag, range(poundline, linecount), newfile)
            # '#' lines delimit mmCIF data blocks; remember where the
            # current block started.
            if '#' in line[0]:
                poundline = linecount
            linecount += 1
    # Terminate the cleaned file with a final block separator.
    with open(newfile, 'a') as new_file:
        new_file.write('#\n')
298 | #
def check_and_write_entry(entry, line, alllines, key, flag, linerange, newfile):
    """
    Update the copy flag for one kept CIF category and flush pending lines.

    If `entry` occurs in `key` (the truncated prefix of the current line),
    the flag is raised. Otherwise, when the flag is already raised and the
    current line begins with '#', the lines of `alllines` selected by
    `linerange` are appended to `newfile` and the flag is cleared.

    Parameters:
    -----------
    entry : str
        The specific CIF entry to look for in the line (e.g. '_entry.id').
    line : str
        The current line being read from the CIF.
    alllines : list
        A list of all lines read so far from the CIF.
    key : str
        The substring of the current line that is compared to the entry.
    flag : int
        1 if the desired entry has already been found, 0 otherwise.
    linerange : range
        Indices into `alllines` to append to the new file when flushing.
    newfile : str
        Path of the output CIF that receives the flushed lines.

    Returns:
    -----------
    int
        The updated flag: 1 while inside a kept category, 0 otherwise.
    """
    if entry in key:
        return 1
    block_closed = (flag == 1) and (line[0] == '#')
    if block_closed:
        # Append the whole buffered block, then reset for the next category.
        with open(newfile, 'a') as out:
            out.writelines(alllines[idx] for idx in linerange)
        return 0
    return flag
333 |
--------------------------------------------------------------------------------
/src/alignmentutils.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 | import os
4 | import time
5 | from Bio import SeqIO
6 | import numpy as np
7 |
8 | # AA Map from 3 letter amino acid id to 1 letter id
9 | # it also includes nucleic acids, including post-tranlational modifications,
10 | # which are mapped to ACTUG.
def ResnConvert(resn):
    """
    Converts the 3 letter amino acid id into a singular letter ID.

    Covers the 20 standard amino acids plus nucleotides, including
    post-transcriptionally modified ones, which are mapped onto A/C/U/G.

    Parameters:
    -----------
    resn : str
        The 3 letter amino acid id

    Returns:
    --------
    ans : str
        The one-letter ID representing the 3 letter amino acid ID. If there's none, "X" is
        returned.
    """
    code_table = {
        "UNK": "X",
        # Standard amino acids.
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
        # Nucleotides and modified nucleotides.
        "A": "A", "C": "C", "U": "U", "G": "G",
        "2MA": "A", "3AU": "U", "4AC": "C", "4OC": "C", "4SU": "U",
        "5MC": "C", "5MU": "U", "6IA": "A", "6MZ": "U", "7MG": "G",
        "8AN": "A", "CM0": "C", "G7M": "G", "H2U": "U", "MIA": "A",
        # NOTE(review): "OMG" (2'-O-methylguanosine) maps to "C" here,
        # matching the original table — confirm "G" was not intended.
        "OMC": "C", "OMG": "C", "PSU": "U", "QUO": "G", "T6A": "A",
        "U8U": "U", "YG": "G",
    }
    return code_table.get(resn, "X")
81 | # END AA Map from 3 letter amino acid id to 1 letter id
82 |
def AlignSequences(sequence_vec):
    """
    Takes a list of sequence strings and performs a MUSCLE alignment,
    outputting a vector of aligned sequence strings.

    Requires the external `muscle` executable on PATH; writes the
    intermediate files Seq.fa and Seq.afa in the current directory.

    Parameters:
    -----------
    sequence_vec : list of str
        list of sequences to be aligned

    Returns:
    --------
    aligned_seq : list of str
        A list containing the aligned sequences
    """

    # Write all sequences to a FASTA file; every record gets the same
    # "> Seq" header, since only the order of records is used later.
    with open("Seq.fa", 'w') as newfafile:
        for seq in sequence_vec:
            newfafile.write("> Seq" + "\n")
            newfafile.write(seq + "\n")

    # Launch muscle asynchronously through a pipe.
    process=os.popen('muscle -align Seq.fa -output Seq.afa')

    process  # no-op: the pipe is never read; completion is detected by polling below

    time.sleep(1) #FAPA

    #FAPA START
    # Poll until muscle has created the output file ...
    while not os.path.exists("Seq.afa"):
        time.sleep(1) #FAPA, WAITING LESS TIME
        print("waiting...")

    # ... and until it is at least as large as the input, taken as a sign
    # that the alignment finished writing.
    # NOTE(review): this heuristic can loop forever if muscle fails or
    # emits a smaller file — confirm acceptable for the intended use.
    while not os.path.getsize("Seq.afa") >= os.path.getsize("Seq.fa"):
        time.sleep(1) #FAPA, WAITING LESS TIME
        print("waiting even more...")

    #FAPA ENDS

    # Parse the aligned FASTA back into a plain list of sequence strings.
    aligned_seq = []
    with open("Seq.afa") as seqfile:
        seq = ""
        for line in seqfile:
            if (line[0] == ">"):
                # Header line: flush the previous record, if any.
                if (seq != ""):
                    aligned_seq.append(seq)
                    seq = ""
            else:
                seq += line.strip()
        # Flush the final record.
        aligned_seq.append(seq)

    process.close()
    return (aligned_seq)
135 | # END AlignSequences
136 |
def AlignSequences_v2(sequence_vec, file_name, this_chainsseq_list_ids):
    """
    Takes a list of sequence strings and performs a MUSCLE alignment, outputting
    a vector of aligned sequence strings.

    Requires the external `muscle` executable on PATH; writes
    <file_name>.fa and <file_name>.fasta in the current directory.

    Parameters:
    -----------
    sequence_vec : list of str
        list of sequences to be aligned
    file_name : str
        Name given to FASTA file
    this_chainsseq_list_ids : list of str
        List of identifiers for each sequence which are also used as headers in the
        FASTA file.

    Returns:
    --------
    aligned_seq_map : dict
        A dictionary where the keys are the sequence identifiers from `this_chainsseq_list_ids`
        and the values are the corresponding aligned sequence strings.
    """
    # Takes a list of sequence strings and performs a MUSCLE alignment, outputting a vector of aligned sequence strings
    # Headers are written as "> Seq <id>" so the id can be recovered after
    # muscle reorders the records.
    with open(file_name+".fa", 'w') as newfafile:
        i = 0
        for seq in sequence_vec:
            newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n")
            newfafile.write(seq + "\n")
            i += 1
    #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta"
    command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" #FAPA 26AUG2025

    process = os.popen(command)

    process  # no-op: the pipe is never read; completion is detected by polling below

    #FAPA START
    # Poll until muscle has created a non-empty output file.
    while not os.path.exists(file_name+".fasta"):
        time.sleep(10)
        print("waiting...")

    while not os.path.getsize(file_name+".fasta") > 0:
        time.sleep(10)
        print("waiting even more...")

    # Parse the aligned FASTA back into a map keyed by sequence id.
    # The id is the third whitespace token of each header line
    # ("> Seq <id>" splits into [">", "Seq", "<id>"]).
    aligned_seq_map = {}
    aligned_seq = []  # NOTE(review): built but never returned — confirm it can be dropped
    seq = ""
    with open(file_name + ".fasta") as seqfile:
        for line in seqfile:
            if (line[0] == ">"):
                # Very first line
                if (seq == ""):
                    line = line.strip()
                    line = line.split()
                    key = line[2]
                else:
                    # Flush the previous record, then start the next one.
                    aligned_seq_map[key] = seq
                    seq = ""
                    line = line.strip()
                    line = line.split()
                    key = line[2]
            else:
                seq += line.strip()
    # Flush the final record.
    aligned_seq_map[key] = seq

    for item in (aligned_seq_map.keys()):
        aligned_seq.append(aligned_seq_map[item])

    process.close()
    return (aligned_seq_map)
207 |
208 | # END AlignSequences
209 |
210 | # FAPA MAY TEST BEGIN
211 |
def AlignSequences_v3(sequence_vec, file_name, this_chainsseq_list_ids):
    """
    Takes a list of sequence strings and performs a MUSCLE alignment,
    outputting a vector of aligned sequence strings. This version checks
    if an alignment has already been provided, before running muscle, and
    in that case, just reads the existing alignment.

    Requires the external `muscle` executable on PATH when no alignment
    exists; writes <file_name>.fa and <file_name>.fasta in the current
    directory.

    Parameters:
    -----------
    sequence_vec : list of str
        list of sequences to be aligned
    file_name : str
        Name given to FASTA file
    this_chainsseq_list_ids : list of str
        List of identifiers for each sequence which are also used as headers in the
        FASTA file.

    Returns:
    --------
    aligned_seq_map : dict
        A dictionary where the keys are sequence identifiers from `this_chainsseq_list_ids` and
        the values are the corresponding aligned sequence strings.
    """
    # Only run muscle when no previous alignment output is present.
    if os.path.exists(file_name+".fasta") == False:

        # Headers are written as "> Seq <id>" so the id can be recovered
        # after muscle reorders the records.
        with open(file_name+".fa", 'w') as newfafile:
            i = 0
            for seq in sequence_vec:
                newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n")
                newfafile.write(seq + "\n")
                i += 1

        #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta"
        command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" # FAPA 26AUG2025
        process = os.popen(command)
        process  # no-op: the pipe is never read; completion is detected by polling below

        # Poll until muscle has created a non-empty output file.
        while not os.path.exists(file_name+".fasta"):
            time.sleep(10)
            print("waiting...")

        while not os.path.getsize(file_name+".fasta") > 0:
            time.sleep(10)
            print("waiting even more...")
        process.close()

    else:
        print("Alignment already exists, so I will use that one!")

    # Parse the aligned FASTA into a map keyed by sequence id (the third
    # whitespace token of each "> Seq <id>" header line).
    aligned_seq_map = {}
    aligned_seq = []  # NOTE(review): built but never returned — confirm it can be dropped
    seq = ""
    with open(file_name + ".fasta") as seqfile:
        for line in seqfile:
            if (line[0] == ">"):
                # Very first line
                if (seq == ""):
                    line = line.strip()
                    line = line.split()
                    key = line[2]
                else:
                    # Flush the previous record, then start the next one.
                    aligned_seq_map[key] = seq
                    seq = ""
                    line = line.strip()
                    line = line.split()
                    key = line[2]
            else:
                seq += line.strip()
    # Flush the final record.
    aligned_seq_map[key] = seq

    for item in (aligned_seq_map.keys()):
        aligned_seq.append(aligned_seq_map[item])


    return (aligned_seq_map)
287 |
288 |
289 | # FAPA JULY TEST STARTS HERE
290 |
def AlignSequences_v4(sequence_vec, file_name, this_chainsseq_list_ids):
    """
    Takes a list of sequence strings and performs a MUSCLE alignment, outputting
    a vector of aligned sequence strings. This version checks if an alignment has
    already been provided, before running muscle, and in that case just reads the
    existing alignment. It additionally reports the percentage of gaps at each
    alignment position.

    Requires the external `muscle` executable on PATH when no alignment
    exists; writes <file_name>.fa and <file_name>.fasta in the current
    directory.

    Parameters:
    -----------
    sequence_vec : list of str
        list containing sequences from FASTA files
    file_name : str
        Name given to FASTA file
    this_chainsseq_list_ids : list of str
        List of identifiers for each sequence which are also used as headers in the
        FASTA file.

    Returns:
    --------
    aligned_seq_map : dict
        A dictionary where the keys are the sequence identifiers from `this_chainsseq_list_ids`
        and the values are the corresponding aligned sequence strings
    gap_percentages : np.ndarray
        An array where each element represents the percentage of gaps at that position
        across all sequences.
    """
    # Only run muscle when no previous alignment output is present.
    if os.path.exists(file_name+".fasta") == False:

        # Headers are written as "> Seq <id>" so the id can be recovered
        # after muscle reorders the records.
        with open(file_name+".fa", 'w') as newfafile:
            i = 0
            for seq in sequence_vec:
                newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n")
                newfafile.write(seq + "\n")
                i += 1

        #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta"
        command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" # FAPA 26AUG2025
        process = os.popen(command)
        process  # no-op: the pipe is never read; completion is detected by polling below

        #FAPA START
        # Poll until muscle has created a non-empty output file.
        while not os.path.exists(file_name+".fasta"):
            time.sleep(10)
            print("waiting...")

        while not os.path.getsize(file_name+".fasta") > 0:
            time.sleep(10)
            print("waiting even more...")

        #FAPA ENDS
        process.close()

    else:
        print("Alignment already exists, so I will use that one!")

    # Parse the aligned FASTA into a map keyed by sequence id (the third
    # whitespace token of each "> Seq <id>" header line).
    aligned_seq_map = {}
    aligned_seq = []  # NOTE(review): built but never returned — confirm it can be dropped
    seq = ""
    with open(file_name + ".fasta") as seqfile:
        for line in seqfile:
            if (line[0] == ">"):
                # Very first line
                if (seq == ""):
                    line = line.strip()
                    line = line.split()
                    key = line[2]
                else:
                    # Flush the previous record, then start the next one.
                    aligned_seq_map[key] = seq
                    seq = ""
                    line = line.strip()
                    line = line.split()
                    key = line[2]
            else:
                seq += line.strip()
    # Flush the final record.
    aligned_seq_map[key] = seq

    for item in (aligned_seq_map.keys()):
        aligned_seq.append(aligned_seq_map[item])

    print(file_name)

    # Re-read the alignment and compute the per-position gap percentage.
    sequences = read_fasta_files( file_name + ".fasta")
    gap_percentages = calculate_gap_percentages(sequences)

    #print(aligned_seq_map)
    print("Gap percentages per position:")
    print(gap_percentages)

    return (aligned_seq_map,gap_percentages)
380 |
381 | # The functions below are used to calculate the percentage of gaps per position
def read_fasta_files(fasta_file):
    """
    Reads FASTA files and extracts the sequences into a list.

    Parameters:
    -----------
    fasta_file : str
        Path to FASTA file containing all the sequences

    Returns:
    --------
    sequences : list of str
        list of sequences from a FASTA file, in file order
    """
    return [str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")]
400 |
def calculate_gap_percentages(sequences):
    """
    Calculates the percentage of gaps ('-') at each position in a list of sequences.

    The number of positions is taken from the first sequence; the inputs are
    expected to be aligned (all the same length, as produced by MUSCLE).

    Parameters:
    -----------
    sequences : list of str
        list of sequences extracted from a FASTA file

    Returns:
    --------
    gap_percentages : np.ndarray
        An array where each element represents the percentage (0-100) of gaps
        at that position across all sequences. Empty array when `sequences`
        is empty.
    """
    # Guard the empty case explicitly (previously this raised IndexError on
    # sequences[0] and would divide by zero).
    if not sequences:
        return np.zeros(0)

    sequence_length = len(sequences[0])
    gap_counts = np.zeros(sequence_length)

    for sequence in sequences:
        for i, char in enumerate(sequence):
            if char == '-':
                gap_counts[i] += 1

    total_sequences = len(sequences)
    gap_percentages = (gap_counts / total_sequences) * 100
    return gap_percentages
427 |
428 | # FAPA JULY TEST ENDS
429 |
def ScoreSequenceAlignment(seq1, seq2):
    """
    Compares the reference sequence to another sequence and counts for similarity based on
    exact matches between corresponding elements from the two sequences.

    The comparison runs over the length of `seq1`, so `seq2` must be at
    least as long.

    Parameters:
    -----------
    seq1 : str
        The reference sequence
    seq2 : str
        The sequence being compared

    Returns:
    --------
    score : float
        The fraction of positions (over len(seq1)) where both sequences agree.
    """
    # Scores based on exact identity. Should maybe be updated to take longer
    # of sequences so that it can be used with unaligned seq strings too
    matches = sum(1 for i in range(len(seq1)) if seq1[i] == seq2[i])
    return matches / len(seq1)
455 |
--------------------------------------------------------------------------------
/Notebooks/Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "2d065185",
6 | "metadata": {},
7 | "source": [
8 | "# Assign MolID to the entities found in the CIF files (1) "
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "729c6719",
14 | "metadata": {},
15 | "source": [
16 | "## What is the goal of this notebook?\n",
17 | "\n",
18 | "We will run `PDBClean_MolID_CIF.py` to re-assign the MolID to the entities found in our new ensemble of CIF files. \n",
19 | "The script goes over all the CIF files and collects all entities. The user can then decide what MolID to assign them. \n",
20 | "\n",
21 | "There are also some other benefits from running this script: \n",
22 | "\n",
23 | "- You can assign the same MolID to different entities. In that case these entities will be concatenated. User needs to accept each concatenation manually. \n",
 24 |     "- Inspecting the list of entities will allow users to identify structures that need to be removed from the ensemble.\n",
25 | "- Make sure that the MolIDs of the structures in the ensemble are consistent (the same chain is named always the same, even in different structures).\n",
26 | "\n",
27 | "This notebook will go over the cases described above. \n",
28 | "\n",
 29 |     ">**NOTE:** For this tutorial, we will not use the whole ensemble we downloaded. We will use a subsample of only 7 structures. The next cells will create the new directory. Notice that we are choosing these 7 structures from the ones we downloaded. We chose these ones to highlight some possible issues you may run into when running this script."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "id": "74c0c396",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from PDBClean import pdbclean_io"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "id": "0b4b831c",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "PROJDIR=\"./TIM/\""
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "id": "ee39d3ed",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='simple_bank_sub')"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "3ce20e15",
65 | "metadata": {},
66 | "source": [
67 | "> Let's copy some structures from our simple_bank into the newly created 'simple_bank_sub' directory"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "id": "ee9230a2",
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "!cp $PROJDIR/simple_bank/1klg+00.cif $PROJDIR/simple_bank_sub/\n",
78 | "!cp $PROJDIR/simple_bank/2y62+00.cif $PROJDIR/simple_bank_sub/\n",
79 | "!cp $PROJDIR/simple_bank/1ag1+00.cif $PROJDIR/simple_bank/1aw1+01.cif $PROJDIR/simple_bank_sub/\n",
80 | "!cp $PROJDIR/simple_bank/1aw1+02.cif $PROJDIR/simple_bank/1aw1+03.cif $PROJDIR/simple_bank_sub/\n",
81 | "!cp $PROJDIR/simple_bank/1aw1+04.cif $PROJDIR/simple_bank_sub/"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 5,
87 | "id": "ecf54395",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "id": "2e6c151d",
97 | "metadata": {},
98 | "source": [
99 | "## Running PDBClean_MolID_CIF.py \n",
100 | "\n",
101 | "Notice that the way to run this script in the terminal is as following:\n",
102 | "\n",
103 | "> PDBClean_MolID_CIF.py `{Input Directory}` `{Output Directory}`\n",
104 | "\n",
105 | "The input directory contains the structures that we generated in Step 1. The output directory is where the new structures will be stored. \n",
106 | "\n",
107 | "Running this script will print a menu to screen. In the next cell we run the script and give 2 as input, so that we can select option `2) Show only unassigned conversions`. Then we `QUIT` the program. \n",
108 | "\n",
 109 |     "**Note:** We recommend running the script directly on the terminal. We are running it from the notebook just for demonstration purposes."
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "id": "f7b40709",
116 | "metadata": {
117 | "scrolled": false
118 | },
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 7)\n",
125 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 7)\n",
126 | "Reading: ./TIM//simple_bank_sub/1klg+00.cif (3 of 7)\n",
127 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (4 of 7)\n",
128 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (5 of 7)\n",
129 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (6 of 7)\n",
130 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (7 of 7)\n",
131 | "PDBClean MolID Conversion Build Menu\n",
132 | " Select one of the following options to proceed:\n",
133 | " 1) Show full conversion\n",
134 | " 2) Show only unassigned conversions\n",
135 | " 3) Enter input file\n",
136 | " 4) Search MolID to add chain ID conversion\n",
137 | " 5) Go entry by entry to add chain ID conversion\n",
138 | " 6) Remove a chain ID conversion\n",
139 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
140 | " \n",
141 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
142 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
143 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
144 | "2:GLYCEROL:\n",
145 | "4:WATER:\n",
146 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
147 | "1:PHOSPHATE ION:\n",
148 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN:\n",
149 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN:\n",
150 | "1:TRIOSEPHOSPHATE ISOMERASE PEPTIDE:\n",
151 | "1:ENTEROTOXIN TYPE C-3:\n",
152 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
153 | "You need to accept 12 entity conversions\n",
154 | "You need to accept 18 total chain conversions\n",
155 | "PDBClean MolID Conversion Build Menu\n",
156 | " Select one of the following options to proceed:\n",
157 | " 1) Show full conversion\n",
158 | " 2) Show only unassigned conversions\n",
159 | " 3) Enter input file\n",
160 | " 4) Search MolID to add chain ID conversion\n",
161 | " 5) Go entry by entry to add chain ID conversion\n",
162 | " 6) Remove a chain ID conversion\n",
163 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
164 | " \n",
165 | "Option Number: "
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "! echo '2\\nQUIT' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank\n"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "id": "ad8cc03d",
176 | "metadata": {},
177 | "source": [
178 | "## What does the output mean?\n",
179 | "\n",
180 | "`1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
181 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
182 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
183 | "2:GLYCEROL:\n",
184 | "4:WATER:\n",
185 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
186 | "1:PHOSPHATE ION:\n",
187 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN:\n",
188 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN:\n",
189 | "1:TRIOSEPHOSPHATE ISOMERASE PEPTIDE:\n",
190 | "1:ENTEROTOXIN TYPE C-3:\n",
191 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
192 | "You need to accept 12 entity conversions\n",
193 | "You need to accept 18 total chain conversions`\n",
194 | "\n",
195 | "\n",
 196 |     "The output printed to screen, and reproduced right above in this cell, tells us how many MolIDs (think of them as chains) are part of each entity. For example, the first line tells us that in one of the files, there is one entity `TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM` that contains one MolID. We also see that in the case of `WATER`, there are 4 MolIDs that we need to assign. \n",
197 | "\n",
 198 |     "The last two lines tell us how many entities were found as well as the total number of chains."
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "id": "140a987e",
204 | "metadata": {},
205 | "source": [
206 | "## Inspect the entities in your ensemble. A way to detect outliers:\n",
207 | "\n",
 208 |     "Another advantage of reading this list, is that we can take a look at all the entities that are present in our ensemble. In our tutorial example, we used the keyword 'triosephosphate isomerase'. If you read this list, you may find some suspicious entities, such as `HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN`. A closer inspection of the list also reveals `TRIOSEPHOSPHATE ISOMERASE PEPTIDE`, which suggests that it only contains a fragment of the protein. \n",
209 | "\n",
210 | "Since these are suspicious entries, we can further inspect the CIF files that contain these entities. First, we need to figure out which are the CIF files. The next cell shows a way to do it:"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 7,
216 | "id": "6b576fa4",
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "./TIM//simple_bank_sub/1klg+00.cif:1 'HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN'\n",
224 | "./TIM//simple_bank_sub/1klg+00.cif:2 'HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN'\n",
225 | "./TIM//simple_bank_sub/1klg+00.cif:3 'TRIOSEPHOSPHATE ISOMERASE PEPTIDE'\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "! grep \"HLA CLASS II HISTOCOMPATIBILITY ANTIGEN\" $PROJDIR/simple_bank_sub/*cif \n",
231 | "! grep \"TRIOSEPHOSPHATE ISOMERASE PEPTIDE\" $PROJDIR/simple_bank_sub/*cif "
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "4d1e8d83",
237 | "metadata": {},
238 | "source": [
239 | "These entities come from one single CIF file: 1klg+00.cif \n",
240 | "\n",
 241 |     "By reading the CIF file (run the cell below, removing the '#'), or using a molecular visualization tool, the user can see that this is an outlier. It was selected because there is a small fragment of the triosephosphate isomerase, but the main structure is of the HLA Class II Histocompatibility antigen. It is best to remove these structures from our ensemble. "
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 8,
247 | "id": "3ceb29e8",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "# ! cat $PROJDIR/simple_bank_sub/1klg+00.cif"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 9,
257 | "id": "f0e21fee",
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# Remove problematic CIF file\n",
262 | "\n",
263 | "! rm $PROJDIR/simple_bank_sub/1klg+00.cif \n"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "id": "8eb4a8da",
269 | "metadata": {},
270 | "source": [
271 | "## How to assign new MolID? \n",
272 | "\n",
273 | "Let's rerun `PDBClean_MolID_CIF.py` with our subsampled ensemble, now with only 6 structures. "
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 10,
279 | "id": "e923fda1",
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 6)\n",
287 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 6)\n",
288 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (3 of 6)\n",
289 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (4 of 6)\n",
290 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (5 of 6)\n",
291 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (6 of 6)\n",
292 | "PDBClean MolID Conversion Build Menu\n",
293 | " Select one of the following options to proceed:\n",
294 | " 1) Show full conversion\n",
295 | " 2) Show only unassigned conversions\n",
296 | " 3) Enter input file\n",
297 | " 4) Search MolID to add chain ID conversion\n",
298 | " 5) Go entry by entry to add chain ID conversion\n",
299 | " 6) Remove a chain ID conversion\n",
300 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
301 | " \n",
302 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
303 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
304 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
305 | "2:GLYCEROL:\n",
306 | "2:WATER:\n",
307 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
308 | "1:PHOSPHATE ION:\n",
309 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
310 | "You need to accept 8 entity conversions\n",
311 | "You need to accept 12 total chain conversions\n",
312 | "PDBClean MolID Conversion Build Menu\n",
313 | " Select one of the following options to proceed:\n",
314 | " 1) Show full conversion\n",
315 | " 2) Show only unassigned conversions\n",
316 | " 3) Enter input file\n",
317 | " 4) Search MolID to add chain ID conversion\n",
318 | " 5) Go entry by entry to add chain ID conversion\n",
319 | " 6) Remove a chain ID conversion\n",
320 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
321 | " \n",
322 | "Option Number: "
323 | ]
324 | }
325 | ],
326 | "source": [
327 | "! echo '2\\nQUIT' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "id": "9191e8cc",
333 | "metadata": {},
334 | "source": [
335 | "### Renaming MolID, how to choose a name? \n",
336 | "\n",
 337 |     "This is a personal decision. You can decide how to name each entity. For example, the easiest way is to assign a different MolID to each entity, as shown in the table below:\n",
338 | "\n",
339 | "| New MolID | ENTITIES |\n",
340 | "|---|:---|\n",
341 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n",
342 | "| B | 1:SN-GLYCEROL-3-PHOSPHATE: |\n",
343 | "| C | 1:SN-GLYCEROL-1-PHOSPHATE: |\n",
344 | "| D,E | 2:GLYCEROL: |\n",
345 | "| F,G | 2:WATER: |\n",
346 | "| H,I | 2:TRIOSEPHOSPHATE ISOMERASE: |\n",
347 | "| J | 1:PHOSPHATE ION: |\n",
348 | "| K,L | 2:2-PHOSPHOGLYCOLIC ACID: | \n",
349 | "\n",
350 | "\n",
351 | "We need to input the new assignment manually when it is printed on screen. Notice that in the next cell, `echo` allows us to type the input in advance. \n",
352 | "\n",
353 | "`2) Show only unassigned conversions` -> `5) Go entry by entry to add chain ID conversion` -> `Letters we chose on the table in this cell` -> `7) Continue to next step of curation` -> `6) Finalize Curation`\n"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 11,
359 | "id": "fabe944a",
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 6)\n",
367 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 6)\n",
368 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (3 of 6)\n",
369 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (4 of 6)\n",
370 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (5 of 6)\n",
371 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (6 of 6)\n",
372 | "PDBClean MolID Conversion Build Menu\n",
373 | " Select one of the following options to proceed:\n",
374 | " 1) Show full conversion\n",
375 | " 2) Show only unassigned conversions\n",
376 | " 3) Enter input file\n",
377 | " 4) Search MolID to add chain ID conversion\n",
378 | " 5) Go entry by entry to add chain ID conversion\n",
379 | " 6) Remove a chain ID conversion\n",
380 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
381 | " \n",
382 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
383 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
384 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
385 | "2:GLYCEROL:\n",
386 | "2:WATER:\n",
387 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
388 | "1:PHOSPHATE ION:\n",
389 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
390 | "You need to accept 8 entity conversions\n",
391 | "You need to accept 12 total chain conversions\n",
392 | "PDBClean MolID Conversion Build Menu\n",
393 | " Select one of the following options to proceed:\n",
394 | " 1) Show full conversion\n",
395 | " 2) Show only unassigned conversions\n",
396 | " 3) Enter input file\n",
397 | " 4) Search MolID to add chain ID conversion\n",
398 | " 5) Go entry by entry to add chain ID conversion\n",
399 | " 6) Remove a chain ID conversion\n",
400 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
401 | " \n",
402 | "Option Number: Enter chain IDs for each of the following MolID.\n",
403 | "Comma separated, no spaces\n",
404 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n",
405 | " conversion templates. You can proceed to the next section\n",
406 | " by selection option 7 or, continue to edit your conversion\n",
407 | " template through this menu\n",
408 | " \n",
409 | "PDBClean MolID Conversion Build Menu\n",
410 | " Select one of the following options to proceed:\n",
411 | " 1) Show full conversion\n",
412 | " 2) Show only unassigned conversions\n",
413 | " 3) Enter input file\n",
414 | " 4) Search MolID to add chain ID conversion\n",
415 | " 5) Go entry by entry to add chain ID conversion\n",
416 | " 6) Remove a chain ID conversion\n",
417 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
418 | " \n",
419 | " 7) Continue to next step of curation\n",
420 | "Option Number: 6) Finalize Curation\n",
421 | "Option Number: Finalizing Curation ...\n"
422 | ]
423 | }
424 | ],
425 | "source": [
426 | "! echo '2\\n5\\nA\\nB\\nC\\nD,E\\nF,G\\nH,I\\nJ\\nK,L\\n7\\n6\\n' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "id": "46092d65",
433 | "metadata": {},
434 | "outputs": [],
435 | "source": []
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "id": "be78b076",
441 | "metadata": {},
442 | "outputs": [],
443 | "source": []
444 | }
445 | ],
446 | "metadata": {
447 | "kernelspec": {
448 | "display_name": "PDBCleanV2",
449 | "language": "python",
450 | "name": "pdbcleanv2"
451 | },
452 | "language_info": {
453 | "codemirror_mode": {
454 | "name": "ipython",
455 | "version": 3
456 | },
457 | "file_extension": ".py",
458 | "mimetype": "text/x-python",
459 | "name": "python",
460 | "nbconvert_exporter": "python",
461 | "pygments_lexer": "ipython3",
462 | "version": "3.10.5"
463 | }
464 | },
465 | "nbformat": 4,
466 | "nbformat_minor": 5
467 | }
468 |
--------------------------------------------------------------------------------
/Notebooks/Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "fcc80ba7",
6 | "metadata": {},
7 | "source": [
   8 |     "# 2. Clean Structures and Create one CIF file per Biological Assembly"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "37b09ec8",
14 | "metadata": {},
15 | "source": [
16 | "## What is the goal of this notebook? \n",
17 | "\n",
18 | "This notebook achieves 2 goals:\n",
19 | "\n",
20 | "1. The first step 'cleans' the CIF files we downloaded in step 0. This step will remove some of the **data names** \n",
21 | "and **data blocks** included in the raw CIF files. A new directory 'clean_bank' is created in this step.\n",
22 | "\n",
23 | "\n",
  24 |     "2. The second step will create a new CIF file for each [biological assembly](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-BioUnit) present in any structure. Each new structure will be saved with the suffix +0x.cif (where x is the number of the biological assembly). This step also standardizes the **data names** and **data blocks**, in particular how the coordinate portion of the file is printed.\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "fb5c24aa",
30 | "metadata": {},
31 | "source": [
32 | "## Import library and create working directory"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "id": "90edccb0",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "from PDBClean import pdbclean_io, pdbutils, cleanutils"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "id": "101c2903",
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "# Path to project directory\n",
53 | "PROJDIR=\"./TIM/\""
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "id": "71d410ed",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
  63 |     "# Create directory where we will store the 'clean' cif files.\n",
64 | "pdbclean_io.check_project(projdir=PROJDIR, level='clean_bank')"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "f3503582",
70 | "metadata": {},
71 | "source": [
72 | "## Clean CIF files (standardize data blocks)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "id": "3dbc8c16",
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "[1/244]: 7rpn.cif\n",
86 | "[2/244]: 2v2d.cif\n",
87 | "[3/244]: 4poc.cif\n",
88 | "[4/244]: 5zfx.cif\n",
89 | "[5/244]: 1o5x.cif\n",
90 | "[6/244]: 4gnj.cif\n",
91 | "[7/244]: 1ydv.cif\n",
92 | "[8/244]: 4zvj.cif\n",
93 | "[9/244]: 4ff7.cif\n",
94 | "[10/244]: 7qon.cif\n",
95 | "[11/244]: 1klu.cif\n",
96 | "[12/244]: 3qsr.cif\n",
97 | "[13/244]: 4o54.cif\n",
98 | "[14/244]: 2x1s.cif\n",
99 | "[15/244]: 3py2.cif\n",
100 | "[16/244]: 2vfh.cif\n",
101 | "[17/244]: 1hg3.cif\n",
102 | "[18/244]: 4obt.cif\n",
103 | "[19/244]: 6up5.cif\n",
104 | "[20/244]: 7sx1.cif\n",
105 | "[21/244]: 4hhp.cif\n",
106 | "[22/244]: 4o57.cif\n",
107 | "[23/244]: 1nf0.cif\n",
108 | "[24/244]: 4iot.cif\n",
109 | "[25/244]: 5tim.cif\n",
110 | "[26/244]: 1ml1.cif\n",
111 | "[27/244]: 2vfi.cif\n",
112 | "[28/244]: 2x1r.cif\n",
113 | "[29/244]: 3gvg.cif\n",
114 | "[30/244]: 1m7p.cif\n",
115 | "[31/244]: 1aw2.cif\n",
116 | "[32/244]: 7pek.cif\n",
117 | "[33/244]: 4zz9.cif\n",
118 | "[34/244]: 4o4v.cif\n",
119 | "[35/244]: 4o53.cif\n",
120 | "[36/244]: 1ney.cif\n",
121 | "[37/244]: 6upf.cif\n",
122 | "[38/244]: 4mva.cif\n",
123 | "[39/244]: 2y63.cif\n",
124 | "[40/244]: 5i3k.cif\n",
125 | "[41/244]: 4jeq.cif\n",
126 | "[42/244]: 4owg.cif\n",
127 | "[43/244]: 3qst.cif\n",
128 | "[44/244]: 5i3j.cif\n",
129 | "[45/244]: 2y62.cif\n",
130 | "[46/244]: 7tim.cif\n",
131 | "[47/244]: 4o52.cif\n",
132 | "[48/244]: 6up1.cif\n",
133 | "[49/244]: 4o4w.cif\n",
134 | "[50/244]: 4pod.cif\n",
135 | "[51/244]: 1ssd.cif\n",
136 | "[52/244]: 4br1.cif\n",
137 | "[53/244]: 7pej.cif\n",
138 | "[54/244]: 2v2c.cif\n",
139 | "[55/244]: 2x16.cif\n",
140 | "[56/244]: 2x1u.cif\n",
141 | "[57/244]: 1aw1.cif\n",
142 | "[58/244]: 3kxq.cif\n",
143 | "[59/244]: 6oog.cif\n",
144 | "[60/244]: 4o50.cif\n",
145 | "[61/244]: 5bmx.cif\n",
146 | "[62/244]: 5i3h.cif\n",
147 | "[63/244]: 3m9y.cif\n",
148 | "[64/244]: 3ta6.cif\n",
149 | "[65/244]: 1klg.cif\n",
150 | "[66/244]: 5i3i.cif\n",
151 | "[67/244]: 2y61.cif\n",
152 | "[68/244]: 7t0q.cif\n",
153 | "[69/244]: 6nee.cif\n",
154 | "[70/244]: 7rgc.cif\n",
155 | "[71/244]: 1ssg.cif\n",
156 | "[72/244]: 2x1t.cif\n",
157 | "[73/244]: 2j27.cif\n",
158 | "[74/244]: 1vga.cif\n",
159 | "[75/244]: 2vxn.cif\n",
160 | "[76/244]: 1mss.cif\n",
161 | "[77/244]: 5ujw.cif\n",
162 | "[78/244]: 1b9b.cif\n",
163 | "[79/244]: 3tim.cif\n",
164 | "[80/244]: 4mkn.cif\n",
165 | "[81/244]: 2i9e.cif\n",
166 | "[82/244]: 6w4u.cif\n",
167 | "[83/244]: 2v5b.cif\n",
168 | "[84/244]: 1su5.cif\n",
169 | "[85/244]: 2j24.cif\n",
170 | "[86/244]: 3pf3.cif\n",
171 | "[87/244]: 5gzp.cif\n",
172 | "[88/244]: 2ypi.cif\n",
173 | "[89/244]: 5ibx.cif\n",
174 | "[90/244]: 7az3.cif\n",
175 | "[91/244]: 1btm.cif\n",
176 | "[92/244]: 1tph.cif\n",
177 | "[93/244]: 1ci1.cif\n",
178 | "[94/244]: 3psv.cif\n",
179 | "[95/244]: 4ywi.cif\n",
180 | "[96/244]: 1trd.cif\n",
181 | "[97/244]: 3uwz.cif\n",
182 | "[98/244]: 5vwn.cif\n",
183 | "[99/244]: 1iih.cif\n",
184 | "[100/244]: 7aza.cif\n",
185 | "[101/244]: 4ohq.cif\n",
186 | "[102/244]: 6nxy.cif\n",
187 | "[103/244]: 2ian.cif\n",
188 | "[104/244]: 3s6d.cif\n",
189 | "[105/244]: 4z0s.cif\n",
190 | "[106/244]: 6nxx.cif\n",
191 | "[107/244]: 3krs.cif\n",
192 | "[108/244]: 1tre.cif\n",
193 | "[109/244]: 3psw.cif\n",
194 | "[110/244]: 4yxg.cif\n",
195 | "[111/244]: 1tim.cif\n",
196 | "[112/244]: 4nvt.cif\n",
197 | "[113/244]: 3uwy.cif\n",
198 | "[114/244]: 1if2.cif\n",
199 | "[115/244]: 5gv4.cif\n",
200 | "[116/244]: 7az4.cif\n",
201 | "[117/244]: 6nlh.cif\n",
202 | "[118/244]: 2iam.cif\n",
203 | "[119/244]: 1lzo.cif\n",
204 | "[120/244]: 8tim.cif\n",
205 | "[121/244]: 2dp3.cif\n",
206 | "[122/244]: 7rmn.cif\n",
207 | "[123/244]: 1sw7.cif\n",
208 | "[124/244]: 3uwu.cif\n",
209 | "[125/244]: 1tpc.cif\n",
210 | "[126/244]: 1iig.cif\n",
211 | "[127/244]: 1yya.cif\n",
212 | "[128/244]: 3th6.cif\n",
213 | "[129/244]: 4tim.cif\n",
214 | "[130/244]: 5brb.cif\n",
215 | "[131/244]: 4z0j.cif\n",
216 | "[132/244]: 6nxw.cif\n",
217 | "[133/244]: 7az9.cif\n",
218 | "[134/244]: 1wob.cif\n",
219 | "[135/244]: 1lyx.cif\n",
220 | "[136/244]: 4y90.cif\n",
221 | "[137/244]: 1tpb.cif\n",
222 | "[138/244]: 1tpu.cif\n",
223 | "[139/244]: 6cg9.cif\n",
224 | "[140/244]: 2vom.cif\n",
225 | "[141/244]: 6jox.cif\n",
226 | "[142/244]: 1tpw.cif\n",
227 | "[143/244]: 3uwv.cif\n",
228 | "[144/244]: 7r9b.cif\n",
229 | "[145/244]: 7rcq.cif\n",
230 | "[146/244]: 4g1k.cif\n",
231 | "[147/244]: 5eyw.cif\n",
232 | "[148/244]: 1r2s.cif\n",
233 | "[149/244]: 1r2r.cif\n",
234 | "[150/244]: 5upr.cif\n",
235 | "[151/244]: 1woa.cif\n",
236 | "[152/244]: 6bve.cif\n",
237 | "[153/244]: 1ag1.cif\n",
238 | "[154/244]: 1tri.cif\n",
239 | "[155/244]: 1tpv.cif\n",
240 | "[156/244]: 3uww.cif\n",
241 | "[157/244]: 6c2g.cif\n",
242 | "[158/244]: 4unk.cif\n",
243 | "[159/244]: 6d43.cif\n",
244 | "[160/244]: 2v5l.cif\n",
245 | "[161/244]: 1sux.cif\n",
246 | "[162/244]: 1tpe.cif\n",
247 | "[163/244]: 1tsi.cif\n",
248 | "[164/244]: 4y9a.cif\n",
249 | "[165/244]: 1n55.cif\n",
250 | "[166/244]: 7qh0.cif\n",
251 | "[167/244]: 1wyi.cif\n",
252 | "[168/244]: 7abx.cif\n",
253 | "[169/244]: 6nxq.cif\n",
254 | "[170/244]: 4y96.cif\n",
255 | "[171/244]: 7r7m.cif\n",
256 | "[172/244]: 1sw0.cif\n",
257 | "[173/244]: 1tpd.cif\n",
258 | "[174/244]: 4unl.cif\n",
259 | "[175/244]: 6tim.cif\n",
260 | "[176/244]: 2oma.cif\n",
261 | "[177/244]: 1tpf.cif\n",
262 | "[178/244]: 2jgq.cif\n",
263 | "[179/244]: 4ymz.cif\n",
264 | "[180/244]: 4y8f.cif\n",
265 | "[181/244]: 6nxs.cif\n",
266 | "[182/244]: 1r2t.cif\n",
267 | "[183/244]: 6nxr.cif\n",
268 | "[184/244]: 7skj.cif\n",
269 | "[185/244]: 4x22.cif\n",
270 | "[186/244]: 2jk2.cif\n",
271 | "[187/244]: 1sw3.cif\n",
272 | "[188/244]: 1hti.cif\n",
273 | "[189/244]: 5csr.cif\n",
274 | "[190/244]: 4bi5.cif\n",
275 | "[191/244]: 2ven.cif\n",
276 | "[192/244]: 1spq.cif\n",
277 | "[193/244]: 5zg4.cif\n",
278 | "[194/244]: 5cg7.cif\n",
279 | "[195/244]: 5zg5.cif\n",
280 | "[196/244]: 1tti.cif\n",
281 | "[197/244]: 3ypi.cif\n",
282 | "[198/244]: 1dkw.cif\n",
283 | "[199/244]: 5css.cif\n",
284 | "[200/244]: 1m7o.cif\n",
285 | "[201/244]: 4k6a.cif\n",
286 | "[202/244]: 4bi6.cif\n",
287 | "[203/244]: 2vem.cif\n",
288 | "[204/244]: 5zga.cif\n",
289 | "[205/244]: 1sq7.cif\n",
290 | "[206/244]: 5bmw.cif\n",
291 | "[207/244]: 5i3g.cif\n",
292 | "[208/244]: 3tao.cif\n",
293 | "[209/244]: 5i3f.cif\n",
294 | "[210/244]: 6ooi.cif\n",
295 | "[211/244]: 1ttj.cif\n",
296 | "[212/244]: 2vel.cif\n",
297 | "[213/244]: 4e41.cif\n",
298 | "[214/244]: 7n8u.cif\n",
299 | "[215/244]: 4bi7.cif\n",
300 | "[216/244]: 1tmh.cif\n",
301 | "[217/244]: 1m6j.cif\n",
302 | "[218/244]: 1mo0.cif\n",
303 | "[219/244]: 2v0t.cif\n",
304 | "[220/244]: 2vfd.cif\n",
305 | "[221/244]: 6r8h.cif\n",
306 | "[222/244]: 1kv5.cif\n",
307 | "[223/244]: 6up8.cif\n",
308 | "[224/244]: 1i45.cif\n",
309 | "[225/244]: 7rde.cif\n",
310 | "[226/244]: 2vfe.cif\n",
311 | "[227/244]: 2vei.cif\n",
312 | "[228/244]: 3pvf.cif\n",
313 | "[229/244]: 2x2g.cif\n",
314 | "[230/244]: 2vek.cif\n",
315 | "[231/244]: 1tcd.cif\n",
316 | "[232/244]: 2vfg.cif\n",
317 | "[233/244]: 2v2h.cif\n",
318 | "[234/244]: 4wje.cif\n",
319 | "[235/244]: 1qds.cif\n",
320 | "[236/244]: 4p61.cif\n",
321 | "[237/244]: 1w0m.cif\n",
322 | "[238/244]: 2btm.cif\n",
323 | "[239/244]: 1ypi.cif\n",
324 | "[240/244]: 5bnk.cif\n",
325 | "[241/244]: 4rcx.cif\n",
326 | "[242/244]: 2vff.cif\n",
327 | "[243/244]: 3pwa.cif\n",
328 | "[244/244]: 2h6r.cif\n"
329 | ]
330 | }
331 | ],
332 | "source": [
333 | "cleanutils.process(projdir=PROJDIR, step='clean', source='raw_bank', target='clean_bank')"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "af89bbd9",
339 | "metadata": {},
340 | "source": [
341 | "## Simplify and Split into Biological Assemblies "
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 5,
347 | "id": "aa8acf0b",
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "# Create directory to store new structures\n",
352 | "pdbclean_io.check_project(projdir=PROJDIR, level='simple_bank')"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 6,
358 | "id": "d481cd0f",
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "[1/244]: 7rpn.cif\n",
366 | "[2/244]: 2v2d.cif\n",
367 | "[3/244]: 4poc.cif\n",
368 | "[4/244]: 5zfx.cif\n",
369 | "[5/244]: 1o5x.cif\n",
370 | "[6/244]: 4gnj.cif\n",
371 | "[7/244]: 1ydv.cif\n",
372 | "[8/244]: 4zvj.cif\n",
373 | "[9/244]: 4ff7.cif\n",
374 | "[10/244]: 7qon.cif\n",
375 | "[11/244]: 1klu.cif\n",
376 | "[12/244]: 3qsr.cif\n",
377 | "[13/244]: 4o54.cif\n",
378 | "[14/244]: 2x1s.cif\n",
379 | "[15/244]: 3py2.cif\n",
380 | "[16/244]: 2vfh.cif\n",
381 | "[17/244]: 1hg3.cif\n",
382 | "[18/244]: 4obt.cif\n",
383 | "[19/244]: 6up5.cif\n",
384 | "[20/244]: 7sx1.cif\n",
385 | "[21/244]: 4hhp.cif\n",
386 | "[22/244]: 4o57.cif\n",
387 | "[23/244]: 1nf0.cif\n",
388 | "[24/244]: 4iot.cif\n",
389 | "[25/244]: 5tim.cif\n",
390 | "[26/244]: 1ml1.cif\n",
391 | "[27/244]: 2vfi.cif\n",
392 | "[28/244]: 2x1r.cif\n",
393 | "[29/244]: 3gvg.cif\n",
394 | "[30/244]: 1m7p.cif\n",
395 | "[31/244]: 1aw2.cif\n",
396 | "[32/244]: 7pek.cif\n",
397 | "[33/244]: 4zz9.cif\n",
398 | "[34/244]: 4o4v.cif\n",
399 | "[35/244]: 4o53.cif\n",
400 | "[36/244]: 1ney.cif\n",
401 | "[37/244]: 6upf.cif\n",
402 | "[38/244]: 4mva.cif\n",
403 | "[39/244]: 2y63.cif\n",
404 | "[40/244]: 5i3k.cif\n",
405 | "[41/244]: 4jeq.cif\n",
406 | "[42/244]: 4owg.cif\n",
407 | "[43/244]: 3qst.cif\n",
408 | "[44/244]: 5i3j.cif\n",
409 | "[45/244]: 2y62.cif\n",
410 | "[46/244]: 7tim.cif\n",
411 | "[47/244]: 4o52.cif\n",
412 | "[48/244]: 6up1.cif\n",
413 | "[49/244]: 4o4w.cif\n",
414 | "[50/244]: 4pod.cif\n",
415 | "[51/244]: 1ssd.cif\n",
416 | "[52/244]: 4br1.cif\n",
417 | "[53/244]: 7pej.cif\n",
418 | "[54/244]: 2v2c.cif\n",
419 | "[55/244]: 2x16.cif\n",
420 | "[56/244]: 2x1u.cif\n",
421 | "[57/244]: 1aw1.cif\n",
422 | "[58/244]: 3kxq.cif\n",
423 | "[59/244]: 6oog.cif\n",
424 | "[60/244]: 4o50.cif\n",
425 | "[61/244]: 5bmx.cif\n",
426 | "[62/244]: 5i3h.cif\n",
427 | "[63/244]: 3m9y.cif\n",
428 | "[64/244]: 3ta6.cif\n",
429 | "[65/244]: 1klg.cif\n",
430 | "[66/244]: 5i3i.cif\n",
431 | "[67/244]: 2y61.cif\n",
432 | "[68/244]: 7t0q.cif\n",
433 | "[69/244]: 6nee.cif\n",
434 | "[70/244]: 7rgc.cif\n",
435 | "[71/244]: 1ssg.cif\n",
436 | "[72/244]: 2x1t.cif\n",
437 | "[73/244]: 2j27.cif\n",
438 | "[74/244]: 1vga.cif\n",
439 | "[75/244]: 2vxn.cif\n",
440 | "[76/244]: 1mss.cif\n",
441 | "[77/244]: 5ujw.cif\n",
442 | "[78/244]: 1b9b.cif\n",
443 | "[79/244]: 3tim.cif\n",
444 | "[80/244]: 4mkn.cif\n",
445 | "[81/244]: 2i9e.cif\n",
446 | "[82/244]: 6w4u.cif\n",
447 | "[83/244]: 2v5b.cif\n",
448 | "[84/244]: 1su5.cif\n",
449 | "[85/244]: 2j24.cif\n",
450 | "[86/244]: 3pf3.cif\n",
451 | "[87/244]: 5gzp.cif\n",
452 | "[88/244]: 2ypi.cif\n",
453 | "[89/244]: 5ibx.cif\n",
454 | "[90/244]: 7az3.cif\n",
455 | "[91/244]: 1btm.cif\n",
456 | "[92/244]: 1tph.cif\n",
457 | "[93/244]: 1ci1.cif\n",
458 | "[94/244]: 3psv.cif\n",
459 | "[95/244]: 4ywi.cif\n",
460 | "[96/244]: 1trd.cif\n",
461 | "[97/244]: 3uwz.cif\n",
462 | "[98/244]: 5vwn.cif\n",
463 | "[99/244]: 1iih.cif\n",
464 | "[100/244]: 7aza.cif\n",
465 | "[101/244]: 4ohq.cif\n",
466 | "[102/244]: 6nxy.cif\n",
467 | "[103/244]: 2ian.cif\n",
468 | "[104/244]: 3s6d.cif\n",
469 | "[105/244]: 4z0s.cif\n",
470 | "[106/244]: 6nxx.cif\n",
471 | "[107/244]: 3krs.cif\n",
472 | "[108/244]: 1tre.cif\n",
473 | "[109/244]: 3psw.cif\n",
474 | "[110/244]: 4yxg.cif\n",
475 | "[111/244]: 1tim.cif\n",
476 | "[112/244]: 4nvt.cif\n",
477 | "[113/244]: 3uwy.cif\n",
478 | "[114/244]: 1if2.cif\n",
479 | "[115/244]: 5gv4.cif\n",
480 | "[116/244]: 7az4.cif\n",
481 | "[117/244]: 6nlh.cif\n",
482 | "[118/244]: 2iam.cif\n",
483 | "[119/244]: 1lzo.cif\n",
484 | "[120/244]: 8tim.cif\n",
485 | "[121/244]: 2dp3.cif\n",
486 | "[122/244]: 7rmn.cif\n",
487 | "[123/244]: 1sw7.cif\n",
488 | "[124/244]: 3uwu.cif\n",
489 | "[125/244]: 1tpc.cif\n",
490 | "[126/244]: 1iig.cif\n",
491 | "[127/244]: 1yya.cif\n",
492 | "[128/244]: 3th6.cif\n",
493 | "[129/244]: 4tim.cif\n",
494 | "[130/244]: 5brb.cif\n",
495 | "[131/244]: 4z0j.cif\n",
496 | "[132/244]: 6nxw.cif\n",
497 | "[133/244]: 7az9.cif\n",
498 | "[134/244]: 1wob.cif\n",
499 | "[135/244]: 1lyx.cif\n",
500 | "[136/244]: 4y90.cif\n",
501 | "[137/244]: 1tpb.cif\n",
502 | "[138/244]: 1tpu.cif\n",
503 | "[139/244]: 6cg9.cif\n",
504 | "[140/244]: 2vom.cif\n",
505 | "[141/244]: 6jox.cif\n",
506 | "[142/244]: 1tpw.cif\n",
507 | "[143/244]: 3uwv.cif\n",
508 | "[144/244]: 7r9b.cif\n",
509 | "[145/244]: 7rcq.cif\n",
510 | "[146/244]: 4g1k.cif\n",
511 | "[147/244]: 5eyw.cif\n",
512 | "[148/244]: 1r2s.cif\n",
513 | "[149/244]: 1r2r.cif\n",
514 | "[150/244]: 5upr.cif\n",
515 | "[151/244]: 1woa.cif\n",
516 | "[152/244]: 6bve.cif\n",
517 | "[153/244]: 1ag1.cif\n",
518 | "[154/244]: 1tri.cif\n",
519 | "[155/244]: 1tpv.cif\n",
520 | "[156/244]: 3uww.cif\n",
521 | "[157/244]: 6c2g.cif\n",
522 | "[158/244]: 4unk.cif\n",
523 | "[159/244]: 6d43.cif\n",
524 | "[160/244]: 2v5l.cif\n",
525 | "[161/244]: 1sux.cif\n",
526 | "[162/244]: 1tpe.cif\n",
527 | "[163/244]: 1tsi.cif\n",
528 | "[164/244]: 4y9a.cif\n",
529 | "[165/244]: 1n55.cif\n",
530 | "[166/244]: 7qh0.cif\n",
531 | "[167/244]: 1wyi.cif\n",
532 | "[168/244]: 7abx.cif\n",
533 | "[169/244]: 6nxq.cif\n",
534 | "[170/244]: 4y96.cif\n",
535 | "[171/244]: 7r7m.cif\n",
536 | "[172/244]: 1sw0.cif\n",
537 | "[173/244]: 1tpd.cif\n",
538 | "[174/244]: 4unl.cif\n",
539 | "[175/244]: 6tim.cif\n",
540 | "[176/244]: 2oma.cif\n",
541 | "[177/244]: 1tpf.cif\n",
542 | "[178/244]: 2jgq.cif\n",
543 | "[179/244]: 4ymz.cif\n",
544 | "[180/244]: 4y8f.cif\n",
545 | "[181/244]: 6nxs.cif\n",
546 | "[182/244]: 1r2t.cif\n",
547 | "[183/244]: 6nxr.cif\n",
548 | "[184/244]: 7skj.cif\n",
549 | "[185/244]: 4x22.cif\n",
550 | "[186/244]: 2jk2.cif\n",
551 | "[187/244]: 1sw3.cif\n",
552 | "[188/244]: 1hti.cif\n",
553 | "[189/244]: 5csr.cif\n",
554 | "[190/244]: 4bi5.cif\n",
555 | "[191/244]: 2ven.cif\n",
556 | "[192/244]: 1spq.cif\n",
557 | "[193/244]: 5zg4.cif\n",
558 | "[194/244]: 5cg7.cif\n",
559 | "[195/244]: 5zg5.cif\n",
560 | "[196/244]: 1tti.cif\n",
561 | "[197/244]: 3ypi.cif\n",
562 | "[198/244]: 1dkw.cif\n",
563 | "[199/244]: 5css.cif\n",
564 | "[200/244]: 1m7o.cif\n",
565 | "[201/244]: 4k6a.cif\n",
566 | "[202/244]: 4bi6.cif\n",
567 | "[203/244]: 2vem.cif\n",
568 | "[204/244]: 5zga.cif\n",
569 | "[205/244]: 1sq7.cif\n",
570 | "[206/244]: 5bmw.cif\n",
571 | "[207/244]: 5i3g.cif\n",
572 | "[208/244]: 3tao.cif\n",
573 | "[209/244]: 5i3f.cif\n",
574 | "[210/244]: 6ooi.cif\n",
575 | "[211/244]: 1ttj.cif\n",
576 | "[212/244]: 2vel.cif\n",
577 | "[213/244]: 4e41.cif\n",
578 | "[214/244]: 7n8u.cif\n",
579 | "[215/244]: 4bi7.cif\n",
580 | "[216/244]: 1tmh.cif\n",
581 | "[217/244]: 1m6j.cif\n",
582 | "[218/244]: 1mo0.cif\n",
583 | "[219/244]: 2v0t.cif\n",
584 | "[220/244]: 2vfd.cif\n",
585 | "[221/244]: 6r8h.cif\n",
586 | "[222/244]: 1kv5.cif\n",
587 | "[223/244]: 6up8.cif\n",
588 | "[224/244]: 1i45.cif\n",
589 | "[225/244]: 7rde.cif\n",
590 | "[226/244]: 2vfe.cif\n",
591 | "[227/244]: 2vei.cif\n",
592 | "[228/244]: 3pvf.cif\n",
593 | "[229/244]: 2x2g.cif\n",
594 | "[230/244]: 2vek.cif\n",
595 | "[231/244]: 1tcd.cif\n",
596 | "[232/244]: 2vfg.cif\n",
597 | "[233/244]: 2v2h.cif\n",
598 | "[234/244]: 4wje.cif\n",
599 | "[235/244]: 1qds.cif\n",
600 | "[236/244]: 4p61.cif\n",
601 | "[237/244]: 1w0m.cif\n",
602 | "[238/244]: 2btm.cif\n",
603 | "[239/244]: 1ypi.cif\n",
604 | "[240/244]: 5bnk.cif\n",
605 | "[241/244]: 4rcx.cif\n",
606 | "[242/244]: 2vff.cif\n",
607 | "[243/244]: 3pwa.cif\n",
608 | "[244/244]: 2h6r.cif\n"
609 | ]
610 | }
611 | ],
612 | "source": [
613 | "cleanutils.process(projdir=PROJDIR, step='simplify', source='clean_bank', target='simple_bank')"
614 | ]
615 | },
616 | {
617 | "cell_type": "markdown",
618 | "id": "6c5b8e0d",
619 | "metadata": {},
620 | "source": []
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "id": "756e588d",
626 | "metadata": {},
627 | "outputs": [],
628 | "source": []
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": null,
633 | "id": "13d466a2",
634 | "metadata": {},
635 | "outputs": [],
636 | "source": []
637 | }
638 | ],
639 | "metadata": {
640 | "kernelspec": {
641 | "display_name": "PDBCleanV2",
642 | "language": "python",
643 | "name": "pdbcleanv2"
644 | },
645 | "language_info": {
646 | "codemirror_mode": {
647 | "name": "ipython",
648 | "version": 3
649 | },
650 | "file_extension": ".py",
651 | "mimetype": "text/x-python",
652 | "name": "python",
653 | "nbconvert_exporter": "python",
654 | "pygments_lexer": "ipython3",
655 | "version": "3.10.5"
656 | }
657 | },
658 | "nbformat": 4,
659 | "nbformat_minor": 5
660 | }
661 |
--------------------------------------------------------------------------------
/src/pdbcleanresiduestandardizationutils.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 | from Bio.PDB.MMCIFParser import FastMMCIFParser
4 | from PDBClean.alignmentutils import *
5 | from PDBClean.listutils import *
6 | #
7 |
8 | ####################
9 | # INITIALIZE STEPS #
10 | ####################
11 |
def pdb_to_structurelists(filelist):
    """
    Parse each CIF file in *filelist* and collect per-chain sequence and
    residue-numbering information.

    Parameters:
    -----------
    filelist : list of str
        list of file paths for all '.cif' files in specified directory

    Returns:
    --------
    Structure_Sequences : dict
        Maps '<structure id>_<chain id>' to that chain's one-letter sequence.
    ChID_ResiNum_Vector : list of dict
        One dict per structure, mapping each chain ID to the list of its
        residue identifiers ('<residue number><insertion code>').
    structid_list : list of str
        List of unique structure identifiers for each CIF. Format is 'input directory / CIF'
    chid_list : list of str
        Sorted list of all distinct chain IDs seen across the CIF files.
    """
    # Structure_Sequences is the master map from '<structid>_<chainid>' keys to sequences
    Structure_Sequences = {}
    ChID_ResiNum_Vector = []
    chid_list = []
    structid_list = []

    for N, my_file in enumerate(filelist, start=1):
        print("Reading:" + ' ' + my_file + " (" + str(N) + " of " + str(len(filelist)) + ")")
        # The file path doubles as the structure ID.
        struct = FastMMCIFParser(auth_residues=False, QUIET=1).get_structure(str(my_file), my_file)
        structid_list.append(struct.get_id())
        chid_resinum_map = {}
        # Only written for structures with only one model in them
        for chain in struct[0]:
            if chain.get_id() not in chid_resinum_map:
                # NOTE(review): residues with pdbx_PDB_ins_code != '?' may need
                # special handling here (see original FAPA comment) — confirm.
                chid_resinum_map[chain.get_id()] = []
            key = str(struct.get_id()) + "_" + str(chain.get_id())
            seq = ""
            for residue in chain:
                # Keep both the residue number and the insertion-code "letter"
                # (pdbx_PDB_ins_code) so duplicated numbers remain distinct.
                chid_resinum_map[chain.get_id()].append(str(residue.get_id()[1]) + str(residue.get_id()[2]))
                seq += ResnConvert(residue.get_resname())
            Structure_Sequences[key] = seq
            # chid_list is a master list of all chain IDs used
            chid_list.append(chain.get_id())
        ChID_ResiNum_Vector.append(chid_resinum_map)
    # Deduplicate and sort the master chain-ID list.
    chid_list = sorted(set(chid_list))
    return Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list
72 |
73 | #########################################
74 | # INTERACTIVE STANDARDIZATION FUNCTIONS #
75 | #########################################
76 |
def perform_multiple_alignment(Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list, check):
    """
    Interactive user interface for performing multiple alignments.

    Presents a submenu; option 4 runs a multiple sequence alignment per chain ID
    (via AlignSequences_v4) and builds a residue-renumbering template based on a
    user-supplied column-occupancy threshold.

    Parameters:
    -----------
    Structure_Sequences : dict
        Contains dictionary where chain ID is mapped to their sequence for each structure.
    ChID_ResiNum_Vector : list of dict
        Each dictionary maps the chain ID to their residue numbers for a structure
    structid_list : list of str
        List of unique structure identifiers for each CIF. Format is 'input directory / CIF'
    chid_list : list of str
        A list containing all the chain IDs from CIF(s)
    check : str
        Option chosen by user which opens the submenu

    Returns:
    --------
    Structure_Sequences_Aligned : dict
        A dictionary where each key is a combination of structure identifier and chain ID,
        and the value is the aligned sequence for that chain.
    Structure_ConversionTemplate : dict
        A dictionary mapping each structure identifier to a conversion template,
        which contains mappings of residue numbers from the original sequence to the aligned sequence.
    chid_list : list of str
        Updated list of chain IDs where some may have been removed based on the user's options
    check : str
        Updated string representing the state of the main menu, set to '1' to indicate a state change.
    """
    Structure_Sequences_Aligned = {}
    Structure_ConversionTemplate = {}
    # Per-key gap percentage per alignment column (filled under option 4).
    Structure_Sequences_GAPS = {}
    input_submenu = ""
    while(input_submenu != "QUIT"):
        print(" Perform multiple alignments to identify residues",
              " 1) Show list of chains to be standardized",
              " 2) Remove chain IDs from list of chains to be standardized",
              " 3) Input file of chain IDs to remove from list of chains to be standardized",
              " 4) Perform multiple alignments",
              sep="\n")
        input_submenu = input('Option Number: ')
        if (input_submenu == "1"):
            show_list(chid_list)
        elif (input_submenu == "2"):
            chid_list = remove_user_defined_chain_from_list(chid_list)
        elif (input_submenu == "3"):
            chid_list = remove_file_defined_chain_from_list(chid_list)
        elif (input_submenu == "4"):
            print(" Choose occupancy threshold for residue renumbering",
                  " Input an integer number between 1 and 100",
                  sep="\n")
            user_gap = input('Occupancy threshold: ')
            # user_gap is the minimum column occupancy (%) for a column to get
            # its own plain residue number; raises ValueError on non-integer input.
            user_gap = int(user_gap)

            # For each chain ID, align the sequences of every structure that has
            # that chain, then record the aligned sequence and gap percentages.
            for chid in chid_list:
                this_chainsseq_list = []
                this_chainsseq_list_ids = [] #FAPA
                # NOTE(review): this_chainsseq_aligned_list is initialized but
                # never used below — candidate for removal.
                this_chainsseq_aligned_list = []
                for I in range(len(structid_list)):
                    key = str(structid_list[I]) + "_" + chid
                    if key in Structure_Sequences:
                        this_chainsseq_list.append(Structure_Sequences[key])
                        this_chainsseq_list_ids.append(structid_list[I]) # FAPA
                #this_chainsseq_aligned_list_map = AlignSequences_v2(this_chainsseq_list, chid,this_chainsseq_list_ids )
                #this_chainsseq_aligned_list_map = AlignSequences_v3(this_chainsseq_list, chid, this_chainsseq_list_ids) #FAPA MAY2024
                this_chainsseq_aligned_list_map, this_chainseq_gap_percentages = AlignSequences_v4(this_chainsseq_list, chid,
                                                                                                   this_chainsseq_list_ids) # FAPA JULY2024
                i = 0
                for I in range(len(structid_list)):
                    key = str(structid_list[I]) + "_" + chid
                    if key in Structure_Sequences:
                        #Structure_Sequences_Aligned[key] = this_chainsseq_aligned_list[i]
                        Structure_Sequences_Aligned[key] = this_chainsseq_aligned_list_map[str(structid_list[I])]
                        # All keys for this chain ID share the same per-column gap percentages.
                        Structure_Sequences_GAPS[key] = this_chainseq_gap_percentages #FAPA
                        i += 1

            #THIS IS THE VERSION THAT WORKS, COMMENTED SO WE TRY SOMETHING NEW
            #for I in range(len(structid_list)):
            #    conversion_template = {}
            #    for chain in ChID_ResiNum_Vector[I]:
            #        resinum_aligned_list = []
            #        key = str(structid_list[I]) + "_" + str(chain)
            #        if key in Structure_Sequences_Aligned:
            #            seq = Structure_Sequences_Aligned[key]
            #            i = 0
            #            for resn in seq:
            #                i += 1
            #                if (resn != "-"):
            #                    resinum_aligned_list.append(i)
            #            i = 0
            #            for residue in ChID_ResiNum_Vector[I][chain]:
            #                key2 = chain + "_" + str(residue)
            #                conversion_template[key2] = resinum_aligned_list[i]
            #                i += 1
            #    Structure_ConversionTemplate[structid_list[I]] = conversion_template

            # MY TEST STARTS HERE, WITH VARIATIONS OF THE CODE ABOVE

            # Build the old->new residue-ID conversion template per structure.
            for I in range(len(structid_list)):
                conversion_template = {}
                for chain in ChID_ResiNum_Vector[I]:
                    #print(ChID_ResiNum_Vector[I])
                    #residue_numbers_users = [residue.get_id()[1] for residue in chain.get_residues()]
                    #print(residue_numbers_users)
                    resinum_aligned_list = []
                    key = str(structid_list[I]) + "_" + str(chain)
                    if key in Structure_Sequences_Aligned:
                        seq = Structure_Sequences_Aligned[key]
                        gaps = Structure_Sequences_GAPS[key]
                        i = 0
                        counter=1
                        new_res_num=[]
                        freq_tracker=1
                        gap_tracker=0
                        # Low-occupancy alignment columns do not get their own number;
                        # instead they reuse the previous number plus a letter suffix
                        # (an insertion-code-style label) taken from this list.
                        # NOTE(review): more than 52 consecutive low-occupancy columns
                        # would raise IndexError on gap_letter[gap_tracker] — confirm
                        # this cannot happen with real alignments.
                        gap_letter=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
                        # Assign a new residue ID to every alignment column:
                        # occupied columns get sequential integers; gappy columns
                        # get '<previous number> <letter>'.
                        for freq in gaps:
                            #freq_tracker = 1
                            #gap_tracker = 0
                            #print(freq)
                            if freq < 100-user_gap: # accepted gap percentage based on user defined occupancy threshold
                                new_res_num.append(counter)
                                counter += 1
                                freq_tracker=freq
                                gap_tracker=0
                                #print("hello")
                            else:
                                #print(gap_tracker)
                                #new_res_num.append(str(counter)+"_"+str(gap_tracker))
                                new_res_num.append(str(counter-1) +" "+str(gap_letter[gap_tracker]))
                                gap_tracker+=1
                                freq_tracker=freq

                        #print(new_res_num)
                        #print(len(new_res_num))
                        #print(len(seq))

                        # Keep the new IDs only for columns where this sequence
                        # actually has a residue (skip '-' gap positions).
                        for resn in seq:
                            #print(new_res_num[i])
                            if (resn != "-"):
                                resinum_aligned_list.append(new_res_num[i])
                            i += 1

                        #print("resinum_aligned_list")
                        #print(len(resinum_aligned_list))
                        #print(resinum_aligned_list)

                        #i = 0

                        #print("THIS IS THE CHAIN NUMBER TEST")
                        #print(ChID_ResiNum_Vector[I][chain]) #FAPA

                        # Pair each original residue ID (by position) with its new
                        # ID; keys are '<chain>_<old residue id>'.
                        for residue in range(len(resinum_aligned_list)):
                            #i +=1
                            #print("this is the value of i:", i)
                            #print("this is resimnum_aligned_list[i]:", resinum_aligned_list[residue] )
                            #print("this is the value of residue:", residue)
                            #print("this is what is in the structure:",ChID_ResiNum_Vector[I][chain][residue])

                            key2 = chain + "_" + str(ChID_ResiNum_Vector[I][chain][residue])
                            #print("key 2 is:",key2)
                            #print(key2) #FAPA
                            #print(resinum_aligned_list[i]) #FAPA
                            conversion_template[key2] = resinum_aligned_list[residue]
                            # NOTE(review): i is incremented here but no longer read
                            # after the loop above — looks vestigial.
                            i += 1
                        #print(conversion_template)



                        #for residue in ChID_ResiNum_Vector[I][chain]:
                        #    #i +=1
                        #    print("this is the value of i:", i)
                        #    print("this is resimnum_aligned_list[i]:", resinum_aligned_list[i] )
                        #    print("this is the value of residue:", residue)
                        #    if i == len(resinum_aligned_list)-1:
                        #        break
                        #    elif resinum_aligned_list[i] in ChID_ResiNum_Vector[I][chain]:
                        #        key2 = chain + "_" + str(residue)
                        #        #print(key2) #FAPA
                        #        #print(resinum_aligned_list[i]) #FAPA
                        #        conversion_template[key2] = resinum_aligned_list[i]
                        #        i += 1
                        #print(conversion_template)

                Structure_ConversionTemplate[structid_list[I]] = conversion_template




            # Signal the caller that alignment was performed and leave the submenu.
            check = "1"
            input_submenu = "QUIT"
    return Structure_Sequences_Aligned, Structure_ConversionTemplate, chid_list, check
270 |
def show_conversiontemplate(Structure_ConversionTemplate):
    """
    Print every conversion template to the screen, one structure at a time.

    Parameters:
    -----------
    Structure_ConversionTemplate : dict
        A dictionary mapping each structure identifier to a conversion template,
        which contains mappings of residue numbers from the original sequence to the aligned sequence.

    Returns:
    --------
    None
    """
    for structid, template in Structure_ConversionTemplate.items():
        print(structid)
        for old_id, new_id in template.items():
            print(old_id + ":" + str(new_id))
290 |
291 | ## FAPA
def write_and_show_conversiontemplate(Structure_ConversionTemplate, target_dir, write_csv=True):
    """
    Write a mapping of old residue IDs to new residue IDs for each structure.

    The mapping is saved as 'OldResID_NewResID_Map.csv' in the target directory,
    with one 'OldResID:NewResId:File' row per residue.

    Parameters:
    -----------
    Structure_ConversionTemplate : dict
        Maps each structure identifier to a conversion template, i.e. a dict of
        old residue IDs to the renumbered (aligned) residue IDs.
    target_dir : str
        Directory where the new files will be saved
    write_csv : bool, optional
        Writes the mapping to a CSV file named 'OldResID_NewResID_Map.csv' if True. Default is 'True'.

    Returns:
    --------
    None
    """
    if not write_csv:
        return
    # Open the file once and stream all rows, instead of re-opening it in
    # append mode for every single residue.
    with open(f'{target_dir}/OldResID_NewResID_Map.csv', 'w') as fout:
        fout.write('OldResID:NewResId:File\n')
        for structid in Structure_ConversionTemplate:
            # Report only the file name, not the full input path.
            structid_for_print = structid.split("/")[-1]
            for key in Structure_ConversionTemplate[structid]:
                fout.write(f'{key}:{str(Structure_ConversionTemplate[structid][key])}:{structid_for_print}\n')
327 |
328 | # FAPA
329 |
330 | #################
331 | # FINALIZE STEP #
332 | #################
333 |
def conversiontemplate_to_pdb(filelist, Structure_ConversionTemplate, target_dir=None):
    """
    Saves the conversion template into re-written CIF(s) which are placed into the target directory

    Parameters:
    -----------
    filelist : list of str
        list of file paths for all '.cif' files in specified directory
    Structure_ConversionTemplate : dict
        A dictionary mapping each structure identifier to a conversion template,
        which contains mappings of residue numbers from the original sequence to the aligned sequence.
    target_dir : str, optional
        Directory where the new files will be saved. If none, no files will be saved.

    Returns:
    --------
    None
    """
    # Honor the documented contract: no target directory, no output
    # (previously this would have raised a TypeError on path concatenation).
    if target_dir is None:
        return
    for my_files in filelist:
        newciffilename = target_dir + '/' + my_files.split('/')[-1]
        with open(my_files) as myfile:
            with open(newciffilename, 'w') as newciffile:
                # Templates are keyed by the input file path (see pdb_to_structurelists).
                conversion_template = Structure_ConversionTemplate[myfile.name]
                for line in myfile:
                    if (line[0:4] == "ATOM") or (line[0:6] == "HETATM"):
                        line_split = line.split()
                        # Lookup key is '<chain id>_<residue number>' (fields 17 and 15).
                        key = line_split[17] + "_" + str(line_split[15])
                        if key in conversion_template:
                            # Rewrite field 15 with the new residue number and
                            # re-emit the first 20 fields space-separated.
                            fields = line_split[:15] + [str(conversion_template[key])] + line_split[16:20]
                            newciffile.write(" ".join(fields) + "\n")
                        else:
                            # Chains outside map should not exist but just in case
                            newciffile.write(line)
                    else:
                        newciffile.write(line)
371 |
372 | # FAPA MAY 2024 TEST STARTS
373 |
def conversiontemplate_to_pdb_FAPA(filelist, Structure_ConversionTemplate, target_dir=None):
    """
    Saves the conversion template into re-written CIF(s) which are placed into the target directory.
    This function considers cases where a residue number also includes a letter
    (a pdbx_PDB_ins_code-style suffix produced by the renumbering step).

    Parameters:
    -----------
    filelist : list of str
        list of file paths for all '.cif' files in specified directory
    Structure_ConversionTemplate : dict
        A dictionary mapping each structure identifier to a conversion template,
        which contains mappings of residue numbers from the original sequence to the aligned sequence.
    target_dir : str, optional
        Directory where the new files will be saved. If none, no files will be saved.

    Returns:
    --------
    None
    """
    for my_files in filelist:
        newciffilename=target_dir+'/'+my_files.split('/')[-1]
        with open(my_files) as myfile:
            with open(newciffilename, 'w') as newciffile:
                # Templates are keyed by the input file path (see pdb_to_structurelists).
                conversion_template = Structure_ConversionTemplate[myfile.name]
                #print(conversion_template)


                for line in myfile:
                    # NOTE(review): resnum is reset every iteration and only
                    # referenced in commented-out code — looks vestigial.
                    resnum=1
                    #old_line_resnum=0
                    if (line[0:4] == "ATOM") or (line[0:6] == "HETATM"):
                        # Chains outside map should not exist but just in case
                        #line_split = line.strip()
                        line_split = line.split()
                        #print(line_split[8])
                        #new_line_resnum=int(line_split[8])

                        #if new_line_resnum == start_line_resnum:

                        # We need to consider the value of pdbx_PDB_ins_code, in column 9
                        # This is considered in the key
                        # and original value will be overwritten with '?'
                        # in next version, we will add the letter value to column 9

                        # Key is '<chain>_<residue number>' plus either a trailing
                        # space (no insertion code) or the insertion-code letter,
                        # matching the keys built in perform_multiple_alignment.
                        if str(line_split[9]) == '?':
                            key = line_split[6] + "_" + str(line_split[8]) + " " # FAPA: WE WANT CHAINID_RESID TO BE THE KEY
                        else:
                            key = line_split[6] + "_" + str(line_split[8]) + str(line_split[9])


                        #key = line_split[6] + "_" + str(line_split[8]) #FAPA: WE WANT CHAINID_RESID TO BE THE KEY
                        #key = line_split[6] + "_" + str(resnum) # FAPA: WE WANT CHAINID_RESID TO BE THE KEY

                        #print(len(line_split))



                        if key in conversion_template:
                            #print(key, conversion_template[key])
                            # Two line layouts are handled: 18-field and 20-field
                            # atom records (presumably with/without trailing
                            # auth_* columns — TODO confirm against the CIF writer).
                            if len(line_split) == 18:
                                # New ID without a letter suffix: write the new
                                # number in columns 8 and 15, reset column 9 to '?'.
                                if len(str(conversion_template[key]).split()) < 2:
                                    #print(conversion_template[key])
                                    newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + line_split[6] + " " + \
                                              line_split[7] + " " + str(conversion_template[key]) + " " + "?" + " " + line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + line_split[13] + \
                                              " " + line_split[14] + " " + str(conversion_template[key]) + " " + line_split[16] + " " + line_split[17] + " " + "\n"
                                    newciffile.write(newline)
                                else:
                                    # New ID carries a letter ('<number> <letter>'):
                                    # number goes to columns 8/15, letter to column 9.
                                    #print("i am here: "+ str(conversion_template[key].split()[1]))
                                    newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \
                                              line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \
                                              line_split[6] + " " + \
                                              line_split[7] + " " + str(conversion_template[key].split()[0]) + " " + \
                                              str(conversion_template[key].split()[1])+ " " + \
                                              line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \
                                              line_split[13] + \
                                              " " + line_split[14] + " " + str(conversion_template[key].split()[0]) + " " + \
                                              line_split[16] + " " + line_split[17] + " " + "\n"
                                    newciffile.write(newline)
                            else:
                                # 20-field layout: same rewrites, but keep the two
                                # extra trailing columns.
                                if len(str(conversion_template[key]).split()) < 2:
                                    #print(conversion_template[key])
                                    newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + line_split[6] + " " + \
                                              line_split[7] + " " + str(conversion_template[key]) + " " + "?" + " " + line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + line_split[13] + \
                                              " " + line_split[14] + " " + str(conversion_template[key]) + " " + line_split[16] + " " + line_split[17] + " " + line_split[18] + " " + line_split[19] + "\n"
                                    newciffile.write(newline)
                                else:
                                    newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \
                                              line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \
                                              line_split[6] + " " + \
                                              line_split[7] + " " + str(conversion_template[key].split()[0]) + " " + \
                                              str(conversion_template[key].split()[1]) + " " + \
                                              line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \
                                              line_split[13] + \
                                              " " + line_split[14] + " " + str(conversion_template[key].split()[0]) + " " + \
                                              line_split[16] + " " + line_split[17] + " " + line_split[18] + " " + line_split[19] + "\n"
                                    newciffile.write(newline)
                        else:
                            # Atom record whose key is not in the template.
                            if len(line_split) == 18:
                                newciffile.write(line)
                            else:
                                # When column 8 is '.', copy column 15 into it so the
                                # record stays self-consistent; otherwise pass through.
                                if line_split[8] == ".":
                                    newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \
                                              line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \
                                              line_split[6] + " " + \
                                              line_split[7] + " " + line_split[15] + " " + \
                                              line_split[9] + " " + \
                                              line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \
                                              line_split[13] + \
                                              " " + line_split[14] + " " + line_split[15] + " " + \
                                              line_split[16] + " " + line_split[17] + " " + line_split[18] + " "+ line_split[19] +"\n"
                                    newciffile.write(newline)
                                else:
                                    newciffile.write(line)
                    else:
                        newciffile.write(line)
492 | # FAPA MAY 2024 TEST ENDS
--------------------------------------------------------------------------------
/Notebooks/Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1a9e8a53",
6 | "metadata": {},
7 | "source": [
8 | "# Assign MolID to the entities found in the CIF files (2) "
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "bcec58a5",
14 | "metadata": {},
15 | "source": [
16 | "## What is the goal of this notebook?\n",
17 | "\n",
18 | "This is a continuation from `Assign MolID to the entities found in the CIF files (1)`.\n",
19 | "In this notebook we will show what happens when you assign the same name to different chains, because you want to concatenate them. For example, if you want to make all the waters or ions be in the same chain. \n",
20 | "\n",
21 | "**Note:** Make sure to run part 1 of this step in advance."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "id": "3771f378",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "## First, import library and setup directories"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "id": "05789e90",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from PDBClean import pdbclean_io"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "id": "ee726354",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "PROJDIR=\"./TIM/\""
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "id": "2fa40724",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='simple_bank_sub2')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 5,
67 | "id": "968b5aae",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# Let's copy the same structures we selected on step 2.1 \n",
72 | "\n",
73 | "! cp $PROJDIR/simple_bank_sub/*cif $PROJDIR/simple_bank_sub2/"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 6,
79 | "id": "afbff1c4",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank2')"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "id": "abfe9d89",
89 | "metadata": {},
90 | "source": [
91 | "### Running PDBClean_MolID_CIF.py\n",
92 | "\n",
93 | "Remember that the way to run this script in the terminal is as following:\n",
94 | "\n",
95 | "> PDBClean_MolID_CIF.py `{Input Directory}` `{Output Directory}`\n",
96 | "\n",
97 | "The input directory contains the structures that we generated in Step 1. The output directory is where the new structures will be stored. "
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "id": "db035e81",
103 | "metadata": {},
104 | "source": [
105 | "### Renaming MolID, how to choose a name? \n",
106 | "\n",
107 | "This is a personal decision. You can decide how to name each entity. In part 2.1 we assigned a different MolID to each entity, as shown in the table below:\n",
108 | "\n",
109 | "| New MolID | ENTITIES |\n",
110 | "|---|:---|\n",
111 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n",
112 | "| B | 1:SN-GLYCEROL-3-PHOSPHATE: |\n",
113 | "| C | 1:SN-GLYCEROL-1-PHOSPHATE: |\n",
114 | "| D,E | 2:GLYCEROL: |\n",
115 | "| F,G | 2:WATER: |\n",
116 | "| H,I | 2:TRIOSEPHOSPHATE ISOMERASE: |\n",
117 | "| J | 1:PHOSPHATE ION: |\n",
118 | "| K,L | 2:2-PHOSPHOGLYCOLIC ACID: | \n",
119 | "\n",
120 | "\n",
121 | "For this example, let's try assigning the same MolID to different entities: \n",
122 | "\n",
123 | "| New MolID | ENTITIES |\n",
124 | "|---|:---|\n",
125 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n",
126 | "| D | 1:SN-GLYCEROL-3-PHOSPHATE: |\n",
127 | "| D | 1:SN-GLYCEROL-1-PHOSPHATE: |\n",
128 | "| D,D | 2:GLYCEROL: |\n",
129 | "| C,C | 2:WATER: |\n",
130 | "| A,B | 2:TRIOSEPHOSPHATE ISOMERASE: |\n",
131 | "| D | 1:PHOSPHATE ION: |\n",
132 | "| D,D | 2:2-PHOSPHOGLYCOLIC ACID: | \n",
133 | "\n"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 7,
139 | "id": "6dcca91c",
140 | "metadata": {
141 | "scrolled": false
142 | },
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Reading: ./TIM//simple_bank_sub2/2y62+00.cif (1 of 6)\n",
149 | "Reading: ./TIM//simple_bank_sub2/1ag1+00.cif (2 of 6)\n",
150 | "Reading: ./TIM//simple_bank_sub2/1aw1+04.cif (3 of 6)\n",
151 | "Reading: ./TIM//simple_bank_sub2/1aw1+02.cif (4 of 6)\n",
152 | "Reading: ./TIM//simple_bank_sub2/1aw1+03.cif (5 of 6)\n",
153 | "Reading: ./TIM//simple_bank_sub2/1aw1+01.cif (6 of 6)\n",
154 | "PDBClean MolID Conversion Build Menu\n",
155 | " Select one of the following options to proceed:\n",
156 | " 1) Show full conversion\n",
157 | " 2) Show only unassigned conversions\n",
158 | " 3) Enter input file\n",
159 | " 4) Search MolID to add chain ID conversion\n",
160 | " 5) Go entry by entry to add chain ID conversion\n",
161 | " 6) Remove a chain ID conversion\n",
162 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
163 | " \n",
164 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
165 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
166 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
167 | "2:GLYCEROL:\n",
168 | "2:WATER:\n",
169 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
170 | "1:PHOSPHATE ION:\n",
171 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
172 | "You need to accept 8 entity conversions\n",
173 | "You need to accept 12 total chain conversions\n",
174 | "PDBClean MolID Conversion Build Menu\n",
175 | " Select one of the following options to proceed:\n",
176 | " 1) Show full conversion\n",
177 | " 2) Show only unassigned conversions\n",
178 | " 3) Enter input file\n",
179 | " 4) Search MolID to add chain ID conversion\n",
180 | " 5) Go entry by entry to add chain ID conversion\n",
181 | " 6) Remove a chain ID conversion\n",
182 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
183 | " \n",
184 | "Option Number: Enter chain IDs for each of the following MolID.\n",
185 | "Comma separated, no spaces\n",
186 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n",
187 | " conversion templates. You can proceed to the next section\n",
188 | " by selection option 7 or, continue to edit your conversion\n",
189 | " template through this menu\n",
190 | " \n",
191 | "PDBClean MolID Conversion Build Menu\n",
192 | " Select one of the following options to proceed:\n",
193 | " 1) Show full conversion\n",
194 | " 2) Show only unassigned conversions\n",
195 | " 3) Enter input file\n",
196 | " 4) Search MolID to add chain ID conversion\n",
197 | " 5) Go entry by entry to add chain ID conversion\n",
198 | " 6) Remove a chain ID conversion\n",
199 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
200 | " \n",
201 | " 7) Continue to next step of curation\n",
202 | "Option Number: PDBClean Concatenations Menu\n",
203 | " -------------------------------\n",
204 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n",
205 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n",
206 | " or ACCEPT concatenations.\n",
207 | "\n",
208 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n",
209 | " be completed.\n",
210 | "\n",
211 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n",
212 | " been accepted. It will also give you the proper format of the input for option 3.\n",
213 | "\n",
214 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n",
215 | " will be printed to screen as they are being accepted. \n",
216 | "\n",
217 | " Select one of the following options to proceed:\n",
218 | " 1) Show all conversions\n",
219 | " 2) Show only unaccepted concatenations\n",
220 | " 3) Search and modify destination chainIDs of proposed concatenations\n",
221 | " 4) Accept proposed concatenation one by one\n",
222 | " (Repeat this step until finalizing option appears)\n",
223 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n",
224 | " \n",
225 | "Option Number: ./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n",
226 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n",
227 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n",
228 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n",
229 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n",
230 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n",
231 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n",
232 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n",
233 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n",
234 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n",
235 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n",
236 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n",
237 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n",
238 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n",
239 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n",
240 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n",
241 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n",
242 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n",
243 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n",
244 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n",
245 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n",
246 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n",
247 | "You need to accept 22 concatenations\n",
248 | "PDBClean Concatenations Menu\n",
249 | " -------------------------------\n",
250 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n",
251 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n",
252 | " or ACCEPT concatenations.\n",
253 | "\n",
254 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n",
255 | " be completed.\n",
256 | "\n",
257 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n",
258 | " been accepted. It will also give you the proper format of the input for option 3.\n",
259 | "\n",
260 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n",
261 | " will be printed to screen as they are being accepted. \n",
262 | "\n",
263 | " Select one of the following options to proceed:\n",
264 | " 1) Show all conversions\n",
265 | " 2) Show only unaccepted concatenations\n",
266 | " 3) Search and modify destination chainIDs of proposed concatenations\n",
267 | " 4) Accept proposed concatenation one by one\n",
268 | " (Repeat this step until finalizing option appears)\n",
269 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n",
270 | " \n",
271 | "Option Number: "
272 | ]
273 | }
274 | ],
275 | "source": [
276 | "! echo '2\\n5\\nA\\nD\\nD\\nD,D\\nC,C\\nA,B\\nD\\nD,D\\n7\\n2\\nQUIT\\n' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub2 $PROJDIR/standard_MolID_bank2"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "id": "dd0f37da",
282 | "metadata": {},
283 | "source": [
284 | "## A pause to explain what is going on:\n",
285 | "\n",
286 | "Notice that a new menu appears when we assign the same MolID to more than one entity. We need to either give a new MolID to the entities, or accept a concatenation. We want to guarantee that you did not assign the same MolID by mistake, so you need to approve each case one by one. \n",
287 | "\n",
288 | "In the cell above, we chose option `2) Show only unaccepted concatenations`. Let's take a look at the output:\n",
289 | "\n",
290 | "`\n",
291 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n",
292 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n",
293 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n",
294 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n",
295 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n",
296 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n",
297 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n",
298 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n",
299 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n",
300 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n",
301 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n",
302 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n",
303 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n",
304 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n",
305 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n",
306 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n",
307 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n",
308 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n",
309 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n",
310 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n",
311 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n",
312 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n",
313 | "You need to accept 22 concatenations\n",
314 | "`\n",
315 | "\n",
316 | "Notice that the format is: \n",
317 | "\n",
318 | "`file name` : `entity` : `original chain ID` : `New MolID we just assigned` : `order of entity with same MolID in CIF file`\n",
319 | "\n",
320 | "To continue running the script, you will need to accept each of these concatenations. For this notebook we only show how to accept three of the proposed concatenations. We recommend doing this step on the terminal, and approving each concatenation one by one. \n",
321 | "\n",
322 | "Choosing menu `4) Accept proposed concatenation one by one` will print one of the concatenations that still needs to be approved. A new menu will appear, and we need to choose option `2) Accept planned concatenation`. This will bring us back to the concatenation menu. We need to repeat this step (choose option 4, and then 2), until the finalize option appears. \n",
323 | "\n",
324 | "Even though it is beneficial to check each merge one by one, it can be very tedious (in the case of ions, users would have to approve hundreds of merges), so we also provide the option to accept all the merges automatically `5) Accept ALL`. PDBCleanV2 will still print all merges to the screen, so users can verify that everything is fine.\n",
325 | "\n",
326 | "Once all concatenations have been accepted, an option to finalize the curation will appear.\n"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 8,
332 | "id": "7821c381",
333 | "metadata": {
334 | "scrolled": false
335 | },
336 | "outputs": [
337 | {
338 | "name": "stdout",
339 | "output_type": "stream",
340 | "text": [
341 | "Reading: ./TIM//simple_bank_sub2/2y62+00.cif (1 of 6)\n",
342 | "Reading: ./TIM//simple_bank_sub2/1ag1+00.cif (2 of 6)\n",
343 | "Reading: ./TIM//simple_bank_sub2/1aw1+04.cif (3 of 6)\n",
344 | "Reading: ./TIM//simple_bank_sub2/1aw1+02.cif (4 of 6)\n",
345 | "Reading: ./TIM//simple_bank_sub2/1aw1+03.cif (5 of 6)\n",
346 | "Reading: ./TIM//simple_bank_sub2/1aw1+01.cif (6 of 6)\n",
347 | "PDBClean MolID Conversion Build Menu\n",
348 | " Select one of the following options to proceed:\n",
349 | " 1) Show full conversion\n",
350 | " 2) Show only unassigned conversions\n",
351 | " 3) Enter input file\n",
352 | " 4) Search MolID to add chain ID conversion\n",
353 | " 5) Go entry by entry to add chain ID conversion\n",
354 | " 6) Remove a chain ID conversion\n",
355 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
356 | " \n",
357 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n",
358 | "1:SN-GLYCEROL-3-PHOSPHATE:\n",
359 | "1:SN-GLYCEROL-1-PHOSPHATE:\n",
360 | "2:GLYCEROL:\n",
361 | "2:WATER:\n",
362 | "2:TRIOSEPHOSPHATE ISOMERASE:\n",
363 | "1:PHOSPHATE ION:\n",
364 | "2:2-PHOSPHOGLYCOLIC ACID:\n",
365 | "You need to accept 8 entity conversions\n",
366 | "You need to accept 12 total chain conversions\n",
367 | "PDBClean MolID Conversion Build Menu\n",
368 | " Select one of the following options to proceed:\n",
369 | " 1) Show full conversion\n",
370 | " 2) Show only unassigned conversions\n",
371 | " 3) Enter input file\n",
372 | " 4) Search MolID to add chain ID conversion\n",
373 | " 5) Go entry by entry to add chain ID conversion\n",
374 | " 6) Remove a chain ID conversion\n",
375 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
376 | " \n",
377 | "Option Number: Enter chain IDs for each of the following MolID.\n",
378 | "Comma separated, no spaces\n",
379 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n",
380 | " conversion templates. You can proceed to the next section\n",
381 | " by selection option 7 or, continue to edit your conversion\n",
382 | " template through this menu\n",
383 | " \n",
384 | "PDBClean MolID Conversion Build Menu\n",
385 | " Select one of the following options to proceed:\n",
386 | " 1) Show full conversion\n",
387 | " 2) Show only unassigned conversions\n",
388 | " 3) Enter input file\n",
389 | " 4) Search MolID to add chain ID conversion\n",
390 | " 5) Go entry by entry to add chain ID conversion\n",
391 | " 6) Remove a chain ID conversion\n",
392 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n",
393 | " \n",
394 | " 7) Continue to next step of curation\n",
395 | "Option Number: PDBClean Concatenations Menu\n",
396 | " -------------------------------\n",
397 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n",
398 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n",
399 | " or ACCEPT concatenations.\n",
400 | "\n",
401 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n",
402 | " be completed.\n",
403 | "\n",
404 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n",
405 | " been accepted. It will also give you the proper format of the input for option 3.\n",
406 | "\n",
407 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n",
408 | " will be printed to screen as they are being accepted. \n",
409 | "\n",
410 | " Select one of the following options to proceed:\n",
411 | " 1) Show all conversions\n",
412 | " 2) Show only unaccepted concatenations\n",
413 | " 3) Search and modify destination chainIDs of proposed concatenations\n",
414 | " 4) Accept proposed concatenation one by one\n",
415 | " (Repeat this step until finalizing option appears)\n",
416 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n",
417 | " \n",
418 | "Option Number: ./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n",
419 | "Select one of the following options to proceed:\n",
420 | " 1) Perform new search\n",
421 | " 2) Accept planned concatenation\n",
422 | " \n",
423 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n",
424 | "Select one of the following options to proceed:\n",
425 | " 1) Perform new search\n",
426 | " 2) Accept planned concatenation\n",
427 | " \n",
428 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n",
429 | "Select one of the following options to proceed:\n",
430 | " 1) Perform new search\n",
431 | " 2) Accept planned concatenation\n",
432 | " \n",
433 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n",
434 | "Select one of the following options to proceed:\n",
435 | " 1) Perform new search\n",
436 | " 2) Accept planned concatenation\n",
437 | " \n",
438 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n",
439 | "Select one of the following options to proceed:\n",
440 | " 1) Perform new search\n",
441 | " 2) Accept planned concatenation\n",
442 | " \n",
443 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n",
444 | "Select one of the following options to proceed:\n",
445 | " 1) Perform new search\n",
446 | " 2) Accept planned concatenation\n",
447 | " \n",
448 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n",
449 | "Select one of the following options to proceed:\n",
450 | " 1) Perform new search\n",
451 | " 2) Accept planned concatenation\n",
452 | " \n",
453 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n",
454 | "Select one of the following options to proceed:\n",
455 | " 1) Perform new search\n",
456 | " 2) Accept planned concatenation\n",
457 | " \n",
458 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n",
459 | "Select one of the following options to proceed:\n",
460 | " 1) Perform new search\n",
461 | " 2) Accept planned concatenation\n",
462 | " \n",
463 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n",
464 | "Select one of the following options to proceed:\n",
465 | " 1) Perform new search\n",
466 | " 2) Accept planned concatenation\n",
467 | " \n",
468 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n",
469 | "Select one of the following options to proceed:\n",
470 | " 1) Perform new search\n",
471 | " 2) Accept planned concatenation\n",
472 | " \n",
473 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n",
474 | "Select one of the following options to proceed:\n",
475 | " 1) Perform new search\n",
476 | " 2) Accept planned concatenation\n",
477 | " \n",
478 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n",
479 | "Select one of the following options to proceed:\n",
480 | " 1) Perform new search\n",
481 | " 2) Accept planned concatenation\n",
482 | " \n",
483 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n",
484 | "Select one of the following options to proceed:\n",
485 | " 1) Perform new search\n",
486 | " 2) Accept planned concatenation\n",
487 | " \n",
488 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n",
489 | "Select one of the following options to proceed:\n",
490 | " 1) Perform new search\n",
491 | " 2) Accept planned concatenation\n",
492 | " \n",
493 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n",
494 | "Select one of the following options to proceed:\n",
495 | " 1) Perform new search\n",
496 | " 2) Accept planned concatenation\n",
497 | " \n",
498 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n",
499 | "Select one of the following options to proceed:\n",
500 | " 1) Perform new search\n",
501 | " 2) Accept planned concatenation\n",
502 | " \n",
503 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n",
504 | "Select one of the following options to proceed:\n",
505 | " 1) Perform new search\n",
506 | " 2) Accept planned concatenation\n",
507 | " \n",
508 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n",
509 | "Select one of the following options to proceed:\n",
510 | " 1) Perform new search\n",
511 | " 2) Accept planned concatenation\n",
512 | " \n",
513 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n",
514 | "Select one of the following options to proceed:\n",
515 | " 1) Perform new search\n",
516 | " 2) Accept planned concatenation\n",
517 | " \n",
518 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n",
519 | "Select one of the following options to proceed:\n",
520 | " 1) Perform new search\n",
521 | " 2) Accept planned concatenation\n",
522 | " \n",
523 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n",
524 | "Select one of the following options to proceed:\n",
525 | " 1) Perform new search\n",
526 | " 2) Accept planned concatenation\n",
527 | " \n",
528 | "0\n",
529 | "Congratulations! You have successfully constructed your\n",
530 | " conversion templates.You can proceed to the next section\n",
531 | " by selection option 6 or, continue to edit your conversion\n",
532 | " template through this menu\n",
533 | " \n",
534 | "PDBClean Concatenations Menu\n",
535 | " -------------------------------\n",
536 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n",
537 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n",
538 | " or ACCEPT concatenations.\n",
539 | "\n",
540 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n",
541 | " be completed.\n",
542 | "\n",
543 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n",
544 | " been accepted. It will also give you the proper format of the input for option 3.\n",
545 | "\n",
546 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n",
547 | " will be printed to screen as they are being accepted. \n",
548 | "\n",
549 | " Select one of the following options to proceed:\n",
550 | " 1) Show all conversions\n",
551 | " 2) Show only unaccepted concatenations\n",
552 | " 3) Search and modify destination chainIDs of proposed concatenations\n",
553 | " 4) Accept proposed concatenation one by one\n",
554 | " (Repeat this step until finalizing option appears)\n",
555 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n",
556 | " \n",
557 | " 6) Finalize Curation\n",
558 | "Option Number: Finalizing Curation ...\n"
559 | ]
560 | }
561 | ],
562 | "source": [
563 | "! echo \"2\\n5\\nA\\nD\\nD\\nD,D\\nC,C\\nA,B\\nD\\nD,D\\n7\\n5\\n6\" | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub2 $PROJDIR/standard_MolID_bank2"
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": null,
569 | "id": "a65e7eaa",
570 | "metadata": {},
571 | "outputs": [],
572 | "source": []
573 | }
574 | ],
575 | "metadata": {
576 | "kernelspec": {
577 | "display_name": "PDBCleanV2",
578 | "language": "python",
579 | "name": "pdbcleanv2"
580 | },
581 | "language_info": {
582 | "codemirror_mode": {
583 | "name": "ipython",
584 | "version": 3
585 | },
586 | "file_extension": ".py",
587 | "mimetype": "text/x-python",
588 | "name": "python",
589 | "nbconvert_exporter": "python",
590 | "pygments_lexer": "ipython3",
591 | "version": "3.10.5"
592 | }
593 | },
594 | "nbformat": 4,
595 | "nbformat_minor": 5
596 | }
597 |
--------------------------------------------------------------------------------