├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── PDBClean-0.0.2.iml
├── images
│   └── FlowChart.png
├── Notebooks
│   ├── images
│   │   ├── JalviewTIMB_2.png
│   │   ├── TIMJalview0_2.png
│   │   └── TIM_PyMOL_CatalyticResidues.png
│   ├── CheckProject_CheckCreateDelete.ipynb
│   ├── Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb
│   ├── Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb
│   └── Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb
├── src
│   ├── __init__.py
│   ├── listutils.py
│   ├── pdbclean_io.py
│   ├── pdbutils.py
│   ├── cleanutils.py
│   ├── alignmentutils.py
│   └── pdbcleanresiduestandardizationutils.py
├── LICENSE
├── CONTRIBUTING.md
├── scripts
│   ├── PDBClean_ResidueStandardization_CIF.py
│   ├── PDBClean_ChainStandardization_CIF.py
│   └── PDBClean_MolID_CIF.py
├── CODE_OF_CONDUCT.md
├── setup.py
├── environment_M1.yml
├── environment.yml
└── README.md

--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml

--------------------------------------------------------------------------------
/images/FlowChart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/images/FlowChart.png

--------------------------------------------------------------------------------
/Notebooks/images/JalviewTIMB_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/JalviewTIMB_2.png

--------------------------------------------------------------------------------
/Notebooks/images/TIMJalview0_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/TIMJalview0_2.png

--------------------------------------------------------------------------------
/Notebooks/images/TIM_PyMOL_CatalyticResidues.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fatipardo/PDBCleanV2/HEAD/Notebooks/images/TIM_PyMOL_CatalyticResidues.png

--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML markup lost during text extraction; IDE version-control settings file)

--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(XML markup lost during text extraction; IDE inspection-profile settings file)

--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML markup lost during text extraction; IDE module registry file)

--------------------------------------------------------------------------------
/.idea/PDBClean-0.0.2.iml:
--------------------------------------------------------------------------------
(XML markup lost during text extraction; IDE module definition file)

--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
# PDBClean base dir

import logging
import glob
import os
import sys


# list all the files included in PDBClean
__all__ = [os.path.basename(f)[:-3] for f in glob.glob(os.path.dirname(__file__) + "/*.py") if not f.endswith('__init__.py')]

# set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

sh = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(fmt='%(asctime)s - %(message)s', datefmt="%H:%M:%S")
sh.setFormatter(formatter)

logger.addHandler(sh)
logger.propagate = False
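A minimal sketch of how downstream code can reuse the package logger configured above, assuming the package is installed as `PDBClean` (the name `setup.py` maps onto `src/`):

```python
# Sketch: reuse the logger defined in src/__init__.py.
from PDBClean import logger

logger.info("starting curation run")  # printed to stdout as "HH:MM:SS - starting curation run"
```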
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Michael Levitt's Lab at Stanford University

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/listutils.py:
--------------------------------------------------------------------------------
import os

def remove_file_defined_chain_from_list(chid_list):
    """
    Removes specified chain IDs from the list based on the user's file input.

    The user is prompted for the name of a file containing the chain IDs they
    want removed. Those chain IDs are then removed from the list.

    Parameters:
    -----------
    chid_list : list
        Contains all the chain IDs from the CIF file(s)

    Returns:
    --------
    chid_list : list
        Updated list without the chain IDs read from the user's file
    """
    remove_chid = []
    print(" Enter the file name containing the list of chain IDs you want removed from Standard Sequences.")
    user_input = input('File: ')
    if os.path.isfile(user_input):
        # read one chain ID per line
        with open(user_input) as my_file:
            for line in my_file:
                remove_chid.append(line.strip())
    else:
        print("File does not exist.")
    return remove_chid_from_list(chid_list, remove_chid)

def remove_user_defined_chain_from_list(chid_list):
    """
    Removes chain IDs from the list based on the user's interactive input.

    The user is prompted to enter each chain ID they wish to remove,
    one at a time, and enters DONE to finish.

    Parameters:
    -----------
    chid_list : list
        Contains all the chain IDs from the CIF file(s)

    Returns:
    --------
    chid_list : list
        Updated list without the chain IDs entered by the user
    """
    remove_chid = []
    print(" Enter chain IDs of the chains you want removed. When done, enter DONE.")
    while True:
        user_input = input('Chain ID: ')
        if user_input == "DONE":
            break
        remove_chid.append(user_input)
    return remove_chid_from_list(chid_list, remove_chid)

def remove_chid_from_list(chid_list, remove_list):
    """
    Removes the given chain IDs from the list.

    Parameters:
    -----------
    chid_list : list
        contains the chain IDs
    remove_list : list
        contains the chain IDs to be removed

    Returns:
    --------
    chid_list : list
        Updated list without the removed chain IDs
    """
    for elt in remove_list:
        if elt in chid_list:
            chid_list.remove(elt)
    return chid_list

def show_list(items):
    """
    Prints each item in a list.

    Parameters:
    -----------
    items : list
        A list of items to be printed.

    Returns:
    --------
    None
    """
    for elt in items:
        print(elt)
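The first two helpers are interactive; the last two can also be called directly. A short sketch of calling them (the chain IDs are made up for illustration):

```python
from PDBClean import listutils

# Hypothetical chain IDs, e.g. collected from a set of CIF files.
chains = ['A', 'B', 'C', 'D']

chains = listutils.remove_chid_from_list(chains, ['B', 'D'])  # drop two IDs
listutils.show_list(chains)  # prints A and C, one per line
```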
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# How to contribute to PDBCleanV2

Hello! Thank you for helping us improve our project!

Please read and follow our guidelines to ensure a positive experience for contributors and maintainers.

## :page_with_curl: Code of Conduct

Before you start, review our [Code of Conduct](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/CODE_OF_CONDUCT.md). Bad behavior will not be tolerated.

## :incoming_envelope: Opening an Issue

Sign in to your GitHub account and create a **New Issue** in [GitHub issues](https://github.com/fatipardo/PDBClean-0.0.2/issues).

Before you create a new entry, please check whether the issue has already been reported in [GitHub](https://github.com/fatipardo/PDBClean-0.0.2/issues), and review our [documentation](https://github.com/fatipardo/PDBClean-0.0.2/tree/master/Notebooks).

Make sure to select an appropriate label for the issue before you submit it (e.g. bug, enhancement, etc.).

### :bug: Bug reports

Please write the kind of bug report you would like to receive yourself.

- Include the operating system and processor on which the code is being run.
- Include where in the code the issue arose.
- If you get an error message, include it in the issue.
- Use [GitHub-flavored Markdown](https://help.github.com/en/github/writing-on-github/basic-writing-and-formatting-syntax). In particular, put code blocks and console output in backticks (```). This improves readability.

> **If an issue already exists**, comment on it to add more information, or leave a reaction. This helps us see which issues are most common and lets us prioritize which to solve first.

### :art: Feature requests

- Make sure to select the enhancement label when submitting an issue.
- Be precise about the proposed outcome of the feature and how it relates to existing features. Include implementation details if possible.
- Do not open a duplicate feature request. Search for existing feature requests first. If you find your feature (or one very similar) previously requested, comment on that issue.

## :construction: Do you want to help fix an issue?

- Comment "take" on the issue you want to fix and we will assign it to you.
- When you submit a pull request, add the GitHub issue number in the title.
- **Only submit pull requests for issues that have been assigned to you.**

## :purple_heart: Credits

Written by [@fatipardo](https://github.com/fatipardo) and [@gdkwxn](https://github.com/gdkwxn).

Many of the ideas and prose in this document were based on or inspired by the [contributing page](https://github.com/jessesquires/.github/blob/main/CONTRIBUTING.md) written by [@jessesquires](https://github.com/jessesquires).

--------------------------------------------------------------------------------
/scripts/PDBClean_ResidueStandardization_CIF.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

from __future__ import print_function
from __future__ import division
import sys, glob
from PDBClean import pdbcleanresiduestandardizationutils as resstd

########################
# READ INPUT ARGUMENTS #
########################
n_arg = len(sys.argv)
if(n_arg<3):
    print('Usage error: {0} <source_dir> <target_dir>'.format(sys.argv[0]))
    sys.exit()
source_dir=sys.argv[1]
target_dir=sys.argv[2]


#############################################
# READ PDB FILES AND DEFINE STRUCTURE LISTS #
#############################################
filelist=glob.glob(source_dir+'/*.cif')
Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list = resstd.pdb_to_structurelists(filelist)


############################################
# INTERACTIVE RESIDUE STANDARDIZATION MENU #
############################################
input_menu = ""
input_menu_check = ""

while(input_menu != "QUIT"):
    print("PDBClean Residue Number Standardization Menu",
          "    After checking all structures are loaded, select option 1 to proceed:",
          "    1) Proceed to multiple alignment menu",
          sep="\n")
    if(input_menu_check == "1"):
        print("    2) View conversion template")
        print("    3) Perform residue number standardization")
        print("    4) Save conversion template")
    print("    OR Type QUIT to exit")
    input_menu = input('Option Number: ')
    if (input_menu == "1"):
        Structure_Sequences_Aligned, Structure_ConversionTemplate, chid_list, input_menu_check = resstd.perform_multiple_alignment(Structure_Sequences,
                                                                                                                                   ChID_ResiNum_Vector,
                                                                                                                                   structid_list,
                                                                                                                                   chid_list,
                                                                                                                                   input_menu_check)
    elif (input_menu == "2" and input_menu_check == "1"):
        resstd.show_conversiontemplate(Structure_ConversionTemplate)
    #elif (input_menu == "3" and input_menu_check == "1"):
    #    resstd.conversiontemplate_to_pdb(filelist, Structure_ConversionTemplate, target_dir=target_dir)
    elif (input_menu == "4" and input_menu_check == "1"):
        resstd.write_and_show_conversiontemplate(Structure_ConversionTemplate, target_dir, True)
    elif (input_menu == "3" and input_menu_check == "1"):
        resstd.conversiontemplate_to_pdb_FAPA(filelist, Structure_ConversionTemplate, target_dir=target_dir)

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free
experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. Enforcement of these policies may include warnings and/or banning. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | 60 | ## Attribution 61 | 62 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 63 | version 2.0, available at 64 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 65 | 66 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 67 | enforcement ladder](https://github.com/mozilla/diversity). 68 | 69 | [homepage]: https://www.contributor-covenant.org 70 | 71 | For answers to common questions about this code of conduct, see the FAQ at 72 | https://www.contributor-covenant.org/faq. Translations are available at 73 | https://www.contributor-covenant.org/translations. 
74 |

--------------------------------------------------------------------------------
/scripts/PDBClean_ChainStandardization_CIF.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8
#
from __future__ import print_function
from __future__ import division
import sys, glob
from PDBClean import pdbcleanchainstandardizationutils as chainstd

########################
# READ INPUT ARGUMENTS #
########################
n_arg = len(sys.argv)
if(n_arg<3):
    print('Usage error: {0} <source_dir> <target_dir>'.format(sys.argv[0]))
    sys.exit()
source_dir=sys.argv[1]
target_dir=sys.argv[2]


#############################################
# READ PDB FILES AND DEFINE STRUCTURE LISTS #
#############################################

filelist=glob.glob(source_dir+'/*.cif')
Structure_Sequences, structid_list, chid_list = chainstd.pdb_to_structurelists(filelist)
Standard_Sequences = {}


############################################
# INTERACTIVE ChainID STANDARDIZATION MENU #
############################################
input_menu = ""
input_menu_check_1 = ""

while(input_menu != "QUIT"):
    print("PDBClean ChainID Standardization Menu",
          "    Select one of the following options to proceed:",
          "    1) Select Standard Sequences from a chosen input structure",
          "    2) Generate Standard Sequences based on all the input structures",
          sep="\n")
    if(input_menu_check_1 == "1"):
        print("    3) Inspect/Edit Standard Sequences",
              "    4) Perform Standardization of Chain IDs",
              sep="\n")
    input_menu = input('Option Number: ')
    if (input_menu == "1"):
        Standard_Sequences, input_menu_check_1 = chainstd.select_standard_seq_from_reference(Structure_Sequences,
                                                                                             Standard_Sequences,
                                                                                             structid_list,
                                                                                             input_menu_check_1)
    elif (input_menu == "2"):
        Standard_Sequences, input_menu_check_1 = chainstd.create_standard_seq_from_consensus(Structure_Sequences,
                                                                                             Standard_Sequences,
                                                                                             chid_list,
                                                                                             input_menu_check_1)
        print("These are the standard sequences:")
        print(Standard_Sequences)
    elif (input_menu == "3" and input_menu_check_1 == "1"):
        chainstd.review_standard_seq(Structure_Sequences, Standard_Sequences)
    elif (input_menu == "4" and input_menu_check_1=="1"):
        chainstd.align_to_std_seq_and_save_to_disk(Structure_Sequences,
                                                   Standard_Sequences,
                                                   structid_list,
                                                   filelist,
                                                   target_dir=target_dir)
        print("Done!")
        input_menu = "QUIT"

--------------------------------------------------------------------------------
/src/pdbclean_io.py:
--------------------------------------------------------------------------------
1 | import sys, os, shutil, datetime
2 | #
3 |
4 | def check_project(projdir=None, level='top', action='create', verbose=True):
5 |     """
6 |     Manages the project directory by creating, cleaning, or deleting directories.
7 |
8 |     Parameters:
9 |     -----------
10 |     projdir : str, optional
11 |         The path to the project directory. If None, a message will display asking to provide the path.
12 |     level : str, optional
13 |         Specifies the directory level. Default is 'top', meaning the project directory itself.
14 |         You can specify a subdirectory within the project directory.
15 |     action : str, optional
16 |         The action to perform on the directory. Options are:
17 |         - 'create': Create the directory if it doesn't already exist.
18 | - 'clean': Remove all files in the directory, leaving it empty. 19 | - 'delete': Deletes the directory and everything within it. 20 | verbose : bool, optional 21 | If True, prints informative messages about the actions being performed. Default is True. 22 | 23 | Returns: 24 | -------- 25 | None 26 | """ 27 | 28 | if projdir is None: 29 | print("Please provide a project directory path") 30 | else: 31 | dirname = projdir 32 | if(level!='top'): 33 | dirname=dirname+'/'+level 34 | if(action=='create'): 35 | create_dir(dirname, verbose=verbose) 36 | elif(action=='clean'): 37 | clean_dir(dirname, verbose=verbose) 38 | elif(action=='delete'): 39 | delete_dir(dirname, verbose=verbose) 40 | 41 | def create_dir(dirpath, verbose=True): 42 | """ 43 | Creates a directory if it does not exist, and writes a creation timestamp in 'info.txt'. 44 | 45 | Parameters: 46 | ----------- 47 | dirpath : str 48 | The path of the directory to create. 49 | verbose : bool, optional 50 | If True, prints informative messages about the action taken. Default is True. 51 | 52 | Returns: 53 | -------- 54 | None 55 | """ 56 | 57 | if not os.path.exists(dirpath): 58 | os.mkdir(dirpath) 59 | if verbose: 60 | now=datetime.datetime.now() 61 | f = open(dirpath+'/info.txt', 'w') 62 | f.write('directory created on {0}'.format(now)) 63 | f.close() 64 | else: 65 | if verbose: 66 | print('{0} already exists, with content:'.format(dirpath)) 67 | print(os.listdir(dirpath)) 68 | 69 | def clean_dir(dirpath, verbose=True): 70 | """ 71 | Removes all files from the specified directory, leaving it empty. 72 | 73 | Parameters: 74 | ----------- 75 | dirpath : str 76 | The path of the directory to clean. 77 | verbose : bool, optional 78 | If True, a message is printed regarding the action taken. Default is True. 79 | 80 | Returns: 81 | -------- 82 | None 83 | """ 84 | 85 | if os.path.exists(dirpath): 86 | listfile = (file for file in os.listdir(dirpath) if os.path.isfile(os.path.join(dirpath, file))) 87 | if verbose: 88 | print('Cleaning {0}...'.format(dirpath)) 89 | for f in listfile: 90 | os.remove(dirpath+'/'+f) 91 | 92 | def delete_dir(dirpath, verbose=True): 93 | """ 94 | Deletes the specified directory and all of its contents. 95 | 96 | Parameters: 97 | ----------- 98 | dirpath : str 99 | The path of the directory to delete. 100 | verbose : bool, optional 101 | If True, a message is printed regarding the action taken. Default is True. 
102 | 103 | Returns: 104 | -------- 105 | None 106 | """ 107 | 108 | if os.path.exists(dirpath): 109 | shutil.rmtree(dirpath) 110 | if verbose: 111 | print('Deleting {0}...'.format(dirpath)) 112 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py: Install PDBClean 3 | """ 4 | 5 | import os 6 | import sys 7 | import re 8 | import subprocess 9 | from os.path import join as pjoin 10 | from glob import glob 11 | 12 | from distutils.extension import Extension 13 | from distutils.core import setup 14 | 15 | from Cython.Distutils import build_ext 16 | import numpy 17 | 18 | # ------------------------------------------------------------------------------ 19 | # HEADER 20 | # 21 | 22 | VERSION = "0.0.2" 23 | ISRELEASED = False 24 | DISABLE_CUDA = True 25 | __author__ = "Levitt Lab, Stanford" 26 | __version__ = VERSION 27 | 28 | metadata = { 29 | 'name': 'PDBClean', 30 | 'version': VERSION, 31 | 'author': __author__, 32 | 'author_email': 'fpardo@stanford.edu', 33 | 'license': 'MIT', 34 | 'url': 'https://github.com/fatipardo/PDBClean-0.0.2', 35 | 'download_url': 'https://github.com/fatipardo/PDBClean-0.0.2', 36 | 'platforms': ['Linux', 'OSX'], 37 | 'description': "PDB curation tools", 38 | 'long_description': """PDBClean offers curation tools for structural ensemble deposited in the Protein Data Bank."""} 39 | 40 | # ------------------------------------------------------------------------------ 41 | # HELPER FUNCTIONS -- path finding, git, python version, readthedocs 42 | # 43 | 44 | class bcolors: 45 | HEADER = '\033[95m' 46 | OKBLUE = '\033[94m' 47 | OKGREEN = '\033[92m' 48 | WARNING = '\033[93m' 49 | FAIL = '\033[91m' 50 | ENDC = '\033[0m' 51 | 52 | 53 | def print_warning(string): 54 | print(bcolors.WARNING + string + bcolors.ENDC) 55 | 56 | 57 | def find_in_path(name, path): 58 | "Find a file in a search path" 59 | #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 60 | for dir in path.split(os.pathsep): 61 | binpath = pjoin(dir, name) 62 | if os.path.exists(binpath): 63 | return os.path.abspath(binpath) 64 | return None 65 | 66 | 67 | def get_numpy_include(): 68 | """ 69 | Obtain the numpy include directory. This logic works across numpy versions. 70 | """ 71 | try: 72 | numpy_include = numpy.get_include() 73 | except AttributeError: 74 | numpy_include = numpy.get_numpy_include() 75 | return numpy_include 76 | 77 | 78 | def git_version(): 79 | """ 80 | Return the git revision as a string. 
81 | Copied from numpy setup.py 82 | """ 83 | 84 | def _minimal_ext_cmd(cmd): 85 | # construct minimal environment 86 | env = {} 87 | for k in ['SYSTEMROOT', 'PATH']: 88 | v = os.environ.get(k) 89 | if v is not None: 90 | env[k] = v 91 | # LANGUAGE is used on win32 92 | env['LANGUAGE'] = 'C' 93 | env['LANG'] = 'C' 94 | env['LC_ALL'] = 'C' 95 | out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0] 96 | return out 97 | 98 | try: 99 | out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) 100 | GIT_REVISION = out.strip().decode('ascii') 101 | except OSError: 102 | GIT_REVISION = "Unknown" 103 | 104 | return GIT_REVISION 105 | 106 | # ----------------------------------------------------------------------------- 107 | # INSTALL 108 | 109 | metadata['packages'] = ['PDBClean'] 110 | metadata['package_dir'] = {'PDBClean' : 'src'} 111 | metadata['ext_modules'] = [] 112 | metadata['scripts'] = [s for s in glob('scripts/*') if not s.endswith('__.py')] 113 | #metadata['data_files'] = [('reference', glob('./reference/*'))] 114 | #metadata['cmdclass'] = {'build_ext': custom_build_ext} 115 | 116 | # ------------------------------------------------------------------------------ 117 | # 118 | # Finally, print a warning at the *end* of the build if something fails 119 | # 120 | 121 | def print_warnings(): 122 | print("\n") 123 | 124 | if __name__ == '__main__': 125 | setup(**metadata) # ** will unpack dictionary 'metadata' providing the values as arguments 126 | print_warnings() 127 | -------------------------------------------------------------------------------- /environment_M1.yml: -------------------------------------------------------------------------------- 1 | name: PDBCleanV2 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - appnope=0.1.3 7 | - argon2-cffi=21.3.0 8 | - argon2-cffi-bindings=21.2.0 9 | - asttokens=2.0.5 10 | - attrs=21.4.0 11 | - backcall=0.2.0 12 | - backports=1.0 13 | - backports.functools_lru_cache=1.6.4 14 | - beautifulsoup4=4.11.1 15 | - biopython=1.83 16 | - bleach=5.0.1 17 | - brotli=1.0.9 18 | - brotli-bin=1.0.9 19 | - bzip2=1.0.8 20 | - ca-certificates=2023.5.7 21 | - certifi=2023.5.7 22 | - cffi=1.15.1 23 | - cvxopt=1.3.0 24 | - cycler=0.11.0 25 | - cython=0.29.30 26 | - debugpy=1.6.0 27 | - decorator=5.1.1 28 | - defusedxml=0.7.1 29 | - dsdp=5.8 30 | - entrypoints=0.4 31 | - executing=0.8.3 32 | - fftw=3.3.10 33 | - flit-core=3.7.1 34 | - fonttools=4.34.4 35 | - freetype=2.10.4 36 | - gettext=0.19.8.1 37 | - giflib=5.2.1 38 | - glib=2.72.1 39 | - glib-tools=2.72.1 40 | - glpk=4.65 41 | - gmp=6.2.1 42 | - gsl=2.7 43 | - gst-plugins-base=1.20.3 44 | - gstreamer=1.20.3 45 | - icu=70.1 46 | - importlib-metadata=4.11.4 47 | - importlib_resources=5.8.0 48 | - ipykernel=6.15.1 49 | - ipython=8.4.0 50 | - ipython_genutils=0.2.0 51 | - ipywidgets=7.7.1 52 | - jedi=0.18.1 53 | - jinja2=3.1.2 54 | - jpeg=9e 55 | - jsonschema=4.7.2 56 | - jupyter=1.0.0 57 | - jupyter_client=7.3.4 58 | - jupyter_console=6.4.4 59 | - jupyter_core=4.10.0 60 | - jupyterlab_pygments=0.2.2 61 | - jupyterlab_widgets=1.1.1 62 | - kiwisolver=1.4.3 63 | - krb5=1.19.3 64 | - lcms2=2.12 65 | - lerc=3.0 66 | - libblas=3.9.0 67 | - libbrotlicommon=1.0.9 68 | - libbrotlidec=1.0.9 69 | - libbrotlienc=1.0.9 70 | - libcblas=3.9.0 71 | - libclang=14.0.6 72 | - libclang13=14.0.6 73 | - libcxx=14.0.6 74 | - libdeflate=1.12 75 | - libedit=3.1.20191231 76 | - libffi=3.4.2 77 | - libgfortran=5.0.0 78 | - libgfortran5 79 | - libglib=2.72.1 80 | - libiconv=1.16 81 | - 
liblapack=3.9.0 82 | - libllvm14=14.0.6 83 | - libogg=1.3.4 84 | - libopenblas=0.3.20 85 | - libopus=1.3.1 86 | - libpng=1.6.37 87 | - libpq=14.4 88 | - libsodium=1.0.18 89 | - libtiff=4.4.0 90 | - libvorbis=1.3.7 91 | - libwebp=1.2.2 92 | - libwebp-base=1.2.2 93 | - libxcb=1.13 94 | - libzlib=1.2.12 95 | - llvm-openmp=14.0.4 96 | - lz4-c=1.9.3 97 | - markupsafe=2.1.1 98 | - matplotlib=3.5.2 99 | - matplotlib-base=3.5.2 100 | - matplotlib-inline=0.1.3 101 | - metis=5.1.0 102 | - mistune=0.8.4 103 | - mmseqs2=14.7e284 104 | - mpfr=4.1.0 105 | - munkres=1.1.4 106 | - mysql-common=8.0.29 107 | - mysql-libs=8.0.29 108 | - nbclient=0.6.6 109 | - nbconvert=6.5.0 110 | - nbconvert-core=6.5.0 111 | - nbconvert-pandoc=6.5.0 112 | - nbformat=5.4.0 113 | - ncurses=6.3 114 | - nest-asyncio=1.5.5 115 | - notebook=6.4.12 116 | - nspr=4.32 117 | - nss=3.78 118 | - numpy=1.23.1 119 | - openjpeg=2.4.0 120 | - openssl=1.1.1u 121 | - packaging=21.3 122 | - pandas=2.0.0 123 | - pandoc 124 | - pandocfilters=1.5.0 125 | - parso=0.8.3 126 | - patsy=0.5.3 127 | - pcre=8.45 128 | - pexpect=4.8.0 129 | - pickleshare=0.7.5 130 | - pillow=9.2.0 131 | - ply=3.11 132 | - prometheus_client=0.14.1 133 | - prompt-toolkit=3.0.30 134 | - prompt_toolkit=3.0.30 135 | - psutil=5.9.1 136 | - pthread-stubs=0.4 137 | - ptyprocess=0.7.0 138 | - pure_eval=0.2.2 139 | - pycparser=2.21 140 | - pygments=2.12.0 141 | - pyparsing=3.0.9 142 | - pyqt=5.15.7 143 | - pyrsistent=0.18.1 144 | - python=3.10.5 145 | - python-dateutil=2.8.2 146 | - python-fastjsonschema=2.15.3 147 | - python-tzdata=2023.3 148 | - python_abi=3.10 149 | - pytz=2023.3 150 | - pyzmq=23.2.0 151 | - qt-main=5.15.4 152 | - qtconsole=5.3.1 153 | - qtconsole-base=5.3.1 154 | - qtpy=2.1.0 155 | - readline=8.1.2 156 | - scipy=1.8.1 157 | - seaborn=0.12.2 158 | - seaborn-base=0.12.2 159 | - send2trash=1.8.0 160 | - setuptools=63.1.0 161 | - sip=6.6.2 162 | - six=1.16.0 163 | - soupsieve=2.3.1 164 | - sqlite=3.39.0 165 | - stack_data=0.3.0 166 | - statsmodels=0.14.0 167 | - suitesparse=5.10.1 168 | - tbb=2021.5.0 169 | - terminado=0.15.0 170 | - tinycss2=1.1.1 171 | - tk=8.6.12 172 | - toml=0.10.2 173 | - tornado=6.2 174 | - traitlets=5.3.0 175 | - typing_extensions=4.6.3 176 | - tzdata=2022a 177 | - unicodedata2=14.0.0 178 | - wcwidth=0.2.5 179 | - webencodings=0.5.1 180 | - wheel=0.37.1 181 | - widgetsnbextension=3.6.1 182 | - xorg-libxau=1.0.9 183 | - xorg-libxdmcp=1.1.3 184 | - xz=5.2.5 185 | - zeromq=4.3.4 186 | - zipp=3.8.0 187 | - zlib=1.2.12 188 | - zstd=1.5.2 189 | - pip: 190 | - matching==1.4 191 | - pip==22.1.2 192 | - pyqt5-sip==12.11.0 193 | prefix: ~/opt/anaconda3/envs/PDBCleanV2 194 | -------------------------------------------------------------------------------- /src/pdbutils.py: -------------------------------------------------------------------------------- 1 | # 2 | import os 3 | import shutil 4 | import re 5 | import numpy as np 6 | from urllib.request import urlopen 7 | from contextlib import closing, suppress 8 | # 9 | def download_pdb_from_metadata(metadata, projdir=None): 10 | """ 11 | Downloads PDB files based on metadata, in this case the description lines of the fasta files, 12 | and saves them in the specified project directory. 13 | 14 | Parameters: 15 | ----------- 16 | metadata : list of str 17 | A list of metadata strings, from which PDB IDs will be extracted. 18 | projdir : str, optional 19 | The path to the project directory where the PDB files will be saved. If None, a message will display. 
20 | 21 | Returns: 22 | -------- 23 | None 24 | """ 25 | if projdir is None: 26 | print("Please provide a project directory ...") 27 | else: 28 | download_dir=projdir+'/raw_bank' 29 | if not os.path.exists(download_dir): 30 | os.mkdir(download_dir) 31 | idset = get_idset_from_metadata(metadata) 32 | for pdbid in idset: 33 | download_pdb_from_id(pdbid, download_dir=download_dir) 34 | 35 | def download_pdb_from_id(pdbid, pdbformat='.cif', download_dir=None): 36 | """ 37 | Downloads a specific PDB file using its ID and saves it in the specified directory. 38 | 39 | Parameters: 40 | ----------- 41 | pdbid : str 42 | The PDB ID of the file that will be downloaded. 43 | pdbformat : str, optional 44 | The format of the PDB file to be downloaded. Default is '.cif'. 45 | download_dir : str, optional 46 | The directory where the downloaded file will be saved. If None, a message will display. 47 | 48 | Returns: 49 | -------- 50 | None 51 | """ 52 | download_url='https://files.rcsb.org/download/' 53 | if download_dir is None: 54 | print("Please provide a directory where to store downloaded files...") 55 | else: 56 | target = download_dir+'/'+pdbid+pdbformat 57 | source = download_url+pdbid.upper()+pdbformat 58 | download_from_url(source, target) 59 | 60 | def get_idset_from_metadata(metadata): 61 | """ 62 | Extracts and returns a set of unique PDB IDs from the provided metadata. 63 | 64 | Parameters: 65 | ----------- 66 | metadata : list of str 67 | A list of metadata strings, each containing a PDB ID. 68 | 69 | Returns: 70 | -------- 71 | idlist : list of str 72 | A sorted list of unique PDB IDs extracted from the metadata. 73 | """ 74 | idlist = [] 75 | for elt in metadata: 76 | idlist.append(elt[1:5]) 77 | return sorted(set(idlist)) 78 | 79 | # 80 | def retrieve_sequence_from_PDB(keyword, mode='sequence', update=True, seqfile=None): 81 | """ 82 | Retrieves sequences or metadata from a PDB sequence file based on the keyword match. 83 | 84 | Parameters: 85 | --------- 86 | keyword : str 87 | The keyword to search for in the sequence or metadata. 88 | mode : str, optional 89 | Specifies whether to match the keyword in the 'sequence' or 'metadata'. Default is 'sequence'. 90 | update : bool, optional 91 | If True, the sequence file will be downloaded or updated before searching. Default is True. 92 | seqfile : str, optional 93 | The path to the sequence file. If None, the file will be downloaded if update is True. 94 | 95 | Returns: 96 | -------- 97 | sequence : numpy.ndarray 98 | A list of sequences that match the keyword. 99 | metadata : numpy.ndarray 100 | A list of metadata associated with the matching sequences (fasta files description line). 101 | """ 102 | if update: 103 | with suppress(FileNotFoundError): 104 | os.remove(seqfile) # remove existing seqfile if any 105 | seqfile = retrieve_seqfile(seqfile=seqfile) 106 | metadata = [] 107 | sequence = [] 108 | with open(seqfile) as f: 109 | nextline=False 110 | prevline='#' 111 | for line in f: 112 | if nextline: 113 | sequence.append(line) 114 | nextline=False 115 | else: 116 | hit = re.findall(keyword, line, flags=re.I) 117 | if hit: 118 | if(mode=='sequence'): 119 | metadata.append(prevline) 120 | sequence.append(line) 121 | elif(mode=='metadata'): 122 | metadata.append(line) 123 | nextline = True 124 | prevline=line 125 | return np.atleast_1d(sequence), np.atleast_1d(metadata) 126 | # 127 | def retrieve_seqfile(seqfile=None): 128 | """ 129 | Downloads the PDB sequence file from the official RCSB FTP site. 
130 | 131 | Parameters: 132 | ----------- 133 | seqfile : str, optional 134 | The path where the sequence file will be saved. If None, the file will be saved as 'seqfile.txt'. 135 | 136 | Returns: 137 | -------- 138 | seqfile : str 139 | The path to the downloaded sequence file. 140 | """ 141 | #sequrl='ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt' FAPA CHANGED ADDRESS FEB 2025 142 | sequrl='https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt' 143 | if seqfile is None: 144 | seqfile='seqfile.txt' 145 | download_from_url(sequrl, seqfile) 146 | return seqfile 147 | # 148 | def download_from_url(source, target): 149 | """ 150 | Downloads a file from a given URL and saves it to a specified target location. 151 | 152 | Parameters: 153 | ----------- 154 | source : str 155 | The URL of the file to be downloaded. 156 | target : str 157 | The path where the downloaded file will be saved. 158 | 159 | Returns: 160 | -------- 161 | None 162 | """ 163 | with closing(urlopen(source)) as r: 164 | with open(target, 'wb') as f: 165 | shutil.copyfileobj(r,f) 166 | print('wrote {0} from {1}'.format(target, source)) 167 | 168 | -------------------------------------------------------------------------------- /Notebooks/CheckProject_CheckCreateDelete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0667c937", 6 | "metadata": {}, 7 | "source": [ 8 | "# How to use 'Check Project'\n", 9 | "\n", 10 | "In this Notebook we will demonstrate how to use 'pdbclean_io.check_project'.\n", 11 | "This function helps you check if a directory exists, create it, or delete it. \n", 12 | "\n", 13 | "Note: We are running this tutorial after finishing steps 1 and 2." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "id": "cbe7e596", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from PDBClean import pdbclean_io" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "82634f8e", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Path to project directory\n", 34 | "PROJDIR=\"./TIM\"" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "id": "c47d581a", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "ls: ./TIM: No such file or directory\r\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# Let's check first what the project directory contains\n", 53 | "!ls $PROJDIR" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "id": "8c46605c", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Action 'create' will create a directory\n", 64 | "# With Option 'level' you can name the new directory\n", 65 | "# The new directory contains text file 'info.txt' with the date when the directory was created\n", 66 | "\n", 67 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "id": "6af82697", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "\u001b[34mclean_bank\u001b[m\u001b[m \u001b[34mraw_bank\u001b[m\u001b[m \u001b[34msimple_bank\u001b[m\u001b[m\r\n", 81 | "info.txt seqres.txt \u001b[34mstandard_MolID_bank\u001b[m\u001b[m\r\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# Run the next 2 cells to verify that 
the directory was created, and that it contains the info.txt file.\n", 87 | "!ls $PROJDIR" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "id": "4504b8bd", 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "directory created on 2022-08-23 23:41:44.214555" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "!cat $PROJDIR/standard_MolID_bank/info.txt" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "id": "ee1cd175", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Cleaning ./TIM/standard_MolID_bank...\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "# action 'clean' will remove all files inside the directory 'level'\n", 124 | "pdbclean_io.check_project(projdir=PROJDIR, action='clean', level='standard_MolID_bank')" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 8, 130 | "id": "e275c246", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Run this cell to verify that the directory we just created is now empty\n", 135 | "!ls $PROJDIR/standard_MolID_bank/" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "id": "4015d784", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Deleting ./TIM/standard_MolID_bank...\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# action 'delete' will delete the 'level' directory\n", 154 | "pdbclean_io.check_project(projdir=PROJDIR, action='delete', level='standard_MolID_bank')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 10, 160 | "id": "9964d932", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "\u001b[34mclean_bank\u001b[m\u001b[m info.txt \u001b[34mraw_bank\u001b[m\u001b[m seqres.txt \u001b[34msimple_bank\u001b[m\u001b[m\r\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "# Run this cell to verify that the directory has been removed\n", 173 | "!ls $PROJDIR/" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 11, 179 | "id": "18f26559", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# Now, let's create the directory again\n", 184 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 12, 190 | "id": "82722c14", 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "./TIM/standard_MolID_bank already exists, with content:\n", 198 | "['info.txt']\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "# Notice that if we run the same command twice, or if the directory already exists,\n", 204 | "# the contents of the directory will be printed to screen. 
\n", 205 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "c6305b7b", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "PDBCleanV2", 220 | "language": "python", 221 | "name": "PDBCleanV2" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.10.4" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 5 238 | } 239 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: PDBCleanV2 2 | channels: 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - appnope=0.1.3=pyhd8ed1ab_0 7 | - argon2-cffi=21.3.0=pyhd8ed1ab_0 8 | - argon2-cffi-bindings=21.2.0=py310h1961e1f_2 9 | - asttokens=2.0.5=pyhd8ed1ab_0 10 | - attrs=21.4.0=pyhd8ed1ab_0 11 | - backcall=0.2.0=pyh9f0ad1d_0 12 | - backports=1.0=py_2 13 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 14 | - beautifulsoup4=4.11.1=pyha770c72_0 15 | - biopython=1.83 16 | - bleach=5.0.1=pyhd8ed1ab_0 17 | - brotli=1.0.9=h5eb16cf_7 18 | - brotli-bin=1.0.9=h5eb16cf_7 19 | - bzip2=1.0.8=h0d85af4_4 20 | - ca-certificates=2023.5.7=h8857fd0_0 21 | - certifi=2023.5.7=pyhd8ed1ab_0 22 | - cffi=1.15.1=py310h96bbf6e_0 23 | - cvxopt=1.3.0=py310hed5b562_1 24 | - cycler=0.11.0=pyhd8ed1ab_0 25 | - cython=0.29.30=py310hd4537e4_0 26 | - debugpy=1.6.0=py310h9d931ec_0 27 | - decorator=5.1.1=pyhd8ed1ab_0 28 | - defusedxml=0.7.1=pyhd8ed1ab_0 29 | - dsdp=5.8=h6e329d1_1203 30 | - entrypoints=0.4=pyhd8ed1ab_0 31 | - executing=0.8.3=pyhd8ed1ab_0 32 | - fftw=3.3.10=nompi_hf082fe4_102 33 | - flit-core=3.7.1=pyhd8ed1ab_0 34 | - fonttools=4.34.4=py310h6c45266_0 35 | - freetype=2.10.4=h4cff582_1 36 | - gettext=0.19.8.1=hd1a6beb_1008 37 | - giflib=5.2.1=hbcb3906_2 38 | - glib=2.72.1=h2292cb8_0 39 | - glib-tools=2.72.1=h2292cb8_0 40 | - glpk=4.65=h0f52abe_1004 41 | - gmp=6.2.1=h2e338ed_0 42 | - gsl=2.7=h93259b0_0 43 | - gst-plugins-base=1.20.3=hda0ba4b_0 44 | - gstreamer=1.20.3=hdc08c3f_0 45 | - icu=70.1=h96cf925_0 46 | - importlib-metadata=4.11.4=py310h2ec42d9_0 47 | - importlib_resources=5.8.0=pyhd8ed1ab_0 48 | - ipykernel=6.15.1=pyh736e0ef_0 49 | - ipython=8.4.0=py310h2ec42d9_0 50 | - ipython_genutils=0.2.0=py_1 51 | - ipywidgets=7.7.1=pyhd8ed1ab_0 52 | - jedi=0.18.1=py310h2ec42d9_1 53 | - jinja2=3.1.2=pyhd8ed1ab_1 54 | - jpeg=9e=hac89ed1_2 55 | - jsonschema=4.7.2=pyhd8ed1ab_0 56 | - jupyter=1.0.0=py310h2ec42d9_7 57 | - jupyter_client=7.3.4=pyhd8ed1ab_0 58 | - jupyter_console=6.4.4=pyhd8ed1ab_0 59 | - jupyter_core=4.10.0=py310h2ec42d9_0 60 | - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 61 | - jupyterlab_widgets=1.1.1=pyhd8ed1ab_0 62 | - kiwisolver=1.4.3=py310habb735a_0 63 | - krb5=1.19.3=hb49756b_0 64 | - lcms2=2.12=h577c468_0 65 | - lerc=3.0=he49afe7_0 66 | - libblas=3.9.0=15_osx64_openblas 67 | - libbrotlicommon=1.0.9=h5eb16cf_7 68 | - libbrotlidec=1.0.9=h5eb16cf_7 69 | - libbrotlienc=1.0.9=h5eb16cf_7 70 | - libcblas=3.9.0=15_osx64_openblas 71 | - libclang=14.0.6=default_h55ffa42_0 72 | - libclang13=14.0.6=default_hb5731bd_0 73 | - libcxx=14.0.6=hce7ea42_0 74 | - 
libdeflate=1.12=hac89ed1_0 75 | - libedit=3.1.20191231=h0678c8f_2 76 | - libffi=3.4.2=h0d85af4_5 77 | - libgfortran=5.0.0=9_3_0_h6c81a4c_23 78 | - libgfortran5=9.3.0=h6c81a4c_23 79 | - libglib=2.72.1=hfbcb929_0 80 | - libiconv=1.16=haf1e3a3_0 81 | - liblapack=3.9.0=15_osx64_openblas 82 | - libllvm14=14.0.6=h41df66c_0 83 | - libogg=1.3.4=h35c211d_1 84 | - libopenblas=0.3.20=openmp_hb3cd9ec_0 85 | - libopus=1.3.1=hc929b4f_1 86 | - libpng=1.6.37=h5a3d3bf_3 87 | - libpq=14.4=hf6bb32a_0 88 | - libsodium=1.0.18=hbcb3906_1 89 | - libtiff=4.4.0=h9847915_1 90 | - libvorbis=1.3.7=h046ec9c_0 91 | - libwebp=1.2.2=h28dabe5_0 92 | - libwebp-base=1.2.2=h0d85af4_1 93 | - libxcb=1.13=h0d85af4_1004 94 | - libzlib=1.2.12=hfe4f2af_2 95 | - llvm-openmp=14.0.4=ha654fa7_0 96 | - lz4-c=1.9.3=he49afe7_1 97 | - markupsafe=2.1.1=py310h1961e1f_1 98 | - matplotlib=3.5.2=py310h2ec42d9_0 99 | - matplotlib-base=3.5.2=py310h4510841_0 100 | - matplotlib-inline=0.1.3=pyhd8ed1ab_0 101 | - metis=5.1.0=h2e338ed_1006 102 | - mistune=0.8.4=py310he24745e_1005 103 | - mpfr=4.1.0=h0f52abe_1 104 | - munkres=1.1.4=pyh9f0ad1d_0 105 | - muscle=5.1=hb339e23_1 106 | - mysql-common=8.0.29=h924029e_1 107 | - mysql-libs=8.0.29=h3cab752_1 108 | - nbclient=0.6.6=pyhd8ed1ab_0 109 | - nbconvert=6.5.0=pyhd8ed1ab_0 110 | - nbconvert-core=6.5.0=pyhd8ed1ab_0 111 | - nbconvert-pandoc=6.5.0=pyhd8ed1ab_0 112 | - nbformat=5.4.0=pyhd8ed1ab_0 113 | - ncurses=6.3=h96cf925_1 114 | - nest-asyncio=1.5.5=pyhd8ed1ab_0 115 | - notebook=6.4.12=pyha770c72_0 116 | - nspr=4.32=hcd9eead_1 117 | - nss=3.78=ha8197d3_0 118 | - numpy=1.23.1=py310ha3f357c_0 119 | - openjpeg=2.4.0=h6e7aa92_1 120 | - openssl=1.1.1u=h8a1eda9_0 121 | - packaging=21.3=pyhd8ed1ab_0 122 | - pandas=2.0.0=py310hecf8f37_0 123 | - pandoc=2.18=h694c41f_0 124 | - pandocfilters=1.5.0=pyhd8ed1ab_0 125 | - parso=0.8.3=pyhd8ed1ab_0 126 | - patsy=0.5.3=pyhd8ed1ab_0 127 | - pcre=8.45=he49afe7_0 128 | - pexpect=4.8.0=pyh9f0ad1d_2 129 | - pickleshare=0.7.5=py_1003 130 | - pillow=9.2.0=py310hb3240ae_0 131 | - ply=3.11=py_1 132 | - prometheus_client=0.14.1=pyhd8ed1ab_0 133 | - prompt-toolkit=3.0.30=pyha770c72_0 134 | - prompt_toolkit=3.0.30=hd8ed1ab_0 135 | - psutil=5.9.1=py310h6c45266_0 136 | - pthread-stubs=0.4=hc929b4f_1001 137 | - ptyprocess=0.7.0=pyhd3deb0d_0 138 | - pure_eval=0.2.2=pyhd8ed1ab_0 139 | - pycparser=2.21=pyhd8ed1ab_0 140 | - pygments=2.12.0=pyhd8ed1ab_0 141 | - pyparsing=3.0.9=pyhd8ed1ab_0 142 | - pyqt=5.15.7=py310h57cebac_0 143 | - pyrsistent=0.18.1=py310h1961e1f_1 144 | - python=3.10.5=hdaaf3db_0_cpython 145 | - python-dateutil=2.8.2=pyhd8ed1ab_0 146 | - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 147 | - python-tzdata=2023.3=pyhd8ed1ab_0 148 | - python_abi=3.10=2_cp310 149 | - pytz=2023.3=pyhd8ed1ab_0 150 | - pyzmq=23.2.0=py310h85fb675_0 151 | - qt-main=5.15.4=h938c29d_2 152 | - qtconsole=5.3.1=pyhd8ed1ab_0 153 | - qtconsole-base=5.3.1=pyha770c72_0 154 | - qtpy=2.1.0=pyhd8ed1ab_0 155 | - readline=8.1.2=h3899abd_0 156 | - scipy=1.8.1=py310h1f9c157_0 157 | - seaborn=0.12.2=hd8ed1ab_0 158 | - seaborn-base=0.12.2=pyhd8ed1ab_0 159 | - send2trash=1.8.0=pyhd8ed1ab_0 160 | - setuptools=63.1.0=py310h2ec42d9_0 161 | - sip=6.6.2=py310hd4537e4_0 162 | - six=1.16.0=pyh6c4a22f_0 163 | - soupsieve=2.3.1=pyhd8ed1ab_0 164 | - sqlite=3.39.0=hd9f0692_0 165 | - stack_data=0.3.0=pyhd8ed1ab_0 166 | - statsmodels=0.14.0=py310hc1335a1_1 167 | - suitesparse=5.10.1=h7aff33d_1 168 | - tbb=2021.5.0=hbb4e6a2_1 169 | - terminado=0.15.0=py310h2ec42d9_0 170 | - tinycss2=1.1.1=pyhd8ed1ab_0 171 | - tk=8.6.12=h5dbffcc_0 172 
| - toml=0.10.2=pyhd8ed1ab_0
173 | - tornado=6.2=py310h6c45266_0
174 | - traitlets=5.3.0=pyhd8ed1ab_0
175 | - typing_extensions=4.6.3=pyha770c72_0
176 | - tzdata=2022a=h191b570_0
177 | - unicodedata2=14.0.0=py310h1961e1f_1
178 | - wcwidth=0.2.5=pyh9f0ad1d_2
179 | - webencodings=0.5.1=py_1
180 | - wheel=0.37.1=pyhd8ed1ab_0
181 | - widgetsnbextension=3.6.1=pyha770c72_0
182 | - xorg-libxau=1.0.9=h35c211d_0
183 | - xorg-libxdmcp=1.1.3=h35c211d_0
184 | - xz=5.2.5=haf1e3a3_1
185 | - zeromq=4.3.4=he49afe7_1
186 | - zipp=3.8.0=pyhd8ed1ab_0
187 | - zlib=1.2.12=hfe4f2af_2
188 | - zstd=1.5.2=ha9df2e0_2
189 | - pip:
190 |   - matching==1.4
191 |   - pip==22.1.2
192 |   - pyqt5-sip==12.11.0
193 | prefix: ~/opt/anaconda3/envs/PDBCleanV2

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PDBCleanV2

With PDBCleanV2, users can create their own self-consistent structure dataset, enabling more straightforward comparison among structures. The library creates separate files for each biological assembly present in a structure file and standardizes chain names and numbering. Our goal is to provide researchers with a consistent dataset that facilitates their analysis.

## Table of contents

* [PDBCleanV2 workflow and tutorial](#pdbcleanv2-workflow)
* [Other tools](#other-tools)
* [Installation](#installation)
* [PDBClean team](#pdbclean-team)

## PDBCleanV2 Workflow

We have created Jupyter Notebooks that provide a step-by-step guide for creating a curated ensemble of structures using PDBCleanV2.

![Workflow flowchart](./images/FlowChart.png)

### [Step 1. Download structural ensemble from RCSB PDB.](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step1.DownloadStructuralEnsembleFromRCSBPDB.ipynb)

Download all structures that match the name and sequence of your molecule of interest.

> **Note:** This notebook sometimes does not display on the GitHub website; download it and open it in your browser.

### [Step 2. Clean Structures and Create one CIF file per biological assembly.](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb)

A CIF file may contain multiple biological assemblies within one asymmetric unit. In this step we separate these biological assemblies and create one CIF file for each. We also reduce the number of data blocks included in the CIF file.

### [Step 3.1. Assign MolID to the entities found in the CIF files, version 1](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb)

The script goes over all the CIF files and collects every entity; the user decides which MolID to assign to each. In this example, we give a different ID to each entity found. This step is also important because it lists all the entities found in your ensemble, which lets you spot any structure that doesn't belong. We show an example of this in the notebook.

### [Step 3.2. Assign MolID to the entities found in the CIF files, version 2](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb)

Same as Step 3.1, except that in this example we give the same MolID to different entities. You may want to do this, for example, to give the same MolID to all ligands or all water molecules. Doing this triggers a concatenation menu, which we show how to use.

### [Step 3.3. Assign MolID to the entities found in the CIF files, version 3](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step3.3AssignMolIDToEntitiesFoundInCIFfiles3.ipynb)

In this notebook we show how to perform concatenations and conversions using a conversion file (useful when a structure contains many entities). We also show an option that lets users keep a record of the changes introduced in this step (old chain names, new chain names, entity names, and file names).
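All three Step 3 variants drive the same helper functions that `scripts/PDBClean_MolID_CIF.py` calls. A minimal, non-interactive sketch of the entity-collection stage (the project path is an example):

```python
import glob
from PDBClean import pdbcleanmolidcifutils as molidutils

# CIF files produced in Step 2; the directory name is an example.
filelist = glob.glob('./TIM/simple_bank/*.cif')

# One MolID record per file, then a map of every unique entity
# to how often it occurs across the ensemble.
master_molID_class_list = molidutils.pdb_to_masterlist(filelist)
unique_molID_occur_map = molidutils.CreateMasterUniqueMolIDMap(master_molID_class_list)
molIDConversion_list = molidutils.uniquelist_to_conversionlist(unique_molID_occur_map)
```

Each `MolIDConversion` object in the resulting list is what the interactive menu asks you to complete.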
### [Step 4. Chain ID standardization](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step4.ChainIDStandardization.ipynb)

Step 4 lets us give each entity whatever name we want, and makes sure that chains that are the same (we use sequence alignment to determine similarity) have a consistent name across CIF files. Entities/chains are sometimes mislabeled in deposited structures; this step is recommended for identifying such cases. It can also be used to identify possible outliers, by checking how every chain scores against the reference.

We divide the tutorial for this step into two parts. The second part shows how to generate the reference sequences and how to load them when running the script. This can also speed up the step, since it allows the script to be run in parallel batches, which is particularly important when working with large datasets or with molecules that have many chains.

### [Step 4.2 Chain ID standardization: generate reference sequences and how to load them](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step4.2.ChainIDStandardization.ipynb)

In this tutorial, we show how the reference sequence is selected by our script and how the user can modify it. We also show how to load the reference sequences, which makes it possible to run this step in parallel batches and speed up the whole process.

### [Step 5. Residue ID Standardization](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Step5.ResidueIDStandardization.ipynb)

Following Step 4, now that chain (entity) naming is consistent across all structures in the ensemble, we make sure that the numbering is also consistent, i.e. that the same residue position has the same number in all structures.

This is also the last step! You have a curated dataset!


> **Note:** There are more advanced curation steps and analyses that we will cover in future releases.

## Other tools

[Check project mini tutorial](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/CheckProject_CheckCreateDelete.ipynb). This mini tutorial can be run after doing Step 2. `check_project` checks whether a directory exists; if it does not, it creates the directory along with an `info.txt` file recording the creation date.

[Dataset Summary](https://github.com/fatipardo/PDBClean-0.0.2/blob/master/Notebooks/Analysis.SummaryPDBDataset.ipynb). This notebook can be run after doing step 0. It creates plots that summarize important information about your dataset, such as organism of origin, resolution, year, and method used to solve the structure. The notebook also builds a pandas dataframe so users can create their own personalized plots.
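Both tools build on the `pdbclean_io` helpers; the calls below are condensed from the Check Project mini tutorial:

```python
from PDBClean import pdbclean_io

PROJDIR = "./TIM"   # example project directory

# Create a subdirectory (writes an info.txt with the creation date),
# empty it, and finally delete it.
pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')
pdbclean_io.check_project(projdir=PROJDIR, action='clean',  level='standard_MolID_bank')
pdbclean_io.check_project(projdir=PROJDIR, action='delete', level='standard_MolID_bank')
```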
67 | 
68 | ## Installation
69 | 
70 | We recommend installing PDBClean inside a virtual environment. We provide an `environment.yml` file with the libraries you will need.
71 | We have tested the installation on macOS with Intel processors.
72 | Ensuring you have the pre-requisites will facilitate the installation process!
73 | 
74 | ### Pre-requisites
75 | 
76 | - Xcode, including Command Line Tools
77 | - Anaconda
78 | - An SSH key created and linked to your GitHub account
79 | - Jupyter Notebook
80 | - If running on a Mac with an M1/M2/M3 chip, install MUSCLE v5.1.0 with Homebrew, using the following command:
81 | > `brew install brewsci/bio/muscle`
82 | 
83 | ### Installation steps
84 | 
85 | 1. Download PDBClean from GitHub and install the environment from the YML file
86 | 
87 | >git clone git@github.com:fatipardo/PDBCleanV2
88 | 
89 | >cd PDBCleanV2
90 | 
91 | >conda config --remove channels defaults
92 | 
93 | If you are installing on a computer with an Intel chip, use the following command:
94 | 
95 | >conda env create -f environment.yml
96 | 
97 | If you are installing on a computer with an M1/M2/M3 chip, use the following command instead:
98 | 
99 | >conda env create -f environment_M1.yml
100 | 
101 | 2. Activate the environment and install PDBClean
102 | 
103 | >conda activate PDBCleanV2
104 | 
105 | >python setup.py install
106 | 
107 | 3. Install the Jupyter Notebook kernel
108 | 
109 | > python -m ipykernel install --user --name PDBCleanV2 --display-name PDBCleanV2
110 | 
111 | 
112 | 4. Run the notebooks:
113 | 
114 | > cd Notebooks
115 | 
116 | > jupyter notebook
117 | 
118 | - Open any notebook you would like to run.
119 | - If Jupyter does not recognize the kernel, select 'PDBCleanV2' from the drop-down menu.
120 | 
121 | 
122 | ## PDBClean team
123 | 
124 | The code in this repository is based on the code found [here](https://test.pypi.org/project/PDBClean/#files).
125 | The code was originally written by Frédéric Poitevin and Nicholas Corsepius.
126 | Fátima Pardo Avila and Liv Weiner created this repository. Paulina Cabral contributed to the code and documentation.
127 | We all worked on this project while being part of the Levitt Lab at Stanford University.
128 | 
--------------------------------------------------------------------------------
/scripts/PDBClean_MolID_CIF.py:
--------------------------------------------------------------------------------
1 | #!/Users/fatima/anaconda3/envs/PDBCleanV2/bin/python
2 | # coding: utf-8
3 | #
4 | # master_molID_class_list is very important:
5 | # it is the list that contains every file's MolID class.
6 | # molIDConversion_list is also important: it contains the
7 | # MolIDConversion objects that are updated by the user and evaluated
8 | # to determine when the next step in the program is unlocked.
9 | # A list of MolIDConversion objects is created using unique_molID_occur_map.
10 | 
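# Usage:
#   PDBClean_MolID_CIF.py <source_dir> <target_dir>
#
# The tutorial notebooks, for example, invoke it as:
#   PDBClean_MolID_CIF.py ./TIM/simple_bank_sub ./TIM/standard_MolID_bank
# and, when scripting the interactive menus, pipe the answers in:
#   echo '2\nQUIT' | PDBClean_MolID_CIF.py ./TIM/simple_bank_sub ./TIM/standard_MolID_bank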
11 | from __future__ import print_function
12 | import sys, glob
13 | from PDBClean import pdbcleanmolidcifutils as molidutils
14 | 
15 | 
16 | ########################
17 | # READ INPUT ARGUMENTS #
18 | ########################
19 | n_arg = len(sys.argv)
20 | if(n_arg<3):
21 |     print('Usage error: {0} <source_dir> <target_dir>'.format(sys.argv[0]))
22 |     sys.exit()
23 | source_dir = sys.argv[1]
24 | target_dir = sys.argv[2]
25 | 
26 | 
27 | #########################################
28 | # READ PDB FILES AND DEFINE MolID LISTS #
29 | #########################################
30 | 
31 | filelist=glob.glob(source_dir+'/*.cif')
32 | master_molID_class_list = molidutils.pdb_to_masterlist(filelist)
33 | unique_molID_occur_map = molidutils.CreateMasterUniqueMolIDMap(master_molID_class_list)
34 | molIDConversion_list = molidutils.uniquelist_to_conversionlist(unique_molID_occur_map)
35 | #FAPA MARCH 2024
36 | MolID_to_files_map = molidutils.CreateMasterUniqueMolIDMapWithFileName(master_molID_class_list)
37 | MolID_occur_dict_of_lists = molidutils.CreateMasterUniqueMolIDOccursLIST(master_molID_class_list)
38 | MolID_ChainID_dict_of_lists = molidutils.CreateMasterUniqueMolIDinitialChainIDsLIST(master_molID_class_list)
39 | 
40 | #####################################
41 | # INTERACTIVE MOLID CONVERSION MENU #
42 | #####################################
43 | # Goal:
44 | # Users complete their MolID conversion templates by ensuring that each member of
45 | # molIDConversion_list has status complete = True.
46 | input_menu = ""
47 | input_menu_complete = ""
48 | # For use in the next section
49 | concat_menu = ""
50 | final_menu = ""
51 | 
52 | while(input_menu != "QUIT"):
53 |     if (input_menu_complete == "1"):
54 |         print("""Congratulations! You have successfully constructed your
55 |               conversion templates. 
You can proceed to the next section 56 | by selection option 7 or, continue to edit your conversion 57 | template through this menu 58 | """) 59 | print("""PDBClean MolID Conversion Build Menu 60 | Select one of the following options to proceed: 61 | 1) Show full conversion 62 | 2) Show only unassigned conversions 63 | 3) Enter input file 64 | 4) Search MolID to add chain ID conversion 65 | 5) Go entry by entry to add chain ID conversion 66 | 6) Remove a chain ID conversion 67 | A) Track changes (original_chain_name:new_chain:entity:file_name) 68 | """) 69 | if (input_menu_complete == "1"): 70 | print(" 7) Continue to next step of curation") 71 | input_menu = input('Option Number: ') 72 | if (input_menu == "1"): 73 | molidutils.show_full_conversion(molIDConversion_list) 74 | elif (input_menu == "2"): 75 | molidutils.show_unassigned_conversion(molIDConversion_list) 76 | elif (input_menu == "3"): 77 | molIDConversion_list = molidutils.add_user_conversion(molIDConversion_list) 78 | elif (input_menu == "4"): 79 | molIDConversion_list = molidutils.edit_conversion_interface(molIDConversion_list, action='add')#FAPA 80 | elif (input_menu == "5"): 81 | molIDConversion_list = molidutils.edit_conversion_manual(molIDConversion_list) 82 | elif (input_menu == "6"): 83 | molIDConversion_list = molidutils.edit_conversion_interface(molIDConversion_list, action='remove') 84 | elif (input_menu == "B"): # SECRET MENU: Print entity:file_name list 85 | molidutils.Print_MolID_To_Files_Map(MolID_to_files_map,target_dir) 86 | elif (input_menu == "C"): # SECRET MENU Print CHAIN-NAME:ENTITY:FILE-NAME 87 | molidutils.show_full_conversion_and_file_list(molIDConversion_list,MolID_to_files_map,target_dir) 88 | elif (input_menu == "D"): # SECRET MENU Print similar to C but print only relevant chain names 89 | molidutils.show_full_conversion_and_file_list_by_number_chains(molIDConversion_list,MolID_to_files_map,MolID_occur_dict_of_lists,target_dir) 90 | elif (input_menu == "A"): 91 | molidutils.TEST_show_full_conversion_and_file_list_by_number_chains(MolID_ChainID_dict_of_lists,molIDConversion_list, MolID_to_files_map, 92 | MolID_occur_dict_of_lists, target_dir) 93 | elif (input_menu == "7"): 94 | if (input_menu_complete == "1"): 95 | master_molID_class_list = molidutils.update_masterlist(master_molID_class_list, molIDConversion_list) 96 | count_problems = molidutils.problem_counter(master_molID_class_list) 97 | if (count_problems == 0): 98 | final_menu = "START" 99 | elif (count_problems != 0): 100 | concat_menu = "START" 101 | input_menu = "QUIT" 102 | input_menu_complete = molidutils.check_complete(molIDConversion_list) 103 | 104 | ######################################### 105 | # New menu to finalize without printing # 106 | # concatenation menu # 107 | ######################################### 108 | 109 | if (final_menu == "START"): 110 | 111 | count_problems = molidutils.problem_counter(master_molID_class_list) 112 | if (count_problems == 0): 113 | final_menu_complete = "1" 114 | 115 | if (final_menu_complete == "1"): 116 | print(" 6) Finalize Curation") 117 | 118 | final_menu = input('Option Number: ') 119 | 120 | if (final_menu == "6"): 121 | print("Finalizing Curation ...") 122 | molidutils.masterlist_to_pdb(filelist, master_molID_class_list, target_dir=target_dir) 123 | final_menu = "QUIT" 124 | else: 125 | print("Sorry, something went wrong, try again") 126 | 127 | 128 | 129 | ######################################## 130 | # INTERACTIVE MOLID CONCATENATION MENU # 131 | ######################################## 
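# This section is only reached when concat_menu was set to "START" above,
# i.e. when the planned conversion assigns the same chain name to two or
# more entities. Each implied concatenation must be reviewed and accepted
# here before the curated files can be written out.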
132 | 133 | 134 | if (concat_menu == "START"): 135 | # Prepare for concatenation step 136 | # We now have to take the information contained in the MolIDConversion objects 137 | # in molIDConversion_list to update the MolID objects in master_molID_class_list 138 | # We then need to mine these updated MolID objects to figure out which ones 139 | # contain concatenated chains. These will be presented to the user in another 140 | # interactive menu section where they can update the planned conversion on 141 | # a file by file basis 142 | 143 | master_molID_class_list = molidutils.update_masterlist(master_molID_class_list, molIDConversion_list) 144 | 145 | concat_menu = "" 146 | concat_menu_complete = "" 147 | 148 | while(concat_menu != "QUIT"): 149 | 150 | count_problems = molidutils.problem_counter(master_molID_class_list) 151 | if (count_problems == 0): 152 | concat_menu_complete = "1" 153 | 154 | if (concat_menu_complete == "1"): 155 | print("""Congratulations! You have successfully constructed your 156 | conversion templates.You can proceed to the next section 157 | by selection option 6 or, continue to edit your conversion 158 | template through this menu 159 | """) 160 | print("""PDBClean Concatenations Menu 161 | ------------------------------- 162 | This menu appeared because you have assigned the same chain name to two (or more) entities. 163 | Note that this will concatenate the entities. So you need to either re-assign chain names, 164 | or ACCEPT concatenations. 165 | 166 | Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can 167 | be completed. 168 | 169 | Before you do anything, we suggest to choose option 2, so you know which concatenations have not 170 | been accepted. It will also give you the proper format of the input for option 3. 171 | 172 | If you are sure that all the concatenations are correct. Option 5 will accept all of them. They 173 | will be printed to screen as they are being accepted. 174 | 175 | Select one of the following options to proceed: 176 | 1) Show all conversions 177 | 2) Show only unaccepted concatenations 178 | 3) Search and modify destination chainIDs of proposed concatenations 179 | 4) Accept proposed concatenation one by one 180 | (Repeat this step until finalizing option appears) 181 | 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations) 182 | """) 183 | if (concat_menu_complete == "1"): 184 | print(" 6) Finalize Curation") 185 | 186 | concat_menu = input('Option Number: ') 187 | 188 | if (concat_menu == "1"): 189 | molidutils.show_full_conversion(master_molID_class_list, step='concatenation') 190 | elif (concat_menu == "2"): 191 | molidutils.show_unassigned_conversion(master_molID_class_list, step='concatenation') 192 | elif (concat_menu == "3"): 193 | master_molID_class_list = molidutils.edit_concatenation_interface(master_molID_class_list, action='try')[0] 194 | elif (concat_menu == "4"): 195 | unassigned_MolID=molidutils.return_unassigned_conversion(master_molID_class_list, step='concatenation')[0] 196 | print("This is the concatenation you need to accept:") 197 | new_order=None 198 | master_molID_class_list = molidutils.list_accept_concatenations(master_molID_class_list, unassigned_MolID, new_order=new_order, action='accept')[0] 199 | # Note for tomorrow: here we need to create a new function in molidutils, so we can go over all concatenations! 
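    # Option 5 below automates what option 4 does one entry at a time: it
    # fetches every still-unaccepted concatenation, accepts each of them with
    # list_accept_concatenations_auto, and then prints the number of
    # remaining problems.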
200 | elif (concat_menu == "5"): 201 | allnewchains=molidutils.return_unassigned_conversion(master_molID_class_list, step='concatenation') 202 | for newchain in allnewchains: 203 | new_order=None 204 | master_molID_class_list = molidutils.list_accept_concatenations_auto(master_molID_class_list, newchain, new_order=new_order, action='accept')[0] 205 | count_problems = molidutils.problem_counter(master_molID_class_list) 206 | print(count_problems) 207 | elif (concat_menu == "6"): 208 | print("Finalizing Curation ...") 209 | molidutils.masterlist_to_pdb(filelist, master_molID_class_list, target_dir=target_dir) 210 | concat_menu = "QUIT" 211 | -------------------------------------------------------------------------------- /src/cleanutils.py: -------------------------------------------------------------------------------- 1 | import os, glob 2 | import re 3 | from Bio.PDB.MMCIF2Dict import MMCIF2Dict 4 | 5 | def process(projdir=None, step='clean', source='raw_bank', target='clean_bank', pdbformat='.cif', verbose=True): 6 | """ 7 | Processes all CIF files in the source directory through one of the processing steps, 8 | and then saves the results to the target directory. The specified steps include, 'clean' and 'simplify' 9 | 10 | Parameters: 11 | ----------- 12 | projdir : str, optional 13 | The project directory containing source and target subdirectories. If none, the current directory is used. 14 | step : str, optional 15 | The processing step to apply to each CIF(s). If none, the current step is 'clean', which rewrites the CIF(s) 16 | including only a limited set of data blocks. 17 | source : str, optional 18 | The subdirectory within the project directory where the raw CIF(s) are in. The default is the 19 | subdirectory titled 'raw_bank' 20 | target : str, optional 21 | The subdirectory within the project directory where processed CIF(s) will be saved. The default is the 22 | subdirectory titled 'clean_bank' 23 | pdbformat : str, optional 24 | The file extension format for CIF(s). Thr default is '.cif'. 25 | verbose : bool, optional 26 | If True, progress is printed to the console. Default is true. 27 | 28 | Returns: 29 | ----------- 30 | None 31 | """ 32 | if projdir is not None: 33 | source_dir = projdir+'/'+source 34 | target_dir = projdir+'/'+target 35 | input_list = glob.glob(source_dir+'/*'+pdbformat) 36 | i=0 37 | 38 | for input_cif in input_list: 39 | cif_name=os.path.basename(input_cif) 40 | 41 | if verbose: 42 | i+=1 43 | print('[{0}/{1}]: {2}'.format(i,len(input_list),cif_name)) 44 | 45 | output_cif=target_dir+'/'+cif_name 46 | 47 | if(step=='clean'): 48 | if os.path.isfile(output_cif): 49 | os.remove(output_cif) 50 | clean_cif(input_cif, output_cif) 51 | 52 | elif(step=='simplify'): 53 | # missing line: remove all assembly cif already created 54 | simplify_cif(input_cif, output_cif, pdbformat) 55 | 56 | def simplify_cif(oldfile, newfile, pdbformat): 57 | """ 58 | Separate all biological assemblies in a CIF into separate files. 59 | 60 | Parameters: 61 | ----------- 62 | oldfile : str 63 | Path to the original CIF(s) needed to be simplified. 64 | newfile: str 65 | Path where the new, simplified CIF(s) will be saved. The function creates multiple files if there are 66 | more than one biological assemblies. 67 | pdbformat : str 68 | The file format extension used when saving the new CIF. 
69 | 70 | Returns: 71 | ----------- 72 | None 73 | """ 74 | mmcif_dict = MMCIF2Dict(oldfile) 75 | 76 | # Create map from asym_id to assembly_id 77 | # Convert assembly_id to a list, as it can be either a string or a list 78 | 79 | asym_assembly_map = {} 80 | assembly_id = mmcif_dict['_pdbx_struct_assembly_gen.assembly_id'] 81 | 82 | if not isinstance(assembly_id, list): 83 | assembly_id_list = [] 84 | asym_id_list = [] 85 | assembly_id_list.append(assembly_id) 86 | asym_id_list.append(mmcif_dict['_pdbx_struct_assembly_gen.asym_id_list']) 87 | 88 | else: 89 | assembly_id_list = assembly_id 90 | asym_id_list = mmcif_dict['_pdbx_struct_assembly_gen.asym_id_list'] 91 | 92 | # Convert asym_id entry into a list of asym_ids 93 | for i in range(len(assembly_id_list)): 94 | asym_id = asym_id_list[i] 95 | asym_id = asym_id.strip() 96 | asym_id = re.sub(' ', '', asym_id) 97 | asym_id = asym_id.split(',') 98 | for ident in asym_id: 99 | asym_assembly_map[ident] = assembly_id_list[i] 100 | 101 | for assembly in assembly_id_list: 102 | 103 | if (len(assembly_id_list)==1): 104 | newciffilename = str(re.sub(pdbformat, '', newfile))+"+00" 105 | else: 106 | newciffilename = str(re.sub(pdbformat, '', newfile))+"+0"+str(assembly) 107 | 108 | newciffile = open(newciffilename+pdbformat, 'w') 109 | newciffile.write("data_"+newciffilename+"\n") 110 | 111 | # Writes entry.id 112 | newciffile.write("#\n") 113 | 114 | # Changes the list format to str 115 | L = str(mmcif_dict['_entry.id']) 116 | entryid = '_entry.id ' + L 117 | newciffile.write(entryid + "\n") 118 | 119 | # Write Audit category 120 | newciffile.write("#\n") 121 | newciffile.write("loop_\n") 122 | newciffile.write("_citation_author.name\n") 123 | 124 | if '_citation_author.name' in mmcif_dict: 125 | L = mmcif_dict['_citation_author.name'] 126 | else: 127 | L = "???" 128 | 129 | 130 | if isinstance(L, list): 131 | for i in L: 132 | newciffile.write("'" + re.sub("'", "", i) + "'" + "\n") 133 | else: 134 | newciffile.write("'" + re.sub("'", "", L) + "'" + "\n") 135 | 136 | # Writes Citation category 137 | newciffile.write("#" + "\n") 138 | newciffile.write("loop_" + "\n") 139 | newciffile.write("_citation.title" + "\n") 140 | newciffile.write("_citation.year" + "\n") 141 | newciffile.write("_citation.pdbx_database_id_DOI" + "\n") 142 | L1 = mmcif_dict['_citation.title'] 143 | L2 = mmcif_dict['_citation.year'] 144 | L3 = mmcif_dict['_citation.pdbx_database_id_DOI'] 145 | if isinstance(L1, list): 146 | for i in range(len(L1)): 147 | newciffile.write("'" + re.sub("\n"," ",re.sub("'", "", L1[i])) + "' " + L2[i] + " " + L3[i] + "\n") #FAPA 148 | else: 149 | newciffile.write("'" + re.sub("\n"," ",re.sub("'", "", L1[i])) + "' " + L2[i] + " " + L3[i] + "\n") #FAPA 150 | 151 | # Writes Resolution category 152 | newciffile.write("#" + "\n") 153 | newciffile.write("loop_" + "\n") 154 | newciffile.write("_exptl.method" + "\n") 155 | newciffile.write("_exptl.resolution" + "\n") 156 | if '_exptl.method' in mmcif_dict: 157 | L1 = mmcif_dict['_exptl.method'] 158 | elif '_refine_hist.pdbx_refine_id' in mmcif_dict: 159 | L1 = mmcif_dict['_refine_hist.pdbx_refine_id'] 160 | else: 161 | L1 = mmcif_dict['_refine.pdbx_refine_id'] 162 | if '_refine.ls_d_res_high' in mmcif_dict: 163 | L2 = mmcif_dict['_refine.ls_d_res_high'] 164 | elif '_em_3d_reconstruction.resolution' in mmcif_dict: 165 | L2 = mmcif_dict['_em_3d_reconstruction.resolution'] 166 | elif '_refine_hist.d_res_high' in mmcif_dict: 167 | L2 = mmcif_dict['_refine_hist.d_res_high'] 168 | else: 169 | L2 = '????' 
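        # MMCIF2Dict returns a plain string when a category has a single row
        # and a list when it comes from a loop_, so the isinstance() branches
        # below cover every combination of the method (L1) and resolution
        # (L2) values being scalar or list.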
170 | if isinstance(L1, list) and isinstance(L2, list): 171 | for i in range(len(L1)): 172 | newciffile.write("'" + L1[i] + "' " + L2[i] + " " + "\n") 173 | elif isinstance(L1, list) and not isinstance(L2,list): 174 | newciffile.write("'" + L1[0] + "' " + L2 + " " + "\n") 175 | elif not isinstance(L1,list) and isinstance(L2,list): 176 | newciffile.write("'" + L1 + "' " + L2[0] + " " + "\n") 177 | else: 178 | newciffile.write("'" + L1 + "' " + L2 + " " + "\n") 179 | 180 | # Writes Entity category 181 | newciffile.write("#" + "\n") 182 | newciffile.write("loop_" + "\n") 183 | newciffile.write("_entity.id" + "\n") 184 | newciffile.write("_entity.pdbx_description" + "\n") 185 | L1 = mmcif_dict['_entity.id'] 186 | L2 = mmcif_dict['_entity.pdbx_description'] 187 | for i in range(len(L1)): 188 | L2[i] = L2[i].upper() 189 | L2[i] = L2[i].replace(":", "") 190 | newciffile.write(L1[i] + " '" + L2[i].replace("'", "") + "'\n") 191 | 192 | # Writes the coordinate portion of the file 193 | newciffile.write("#" + "\n") 194 | newciffile.write("loop_" + "\n") 195 | newciffile.write("_atom_site.group_PDB" + "\n") 196 | newciffile.write("_atom_site.id" + "\n") 197 | newciffile.write("_atom_site.type_symbol" + "\n") 198 | newciffile.write("_atom_site.label_atom_id" + "\n") 199 | newciffile.write("_atom_site.label_alt_id" + "\n") 200 | newciffile.write("_atom_site.label_comp_id" + "\n") 201 | newciffile.write("_atom_site.label_asym_id" + "\n") 202 | newciffile.write("_atom_site.label_entity_id" + "\n") 203 | newciffile.write("_atom_site.label_seq_id" + "\n") 204 | newciffile.write("_atom_site.pdbx_PDB_ins_code" + "\n") 205 | newciffile.write("_atom_site.Cartn_x" + "\n") 206 | newciffile.write("_atom_site.Cartn_y" + "\n") 207 | newciffile.write("_atom_site.Cartn_z" + "\n") 208 | newciffile.write("_atom_site.occupancy" + "\n") 209 | newciffile.write("_atom_site.B_iso_or_equiv" + "\n") 210 | newciffile.write("_atom_site.auth_seq_id" + "\n") 211 | newciffile.write("_atom_site.auth_comp_id" + "\n") 212 | newciffile.write("_atom_site.auth_asym_id" + "\n") 213 | newciffile.write("_atom_site.auth_atom_id" + "\n") 214 | newciffile.write("_atom_site.pdbx_PDB_model_num" + "\n") 215 | L1 = mmcif_dict['_atom_site.group_PDB'] 216 | L2 = mmcif_dict['_atom_site.id'] 217 | L3 = mmcif_dict['_atom_site.type_symbol'] 218 | L4 = mmcif_dict['_atom_site.label_atom_id'] 219 | L5 = mmcif_dict['_atom_site.label_alt_id'] 220 | L6 = mmcif_dict['_atom_site.label_comp_id'] 221 | L7 = mmcif_dict['_atom_site.label_asym_id'] 222 | L8 = mmcif_dict['_atom_site.label_entity_id'] 223 | L9 = mmcif_dict['_atom_site.label_seq_id'] 224 | L10 = mmcif_dict['_atom_site.pdbx_PDB_ins_code'] 225 | L11 = mmcif_dict['_atom_site.Cartn_x'] 226 | L12 = mmcif_dict['_atom_site.Cartn_y'] 227 | L13 = mmcif_dict['_atom_site.Cartn_z'] 228 | L14 = mmcif_dict['_atom_site.occupancy'] 229 | L15 = mmcif_dict['_atom_site.B_iso_or_equiv'] 230 | L16 = mmcif_dict['_atom_site.auth_seq_id'] 231 | L17 = mmcif_dict['_atom_site.auth_comp_id'] 232 | L18 = mmcif_dict['_atom_site.auth_asym_id'] 233 | L19 = mmcif_dict['_atom_site.auth_atom_id'] 234 | L20 = mmcif_dict['_atom_site.pdbx_PDB_model_num'] 235 | 236 | # This section is necessary to print the biological assemblies on separate files 237 | BioAssembly = mmcif_dict['_pdbx_struct_assembly_gen.asym_id_list'] 238 | 239 | for i in range(len(L1)): 240 | if (L7[i] in BioAssembly[int(assembly)-1].split(',')): 241 | newciffile.write(L1[i] + " " + L2[i] + " " + L3[i] + ' "' + L4[i] + '" ' + L5[i] + " " + L6[i] + " " + L7[i] + " " + L8[i] 
+ " " + L9[i] + " " + L10[i] + " " + L11[i] + " " + L12[i] + " " + L13[i] + " " + L14[i] + " " + L15[i] + " " + L16[i] + " " + L17[i] + " " + L18[i] + ' "' + L19[i] + '" ' + L20[i] + "\n") 242 | newciffile.write("#" + "\n") 243 | 244 | 245 | # 246 | def clean_cif(oldfile, newfile): 247 | """ 248 | Rewrites CIF, including only a limited set of data blocks. 249 | 250 | Parameters: 251 | ----------- 252 | oldfile : str 253 | The path to the original CIF(s) needed to be cleaned. 254 | newfile : str 255 | The path where the cleaned CIF(s) will be written. 256 | 257 | Returns: 258 | ----------- 259 | None 260 | """ 261 | entry_list = ['_entry.id', 262 | '_atom_site.group_PDB', 263 | '_citation_author.name', 264 | '_citation.title', 265 | '_pdbx_struct_assembly_gen.assembly_id', 266 | '_entity.pdbx_description', 267 | '_exptl.method', 268 | '_em_3d_reconstruction.resolution', 269 | '_refine_hist.pdbx_refine_id', 270 | '_refine.pdbx_refine_id'] 271 | keylength_list = [ 9, 272 | 20, 273 | 21, 274 | 15, 275 | 37, 276 | 24, 277 | 13, 278 | 32, 279 | 27, 280 | 22] 281 | with open(oldfile) as old_file: 282 | alllines = [] 283 | linecount = 0 284 | poundline = 0 285 | flag = 0 286 | for line in old_file: 287 | alllines.append(line) 288 | if linecount == 0: 289 | with open(newfile, 'a') as new_file: 290 | new_file.write(alllines[0]) 291 | for entry, keylength in zip(entry_list, keylength_list): 292 | flag = check_and_write_entry(entry, line, alllines, line[0:keylength], flag, range(poundline, linecount), newfile) 293 | if '#' in line[0]: 294 | poundline = linecount 295 | linecount += 1 296 | with open(newfile, 'a') as new_file: 297 | new_file.write('#\n') 298 | # 299 | def check_and_write_entry(entry, line, alllines, key, flag, linerange, newfile): 300 | """ 301 | Checks if a specific entry is present in the current line of a CIF and writes relevant lines to a new file. 302 | 303 | Parameters: 304 | ----------- 305 | entry : str 306 | The specific CIF entry to look for in the line (e.g., '_entry.id'). 307 | line : str 308 | The current line being read from the CIF. 309 | alllines : list 310 | A list of all lines read so far from the CIF. 311 | key : str 312 | The substring of the current line that is compared to the entry. 313 | flag : int 314 | A flag indicating whether the desired entry has been found (1 if found, 0 otherwise). 315 | linerange : range 316 | The range of lines from `alllines` to write to the new file if the entry is found. 317 | newfile : str 318 | The path to the new CIF where the relevant lines will be written. 319 | 320 | Returns: 321 | ----------- 322 | flag : int 323 | The new flag value indicating whether the entry was found or not. 324 | """ 325 | if entry in key: 326 | flag = 1 327 | elif (flag==1) and '#' in line[0]: 328 | with open(newfile, 'a') as new_file: 329 | for i in linerange: 330 | new_file.write(alllines[i]) 331 | flag=0 332 | return flag 333 | -------------------------------------------------------------------------------- /src/alignmentutils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | import os 4 | import time 5 | from Bio import SeqIO 6 | import numpy as np 7 | 8 | # AA Map from 3 letter amino acid id to 1 letter id 9 | # it also includes nucleic acids, including post-tranlational modifications, 10 | # which are mapped to ACTUG. 11 | def ResnConvert(resn): 12 | """ 13 | Converts the 3 letter amino acid id into a singular letter ID. 
14 | 15 | Parameters: 16 | ----------- 17 | resn : str 18 | The 3 letter amino acid id 19 | 20 | Returns: 21 | -------- 22 | ans : str 23 | The one-letter ID representing the 3 letter amino acid ID. If there's none, "X" is 24 | returned. 25 | 26 | """ 27 | AA = {} 28 | AA["UNK"] = "X" 29 | AA["ALA"] = "A" 30 | AA["ARG"] = "R" 31 | AA["ASN"] = "N" 32 | AA["ASP"] = "D" 33 | AA["CYS"] = "C" 34 | AA["GLN"] = "Q" 35 | AA["GLU"] = "E" 36 | AA["GLY"] = "G" 37 | AA["HIS"] = "H" 38 | AA["ILE"] = "I" 39 | AA["LEU"] = "L" 40 | AA["LYS"] = "K" 41 | AA["MET"] = "M" 42 | AA["PHE"] = "F" 43 | AA["PRO"] = "P" 44 | AA["SER"] = "S" 45 | AA["THR"] = "T" 46 | AA["TRP"] = "W" 47 | AA["TYR"] = "Y" 48 | AA["VAL"] = "V" 49 | AA["A"] = "A" 50 | AA["C"] = "C" 51 | AA["U"] = "U" 52 | AA["G"] = "G" 53 | AA["2MA"] = "A" 54 | AA["3AU"] = "U" 55 | AA["4AC"] = "C" 56 | AA["4OC"] = "C" 57 | AA["4SU"] = "U" 58 | AA["5MC"] = "C" 59 | AA["5MU"] = "U" 60 | AA["6IA"] = "A" 61 | AA["6MZ"] = "U" 62 | AA["7MG"] = "G" 63 | AA["8AN"] = "A" 64 | AA["CM0"] = "C" 65 | AA["G7M"] = "G" 66 | AA["H2U"] = "U" 67 | AA["MIA"] = "A" 68 | AA["OMC"] = "C" 69 | AA["OMG"] = "C" 70 | AA["PSU"] = "U" 71 | AA["QUO"] = "G" 72 | AA["T6A"] = "A" 73 | AA["U8U"] = "U" 74 | AA["YG"] = "G" 75 | 76 | if resn not in AA: 77 | ans = "X" 78 | else: 79 | ans = AA[resn] 80 | return ans 81 | # END AA Map from 3 letter amino acid id to 1 letter id 82 | 83 | def AlignSequences(sequence_vec): 84 | """ 85 | Takes a list of sequence strings and performs a MUSCLE alignment, 86 | outputting a vector of aligned sequence strings. 87 | 88 | Parameters: 89 | ----------- 90 | sequence_vec : list of str 91 | list of sequences to be aligned 92 | 93 | Returns: 94 | -------- 95 | aligned_seq : list of str 96 | A list containing the aligned sequences 97 | """ 98 | 99 | with open("Seq.fa", 'w') as newfafile: 100 | for seq in sequence_vec: 101 | newfafile.write("> Seq" + "\n") 102 | newfafile.write(seq + "\n") 103 | 104 | process=os.popen('muscle -align Seq.fa -output Seq.afa') 105 | 106 | process 107 | 108 | time.sleep(1) #FAPA 109 | 110 | #FAPA START 111 | while not os.path.exists("Seq.afa"): 112 | time.sleep(1) #FAPA, WAITING LESS TIME 113 | print("waiting...") 114 | 115 | while not os.path.getsize("Seq.afa") >= os.path.getsize("Seq.fa"): 116 | time.sleep(1) #FAPA, WAITING LESS TIME 117 | print("waiting even more...") 118 | 119 | #FAPA ENDS 120 | 121 | aligned_seq = [] 122 | with open("Seq.afa") as seqfile: 123 | seq = "" 124 | for line in seqfile: 125 | if (line[0] == ">"): 126 | if (seq != ""): 127 | aligned_seq.append(seq) 128 | seq = "" 129 | else: 130 | seq += line.strip() 131 | aligned_seq.append(seq) 132 | 133 | process.close() 134 | return (aligned_seq) 135 | # END AlignSequences 136 | 137 | def AlignSequences_v2(sequence_vec, file_name, this_chainsseq_list_ids): 138 | """ 139 | Takes a list of sequence strings and performs a MUSCLE alignment, outputting 140 | a vector of aligned sequence strings. 141 | 142 | Parameters: 143 | ----------- 144 | sequence_vec : list of str 145 | list of sequences to be aligned 146 | file_name : str 147 | Name given to FASTA file 148 | this_chainsseq_list_ids : list of str 149 | List of identifiers for each sequence which are also used as headers in the 150 | FASTA file. 151 | 152 | Returns: 153 | -------- 154 | aligned_seq_map : dict 155 | A dictionary where the keys are the sequence identifiers from `this_chainsseq_list_ids` 156 | and the values are the corresponding aligned sequence strings. 
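    Example:
    --------
    A minimal sketch (sequence values and IDs are illustrative; assumes the
    `muscle` binary is on the PATH, since the alignment is run through
    os.popen):

    >>> seqs = ["MKTAYIAKQR", "MKTAYIAKQW"]
    >>> ids  = ["1abc_A", "2xyz_A"]
    >>> aligned = AlignSequences_v2(seqs, "tmp_align", ids)  # doctest: +SKIP
    >>> sorted(aligned)                                      # doctest: +SKIP
    ['1abc_A', '2xyz_A']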
157 | """ 158 | # Takes a list of sequence strings and performs a MUSCLE alignment, outputting a vector of aligned sequence strings 159 | with open(file_name+".fa", 'w') as newfafile: 160 | i = 0 161 | for seq in sequence_vec: 162 | newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n") 163 | newfafile.write(seq + "\n") 164 | i += 1 165 | #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta" 166 | command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" #FAPA 26AUG2025 167 | 168 | process = os.popen(command) 169 | 170 | process 171 | 172 | #FAPA START 173 | while not os.path.exists(file_name+".fasta"): 174 | time.sleep(10) 175 | print("waiting...") 176 | 177 | while not os.path.getsize(file_name+".fasta") > 0: 178 | time.sleep(10) 179 | print("waiting even more...") 180 | 181 | aligned_seq_map = {} 182 | aligned_seq = [] 183 | seq = "" 184 | with open(file_name + ".fasta") as seqfile: 185 | for line in seqfile: 186 | if (line[0] == ">"): 187 | # Very first line 188 | if (seq == ""): 189 | line = line.strip() 190 | line = line.split() 191 | key = line[2] 192 | else: 193 | aligned_seq_map[key] = seq 194 | seq = "" 195 | line = line.strip() 196 | line = line.split() 197 | key = line[2] 198 | else: 199 | seq += line.strip() 200 | aligned_seq_map[key] = seq 201 | 202 | for item in (aligned_seq_map.keys()): 203 | aligned_seq.append(aligned_seq_map[item]) 204 | 205 | process.close() 206 | return (aligned_seq_map) 207 | 208 | # END AlignSequences 209 | 210 | # FAPA MAY TEST BEGIN 211 | 212 | def AlignSequences_v3(sequence_vec, file_name, this_chainsseq_list_ids): 213 | """ 214 | Takes a list of sequence strings and performs a MUSCLE alignment, 215 | outputting a vector of aligned sequence strings. This version checks 216 | if an alignment has already been provided, before running muscle, and 217 | in that case, just reads the existing alignment. 218 | 219 | Parameters: 220 | ----------- 221 | sequence_vec : list of str 222 | list of sequences to be aligned 223 | file_name : str 224 | Name given to FASTA file 225 | this_chainsseq_list_ids : list of str 226 | List of identifiers for each sequence which are also used as headers in the 227 | FASTA file. 228 | 229 | Returns: 230 | -------- 231 | aligned_seq_map : dict 232 | A dictionary where the keys are sequence identifiers from `this_chainsseq_list_ids` and 233 | the values are the corresponding aligned sequence strings. 
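    Example:
    --------
    A sketch of the reuse behaviour (inputs as in the example for
    AlignSequences_v2): when "tmp_align.fasta" already exists, MUSCLE is not
    run again and the existing file is simply parsed.

    >>> aligned = AlignSequences_v3(seqs, "tmp_align", ids)  # doctest: +SKIP
    Alignment already exists, so I will use that one!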
234 | """ 235 | if os.path.exists(file_name+".fasta") == False: 236 | 237 | with open(file_name+".fa", 'w') as newfafile: 238 | i = 0 239 | for seq in sequence_vec: 240 | newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n") 241 | newfafile.write(seq + "\n") 242 | i += 1 243 | 244 | #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta" 245 | command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" # FAPA 26AUG2025 246 | process = os.popen(command) 247 | process 248 | 249 | while not os.path.exists(file_name+".fasta"): 250 | time.sleep(10) 251 | print("waiting...") 252 | 253 | while not os.path.getsize(file_name+".fasta") > 0: 254 | time.sleep(10) 255 | print("waiting even more...") 256 | process.close() 257 | 258 | else: 259 | print("Alignment already exists, so I will use that one!") 260 | 261 | aligned_seq_map = {} 262 | aligned_seq = [] 263 | seq = "" 264 | with open(file_name + ".fasta") as seqfile: 265 | for line in seqfile: 266 | if (line[0] == ">"): 267 | # Very first line 268 | if (seq == ""): 269 | line = line.strip() 270 | line = line.split() 271 | key = line[2] 272 | else: 273 | aligned_seq_map[key] = seq 274 | seq = "" 275 | line = line.strip() 276 | line = line.split() 277 | key = line[2] 278 | else: 279 | seq += line.strip() 280 | aligned_seq_map[key] = seq 281 | 282 | for item in (aligned_seq_map.keys()): 283 | aligned_seq.append(aligned_seq_map[item]) 284 | 285 | 286 | return (aligned_seq_map) 287 | 288 | 289 | # FAPA JULY TEST STARTS HERE 290 | 291 | def AlignSequences_v4(sequence_vec, file_name, this_chainsseq_list_ids): 292 | """ 293 | Takes a list of sequence strings and performs a MUSCLE alignment, outputting 294 | a vector of aligned sequence strings. This version checks if an alignment has 295 | already been provided, before running muscle, and in that case just reads the 296 | existing alignment. 297 | 298 | Parameters: 299 | ----------- 300 | sequence_vec : list of str 301 | list containing sequences from FASTA files 302 | file_name : str 303 | Name given to FASTA file 304 | this_chainsseq_list_ids : list of str 305 | List of identifiers for each sequence which are also used as headers in the 306 | FASTA file. 307 | 308 | Returns: 309 | -------- 310 | aligned_seq_map : dict 311 | A dictionary where the keys are the sequence identifiers from `this_chainsseq_list_ids` 312 | and the values are the corresponding aligned sequence strings 313 | gap_percentages : np.ndarray 314 | An array where each element represents the percentage of gaps at that position 315 | across all sequences. 
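    Example:
    --------
    A sketch (inputs as in the examples above): both return values can be
    used together, for instance to keep only alignment columns that are
    occupied in most sequences.

    >>> aligned, gaps = AlignSequences_v4(seqs, "tmp_align", ids)    # doctest: +SKIP
    >>> dense_cols = [i for i, g in enumerate(gaps) if g < 50.0]     # doctest: +SKIP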
316 | """ 317 | if os.path.exists(file_name+".fasta") == False: 318 | 319 | with open(file_name+".fa", 'w') as newfafile: 320 | i = 0 321 | for seq in sequence_vec: 322 | newfafile.write("> Seq " + str(this_chainsseq_list_ids[i]) + "\n") 323 | newfafile.write(seq + "\n") 324 | i += 1 325 | 326 | #command = "muscle -align "+file_name+".fa -output "+file_name+".fasta" 327 | command = "muscle -super5 "+file_name+".fa -output "+file_name+".fasta" # FAPA 26AUG2025 328 | process = os.popen(command) 329 | process 330 | 331 | #FAPA START 332 | while not os.path.exists(file_name+".fasta"): 333 | time.sleep(10) 334 | print("waiting...") 335 | 336 | while not os.path.getsize(file_name+".fasta") > 0: 337 | time.sleep(10) 338 | print("waiting even more...") 339 | 340 | #FAPA ENDS 341 | process.close() 342 | 343 | else: 344 | print("Alignment already exists, so I will use that one!") 345 | 346 | aligned_seq_map = {} 347 | aligned_seq = [] 348 | seq = "" 349 | with open(file_name + ".fasta") as seqfile: 350 | for line in seqfile: 351 | if (line[0] == ">"): 352 | # Very first line 353 | if (seq == ""): 354 | line = line.strip() 355 | line = line.split() 356 | key = line[2] 357 | else: 358 | aligned_seq_map[key] = seq 359 | seq = "" 360 | line = line.strip() 361 | line = line.split() 362 | key = line[2] 363 | else: 364 | seq += line.strip() 365 | aligned_seq_map[key] = seq 366 | 367 | for item in (aligned_seq_map.keys()): 368 | aligned_seq.append(aligned_seq_map[item]) 369 | 370 | print(file_name) 371 | 372 | sequences = read_fasta_files( file_name + ".fasta") 373 | gap_percentages = calculate_gap_percentages(sequences) 374 | 375 | #print(aligned_seq_map) 376 | print("Gap percentages per position:") 377 | print(gap_percentages) 378 | 379 | return (aligned_seq_map,gap_percentages) 380 | 381 | # The functions below are used to calculate the percentage of gaps per position 382 | def read_fasta_files(fasta_file): 383 | """ 384 | Reads FASTA files and extracts the sequences into a list. 385 | 386 | Parameters: 387 | ----------- 388 | fasta_file : str 389 | Path to FASTA file containing all the sequences 390 | 391 | Returns: 392 | -------- 393 | sequences : list of str 394 | list of sequences from a FASTA file 395 | """ 396 | sequences = [] 397 | for record in SeqIO.parse(fasta_file, "fasta"): 398 | sequences.append(str(record.seq)) 399 | return sequences 400 | 401 | def calculate_gap_percentages(sequences): 402 | """ 403 | Calculates the percentage of gaps at each position in a list of sequences. 404 | 405 | Parameters: 406 | ----------- 407 | sequences : list of str 408 | list of sequences extracted from a FASTA file 409 | 410 | Returns: 411 | -------- 412 | gap_percentages : np.ndarray 413 | An array where each element represents the percentage of gaps at that position 414 | across all sequences. 415 | """ 416 | sequence_length = len(sequences[0]) 417 | gap_counts = np.zeros(sequence_length) 418 | 419 | for sequence in sequences: 420 | for i, char in enumerate(sequence): 421 | if char == '-': 422 | gap_counts[i] += 1 423 | 424 | total_sequences = len(sequences) 425 | gap_percentages = (gap_counts / total_sequences) * 100 426 | return gap_percentages 427 | 428 | # FAPA JULY TEST ENDS 429 | 430 | def ScoreSequenceAlignment(seq1, seq2): 431 | """ 432 | Compares the reference sequence to another sequence and counts for similarity based on 433 | exact matches between corresponding elements from the two sequences. 
434 | 435 | Parameters: 436 | ----------- 437 | seq1 : str 438 | The reference sequence 439 | seq2 : str 440 | The sequence being compared 441 | 442 | Returns: 443 | -------- 444 | score : float 445 | The similarity score between the reference sequence and the sequence being compared. 446 | """ 447 | # Scores based on exact identity. Should maybe be updated to take longer 448 | # of sequences so that it can be used with unaligned seq strings too 449 | score = 0 450 | for i in range(len(seq1)): 451 | if (seq1[i] == seq2[i]): 452 | score += 1 453 | score = score/len(seq1) 454 | return score 455 | -------------------------------------------------------------------------------- /Notebooks/Step3.1.AssignMolIDToEntitiesFoundInCIFfiles1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2d065185", 6 | "metadata": {}, 7 | "source": [ 8 | "# Assign MolID to the entities found in the CIF files (1) " 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "729c6719", 14 | "metadata": {}, 15 | "source": [ 16 | "## What is the goal of this notebook?\n", 17 | "\n", 18 | "We will run `PDBClean_MolID_CIF.py` to re-assign the MolID to the entities found in our new ensemble of CIF files. \n", 19 | "The script goes over all the CIF files and collects all entities. The user can then decide what MolID to assign them. \n", 20 | "\n", 21 | "There are also some other benefits from running this script: \n", 22 | "\n", 23 | "- You can assign the same MolID to different entities. In that case these entities will be concatenated. User needs to accept each concatenation manually. \n", 24 | "- Inspecting the list of entities will allow users to identify structures that needs to be removed from the ensemble.\n", 25 | "- Make sure that the MolIDs of the structures in the ensemble are consistent (the same chain is named always the same, even in different structures).\n", 26 | "\n", 27 | "This notebook will go over the cases described above. \n", 28 | "\n", 29 | ">**NOTE:** For this tutorial, we will not use the whole ensemble we downloaded. We will use a subsample of only 7 structures. The next cells will create the new directory. Notice that we are choosing these 7 sctructures from the ones we downloaded. We chose these ones to highlight some possible issues you may run into when running this script." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "id": "74c0c396", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from PDBClean import pdbclean_io" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "0b4b831c", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "PROJDIR=\"./TIM/\"" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "ee39d3ed", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='simple_bank_sub')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "3ce20e15", 65 | "metadata": {}, 66 | "source": [ 67 | "> Let's copy some structures from our simple_bank into the newly created 'simple_bank_sub' directory" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "id": "ee9230a2", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "!cp $PROJDIR/simple_bank/1klg+00.cif $PROJDIR/simple_bank_sub/\n", 78 | "!cp $PROJDIR/simple_bank/2y62+00.cif $PROJDIR/simple_bank_sub/\n", 79 | "!cp $PROJDIR/simple_bank/1ag1+00.cif $PROJDIR/simple_bank/1aw1+01.cif $PROJDIR/simple_bank_sub/\n", 80 | "!cp $PROJDIR/simple_bank/1aw1+02.cif $PROJDIR/simple_bank/1aw1+03.cif $PROJDIR/simple_bank_sub/\n", 81 | "!cp $PROJDIR/simple_bank/1aw1+04.cif $PROJDIR/simple_bank_sub/" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "id": "ecf54395", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank')" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "2e6c151d", 97 | "metadata": {}, 98 | "source": [ 99 | "## Running PDBClean_MolID_CIF.py \n", 100 | "\n", 101 | "Notice that the way to run this script in the terminal is as following:\n", 102 | "\n", 103 | "> PDBClean_MolID_CIF.py `{Input Directory}` `{Output Directory}`\n", 104 | "\n", 105 | "The input directory contains the structures that we generated in Step 1. The output directory is where the new structures will be stored. \n", 106 | "\n", 107 | "Running this script will print a menu to screen. In the next cell we run the script and give 2 as input, so that we can select option `2) Show only unassigned conversions`. Then we `QUIT` the program. \n", 108 | "\n", 109 | "**Note:** We recommend running the script directly on the terminal. We are running it from the notebook just for demonstration purpose." 
110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "id": "f7b40709", 116 | "metadata": { 117 | "scrolled": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 7)\n", 125 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 7)\n", 126 | "Reading: ./TIM//simple_bank_sub/1klg+00.cif (3 of 7)\n", 127 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (4 of 7)\n", 128 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (5 of 7)\n", 129 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (6 of 7)\n", 130 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (7 of 7)\n", 131 | "PDBClean MolID Conversion Build Menu\n", 132 | " Select one of the following options to proceed:\n", 133 | " 1) Show full conversion\n", 134 | " 2) Show only unassigned conversions\n", 135 | " 3) Enter input file\n", 136 | " 4) Search MolID to add chain ID conversion\n", 137 | " 5) Go entry by entry to add chain ID conversion\n", 138 | " 6) Remove a chain ID conversion\n", 139 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 140 | " \n", 141 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 142 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 143 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 144 | "2:GLYCEROL:\n", 145 | "4:WATER:\n", 146 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 147 | "1:PHOSPHATE ION:\n", 148 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN:\n", 149 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN:\n", 150 | "1:TRIOSEPHOSPHATE ISOMERASE PEPTIDE:\n", 151 | "1:ENTEROTOXIN TYPE C-3:\n", 152 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 153 | "You need to accept 12 entity conversions\n", 154 | "You need to accept 18 total chain conversions\n", 155 | "PDBClean MolID Conversion Build Menu\n", 156 | " Select one of the following options to proceed:\n", 157 | " 1) Show full conversion\n", 158 | " 2) Show only unassigned conversions\n", 159 | " 3) Enter input file\n", 160 | " 4) Search MolID to add chain ID conversion\n", 161 | " 5) Go entry by entry to add chain ID conversion\n", 162 | " 6) Remove a chain ID conversion\n", 163 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 164 | " \n", 165 | "Option Number: " 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "! echo '2\\nQUIT' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank\n" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "ad8cc03d", 176 | "metadata": {}, 177 | "source": [ 178 | "## What does the output mean?\n", 179 | "\n", 180 | "`1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 181 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 182 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 183 | "2:GLYCEROL:\n", 184 | "4:WATER:\n", 185 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 186 | "1:PHOSPHATE ION:\n", 187 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN:\n", 188 | "1:HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN:\n", 189 | "1:TRIOSEPHOSPHATE ISOMERASE PEPTIDE:\n", 190 | "1:ENTEROTOXIN TYPE C-3:\n", 191 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 192 | "You need to accept 12 entity conversions\n", 193 | "You need to accept 18 total chain conversions`\n", 194 | "\n", 195 | "\n", 196 | "The output printed to screen, and reproduced right above in this cell, tells us how many MolIDs (think of them as chains) are part of each entity. 
For example, the first line tells us that in one of the file, there is one entity `TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM` that contains one MolID. We also see that in the case of `WATER`, there are 4 MolIDs that we need to assign. \n", 197 | "\n", 198 | "The last two lines tell us how many entities were found as well as the total amount of chains." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "140a987e", 204 | "metadata": {}, 205 | "source": [ 206 | "## Inspect the entities in your ensemble. A way to detect outliers:\n", 207 | "\n", 208 | "Another advantage of reading this list, is that we can take a look at all the entities that are present in our ensemble. In our tutorial example, we used the keyword 'triosephosphate isomerase'. If you read this list, you may find some suspicious entitities, such as `HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN`. A closer inspection to the list, we can see also `TRIOSEPHOSPHATE ISOMERASE PEPTIDE`, which suggests that it only contains a fragment of the protein. \n", 209 | "\n", 210 | "Since these are suspicious entries, we can further inspect the CIF files that contain these entities. First, we need to figure out which are the CIF files. The next cell shows a way to do it:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 7, 216 | "id": "6b576fa4", 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "./TIM//simple_bank_sub/1klg+00.cif:1 'HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR ALPHA CHAIN'\n", 224 | "./TIM//simple_bank_sub/1klg+00.cif:2 'HLA CLASS II HISTOCOMPATIBILITY ANTIGEN, DR-1 BETA CHAIN'\n", 225 | "./TIM//simple_bank_sub/1klg+00.cif:3 'TRIOSEPHOSPHATE ISOMERASE PEPTIDE'\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "! grep \"HLA CLASS II HISTOCOMPATIBILITY ANTIGEN\" $PROJDIR/simple_bank_sub/*cif \n", 231 | "! grep \"TRIOSEPHOSPHATE ISOMERASE PEPTIDE\" $PROJDIR/simple_bank_sub/*cif " 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "4d1e8d83", 237 | "metadata": {}, 238 | "source": [ 239 | "These entities come from one single CIF file: 1klg+00.cif \n", 240 | "\n", 241 | "By reading the CIF file (run the cell below, removing the '#') , or using a molecular visualization tool, the user can see that this is an outlier. It was selected because there is a small fragment of the triosephosphate isomerase, but the main structure is of the HLA Class II Histocompatibility antigen. It is best to remove these structures from our ensemble. " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 8, 247 | "id": "3ceb29e8", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "# ! cat $PROJDIR/simple_bank_sub/1klg+00.cif" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 9, 257 | "id": "f0e21fee", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# Remove problematic CIF file\n", 262 | "\n", 263 | "! rm $PROJDIR/simple_bank_sub/1klg+00.cif \n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "8eb4a8da", 269 | "metadata": {}, 270 | "source": [ 271 | "## How to assign new MolID? \n", 272 | "\n", 273 | "Let's rerun `PDBClean_MolID_CIF.py` with our subsampled ensemble, now with only 6 structures. 
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "id": "e923fda1", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 6)\n", 287 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 6)\n", 288 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (3 of 6)\n", 289 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (4 of 6)\n", 290 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (5 of 6)\n", 291 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (6 of 6)\n", 292 | "PDBClean MolID Conversion Build Menu\n", 293 | " Select one of the following options to proceed:\n", 294 | " 1) Show full conversion\n", 295 | " 2) Show only unassigned conversions\n", 296 | " 3) Enter input file\n", 297 | " 4) Search MolID to add chain ID conversion\n", 298 | " 5) Go entry by entry to add chain ID conversion\n", 299 | " 6) Remove a chain ID conversion\n", 300 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 301 | " \n", 302 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 303 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 304 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 305 | "2:GLYCEROL:\n", 306 | "2:WATER:\n", 307 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 308 | "1:PHOSPHATE ION:\n", 309 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 310 | "You need to accept 8 entity conversions\n", 311 | "You need to accept 12 total chain conversions\n", 312 | "PDBClean MolID Conversion Build Menu\n", 313 | " Select one of the following options to proceed:\n", 314 | " 1) Show full conversion\n", 315 | " 2) Show only unassigned conversions\n", 316 | " 3) Enter input file\n", 317 | " 4) Search MolID to add chain ID conversion\n", 318 | " 5) Go entry by entry to add chain ID conversion\n", 319 | " 6) Remove a chain ID conversion\n", 320 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 321 | " \n", 322 | "Option Number: " 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "! echo '2\\nQUIT' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "id": "9191e8cc", 333 | "metadata": {}, 334 | "source": [ 335 | "### Renaming MolID, how to choose a name? \n", 336 | "\n", 337 | "This is a personal decision. You can decide how name each entity. For example, the easiest way is to assign a different MolID to each entity, as shown in the table below:\n", 338 | "\n", 339 | "| New MolID | ENTITIES |\n", 340 | "|---|:---|\n", 341 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n", 342 | "| B | 1:SN-GLYCEROL-3-PHOSPHATE: |\n", 343 | "| C | 1:SN-GLYCEROL-1-PHOSPHATE: |\n", 344 | "| D,E | 2:GLYCEROL: |\n", 345 | "| F,G | 2:WATER: |\n", 346 | "| H,I | 2:TRIOSEPHOSPHATE ISOMERASE: |\n", 347 | "| J | 1:PHOSPHATE ION: |\n", 348 | "| K,L | 2:2-PHOSPHOGLYCOLIC ACID: | \n", 349 | "\n", 350 | "\n", 351 | "We need to input the new assignment manually when it is printed on screen. Notice that in the next cell, `echo` allows us to type the input in advance. 
\n", 352 | "\n", 353 | "`2) Show only unassigned conversions` -> `5) Go entry by entry to add chain ID conversion` -> `Letters we chose on the table in this cell` -> `7) Continue to next step of curation` -> `6) Finalize Curation`\n" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 11, 359 | "id": "fabe944a", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "Reading: ./TIM//simple_bank_sub/2y62+00.cif (1 of 6)\n", 367 | "Reading: ./TIM//simple_bank_sub/1ag1+00.cif (2 of 6)\n", 368 | "Reading: ./TIM//simple_bank_sub/1aw1+04.cif (3 of 6)\n", 369 | "Reading: ./TIM//simple_bank_sub/1aw1+02.cif (4 of 6)\n", 370 | "Reading: ./TIM//simple_bank_sub/1aw1+03.cif (5 of 6)\n", 371 | "Reading: ./TIM//simple_bank_sub/1aw1+01.cif (6 of 6)\n", 372 | "PDBClean MolID Conversion Build Menu\n", 373 | " Select one of the following options to proceed:\n", 374 | " 1) Show full conversion\n", 375 | " 2) Show only unassigned conversions\n", 376 | " 3) Enter input file\n", 377 | " 4) Search MolID to add chain ID conversion\n", 378 | " 5) Go entry by entry to add chain ID conversion\n", 379 | " 6) Remove a chain ID conversion\n", 380 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 381 | " \n", 382 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 383 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 384 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 385 | "2:GLYCEROL:\n", 386 | "2:WATER:\n", 387 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 388 | "1:PHOSPHATE ION:\n", 389 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 390 | "You need to accept 8 entity conversions\n", 391 | "You need to accept 12 total chain conversions\n", 392 | "PDBClean MolID Conversion Build Menu\n", 393 | " Select one of the following options to proceed:\n", 394 | " 1) Show full conversion\n", 395 | " 2) Show only unassigned conversions\n", 396 | " 3) Enter input file\n", 397 | " 4) Search MolID to add chain ID conversion\n", 398 | " 5) Go entry by entry to add chain ID conversion\n", 399 | " 6) Remove a chain ID conversion\n", 400 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 401 | " \n", 402 | "Option Number: Enter chain IDs for each of the following MolID.\n", 403 | "Comma separated, no spaces\n", 404 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n", 405 | " conversion templates. You can proceed to the next section\n", 406 | " by selection option 7 or, continue to edit your conversion\n", 407 | " template through this menu\n", 408 | " \n", 409 | "PDBClean MolID Conversion Build Menu\n", 410 | " Select one of the following options to proceed:\n", 411 | " 1) Show full conversion\n", 412 | " 2) Show only unassigned conversions\n", 413 | " 3) Enter input file\n", 414 | " 4) Search MolID to add chain ID conversion\n", 415 | " 5) Go entry by entry to add chain ID conversion\n", 416 | " 6) Remove a chain ID conversion\n", 417 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 418 | " \n", 419 | " 7) Continue to next step of curation\n", 420 | "Option Number: 6) Finalize Curation\n", 421 | "Option Number: Finalizing Curation ...\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "! 
echo '2\\n5\\nA\\nB\\nC\\nD,E\\nF,G\\nH,I\\nJ\\nK,L\\n7\\n6\\n' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub $PROJDIR/standard_MolID_bank" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "46092d65", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "be78b076", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | } 445 | ], 446 | "metadata": { 447 | "kernelspec": { 448 | "display_name": "PDBCleanV2", 449 | "language": "python", 450 | "name": "pdbcleanv2" 451 | }, 452 | "language_info": { 453 | "codemirror_mode": { 454 | "name": "ipython", 455 | "version": 3 456 | }, 457 | "file_extension": ".py", 458 | "mimetype": "text/x-python", 459 | "name": "python", 460 | "nbconvert_exporter": "python", 461 | "pygments_lexer": "ipython3", 462 | "version": "3.10.5" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 5 467 | } 468 | -------------------------------------------------------------------------------- /Notebooks/Step2.CreateOneCIFFilePerBiologicalAssembly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fcc80ba7", 6 | "metadata": {}, 7 | "source": [ 8 | "# 2. Clean Structures and Create one CIFF file per Biological Assembly" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "37b09ec8", 14 | "metadata": {}, 15 | "source": [ 16 | "## What is the goal of this notebook? \n", 17 | "\n", 18 | "This notebook achieves 2 goals:\n", 19 | "\n", 20 | "1. The first step 'cleans' the CIF files we downloaded in step 0. This step will remove some of the **data names** \n", 21 | "and **data blocks** included in the raw CIF files. A new directory 'clean_bank' is created in this step.\n", 22 | "\n", 23 | "\n", 24 | "2. The second step will create a new CIF files for each [biological assembly](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-BioUnit) present in any structure. Each new structure will be saved with the suffix +0x.cif (where x is the number of the biological assembly). This step also standardizes the **data names** and **data blocks**. 
in particular how the coordinate portion of the file is printed.\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "fb5c24aa", 30 | "metadata": {}, 31 | "source": [ 32 | "## Import library and create working directory" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "id": "90edccb0", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from PDBClean import pdbclean_io, pdbutils, cleanutils" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "101c2903", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# Path to project directory\n", 53 | "PROJDIR=\"./TIM/\"" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "id": "71d410ed", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Create directory where we will store the 'clean' CIF files.\n", 64 | "pdbclean_io.check_project(projdir=PROJDIR, level='clean_bank')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "f3503582", 70 | "metadata": {}, 71 | "source": [ 72 | "## Clean CIF files (standardize data blocks)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "id": "3dbc8c16", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "[1/244]: 7rpn.cif\n", 86 | "[2/244]: 2v2d.cif\n", 87 | "[3/244]: 4poc.cif\n", 88 | "[4/244]: 5zfx.cif\n", 89 | "[5/244]: 1o5x.cif\n", 90 | "[6/244]: 4gnj.cif\n", 91 | "[7/244]: 1ydv.cif\n", 92 | "[8/244]: 4zvj.cif\n", 93 | "[9/244]: 4ff7.cif\n", 94 | "[10/244]: 7qon.cif\n", 95 | "[11/244]: 1klu.cif\n", 96 | "[12/244]: 3qsr.cif\n", 97 | "[13/244]: 4o54.cif\n", 98 | "[14/244]: 2x1s.cif\n", 99 | "[15/244]: 3py2.cif\n", 100 | "[16/244]: 2vfh.cif\n", 101 | "[17/244]: 1hg3.cif\n", 102 | "[18/244]: 4obt.cif\n", 103 | "[19/244]: 6up5.cif\n", 104 | "[20/244]: 7sx1.cif\n", 105 | "[21/244]: 4hhp.cif\n", 106 | "[22/244]: 4o57.cif\n", 107 | "[23/244]: 1nf0.cif\n", 108 | "[24/244]: 4iot.cif\n", 109 | "[25/244]: 5tim.cif\n", 110 | "[26/244]: 1ml1.cif\n", 111 | "[27/244]: 2vfi.cif\n", 112 | "[28/244]: 2x1r.cif\n", 113 | "[29/244]: 3gvg.cif\n", 114 | "[30/244]: 1m7p.cif\n", 115 | "[31/244]: 1aw2.cif\n", 116 | "[32/244]: 7pek.cif\n", 117 | "[33/244]: 4zz9.cif\n", 118 | "[34/244]: 4o4v.cif\n", 119 | "[35/244]: 4o53.cif\n", 120 | "[36/244]: 1ney.cif\n", 121 | "[37/244]: 6upf.cif\n", 122 | "[38/244]: 4mva.cif\n", 123 | "[39/244]: 2y63.cif\n", 124 | "[40/244]: 5i3k.cif\n", 125 | "[41/244]: 4jeq.cif\n", 126 | "[42/244]: 4owg.cif\n", 127 | "[43/244]: 3qst.cif\n", 128 | "[44/244]: 5i3j.cif\n", 129 | "[45/244]: 2y62.cif\n", 130 | "[46/244]: 7tim.cif\n", 131 | "[47/244]: 4o52.cif\n", 132 | "[48/244]: 6up1.cif\n", 133 | "[49/244]: 4o4w.cif\n", 134 | "[50/244]: 4pod.cif\n", 135 | "[51/244]: 1ssd.cif\n", 136 | "[52/244]: 4br1.cif\n", 137 | "[53/244]: 7pej.cif\n", 138 | "[54/244]: 2v2c.cif\n", 139 | "[55/244]: 2x16.cif\n", 140 | "[56/244]: 2x1u.cif\n", 141 | "[57/244]: 1aw1.cif\n", 142 | "[58/244]: 3kxq.cif\n", 143 | "[59/244]: 6oog.cif\n", 144 | "[60/244]: 4o50.cif\n", 145 | "[61/244]: 5bmx.cif\n", 146 | "[62/244]: 5i3h.cif\n", 147 | "[63/244]: 3m9y.cif\n", 148 | "[64/244]: 3ta6.cif\n", 149 | "[65/244]: 1klg.cif\n", 150 | "[66/244]: 5i3i.cif\n", 151 | "[67/244]: 2y61.cif\n", 152 | "[68/244]: 7t0q.cif\n", 153 | "[69/244]: 6nee.cif\n", 154 | "[70/244]: 7rgc.cif\n", 155 | "[71/244]: 1ssg.cif\n", 156 | "[72/244]: 2x1t.cif\n", 157 | "[73/244]: 2j27.cif\n", 158 | "[74/244]: 1vga.cif\n", 159 | 
"[75/244]: 2vxn.cif\n", 160 | "[76/244]: 1mss.cif\n", 161 | "[77/244]: 5ujw.cif\n", 162 | "[78/244]: 1b9b.cif\n", 163 | "[79/244]: 3tim.cif\n", 164 | "[80/244]: 4mkn.cif\n", 165 | "[81/244]: 2i9e.cif\n", 166 | "[82/244]: 6w4u.cif\n", 167 | "[83/244]: 2v5b.cif\n", 168 | "[84/244]: 1su5.cif\n", 169 | "[85/244]: 2j24.cif\n", 170 | "[86/244]: 3pf3.cif\n", 171 | "[87/244]: 5gzp.cif\n", 172 | "[88/244]: 2ypi.cif\n", 173 | "[89/244]: 5ibx.cif\n", 174 | "[90/244]: 7az3.cif\n", 175 | "[91/244]: 1btm.cif\n", 176 | "[92/244]: 1tph.cif\n", 177 | "[93/244]: 1ci1.cif\n", 178 | "[94/244]: 3psv.cif\n", 179 | "[95/244]: 4ywi.cif\n", 180 | "[96/244]: 1trd.cif\n", 181 | "[97/244]: 3uwz.cif\n", 182 | "[98/244]: 5vwn.cif\n", 183 | "[99/244]: 1iih.cif\n", 184 | "[100/244]: 7aza.cif\n", 185 | "[101/244]: 4ohq.cif\n", 186 | "[102/244]: 6nxy.cif\n", 187 | "[103/244]: 2ian.cif\n", 188 | "[104/244]: 3s6d.cif\n", 189 | "[105/244]: 4z0s.cif\n", 190 | "[106/244]: 6nxx.cif\n", 191 | "[107/244]: 3krs.cif\n", 192 | "[108/244]: 1tre.cif\n", 193 | "[109/244]: 3psw.cif\n", 194 | "[110/244]: 4yxg.cif\n", 195 | "[111/244]: 1tim.cif\n", 196 | "[112/244]: 4nvt.cif\n", 197 | "[113/244]: 3uwy.cif\n", 198 | "[114/244]: 1if2.cif\n", 199 | "[115/244]: 5gv4.cif\n", 200 | "[116/244]: 7az4.cif\n", 201 | "[117/244]: 6nlh.cif\n", 202 | "[118/244]: 2iam.cif\n", 203 | "[119/244]: 1lzo.cif\n", 204 | "[120/244]: 8tim.cif\n", 205 | "[121/244]: 2dp3.cif\n", 206 | "[122/244]: 7rmn.cif\n", 207 | "[123/244]: 1sw7.cif\n", 208 | "[124/244]: 3uwu.cif\n", 209 | "[125/244]: 1tpc.cif\n", 210 | "[126/244]: 1iig.cif\n", 211 | "[127/244]: 1yya.cif\n", 212 | "[128/244]: 3th6.cif\n", 213 | "[129/244]: 4tim.cif\n", 214 | "[130/244]: 5brb.cif\n", 215 | "[131/244]: 4z0j.cif\n", 216 | "[132/244]: 6nxw.cif\n", 217 | "[133/244]: 7az9.cif\n", 218 | "[134/244]: 1wob.cif\n", 219 | "[135/244]: 1lyx.cif\n", 220 | "[136/244]: 4y90.cif\n", 221 | "[137/244]: 1tpb.cif\n", 222 | "[138/244]: 1tpu.cif\n", 223 | "[139/244]: 6cg9.cif\n", 224 | "[140/244]: 2vom.cif\n", 225 | "[141/244]: 6jox.cif\n", 226 | "[142/244]: 1tpw.cif\n", 227 | "[143/244]: 3uwv.cif\n", 228 | "[144/244]: 7r9b.cif\n", 229 | "[145/244]: 7rcq.cif\n", 230 | "[146/244]: 4g1k.cif\n", 231 | "[147/244]: 5eyw.cif\n", 232 | "[148/244]: 1r2s.cif\n", 233 | "[149/244]: 1r2r.cif\n", 234 | "[150/244]: 5upr.cif\n", 235 | "[151/244]: 1woa.cif\n", 236 | "[152/244]: 6bve.cif\n", 237 | "[153/244]: 1ag1.cif\n", 238 | "[154/244]: 1tri.cif\n", 239 | "[155/244]: 1tpv.cif\n", 240 | "[156/244]: 3uww.cif\n", 241 | "[157/244]: 6c2g.cif\n", 242 | "[158/244]: 4unk.cif\n", 243 | "[159/244]: 6d43.cif\n", 244 | "[160/244]: 2v5l.cif\n", 245 | "[161/244]: 1sux.cif\n", 246 | "[162/244]: 1tpe.cif\n", 247 | "[163/244]: 1tsi.cif\n", 248 | "[164/244]: 4y9a.cif\n", 249 | "[165/244]: 1n55.cif\n", 250 | "[166/244]: 7qh0.cif\n", 251 | "[167/244]: 1wyi.cif\n", 252 | "[168/244]: 7abx.cif\n", 253 | "[169/244]: 6nxq.cif\n", 254 | "[170/244]: 4y96.cif\n", 255 | "[171/244]: 7r7m.cif\n", 256 | "[172/244]: 1sw0.cif\n", 257 | "[173/244]: 1tpd.cif\n", 258 | "[174/244]: 4unl.cif\n", 259 | "[175/244]: 6tim.cif\n", 260 | "[176/244]: 2oma.cif\n", 261 | "[177/244]: 1tpf.cif\n", 262 | "[178/244]: 2jgq.cif\n", 263 | "[179/244]: 4ymz.cif\n", 264 | "[180/244]: 4y8f.cif\n", 265 | "[181/244]: 6nxs.cif\n", 266 | "[182/244]: 1r2t.cif\n", 267 | "[183/244]: 6nxr.cif\n", 268 | "[184/244]: 7skj.cif\n", 269 | "[185/244]: 4x22.cif\n", 270 | "[186/244]: 2jk2.cif\n", 271 | "[187/244]: 1sw3.cif\n", 272 | "[188/244]: 1hti.cif\n", 273 | "[189/244]: 5csr.cif\n", 274 | "[190/244]: 
4bi5.cif\n", 275 | "[191/244]: 2ven.cif\n", 276 | "[192/244]: 1spq.cif\n", 277 | "[193/244]: 5zg4.cif\n", 278 | "[194/244]: 5cg7.cif\n", 279 | "[195/244]: 5zg5.cif\n", 280 | "[196/244]: 1tti.cif\n", 281 | "[197/244]: 3ypi.cif\n", 282 | "[198/244]: 1dkw.cif\n", 283 | "[199/244]: 5css.cif\n", 284 | "[200/244]: 1m7o.cif\n", 285 | "[201/244]: 4k6a.cif\n", 286 | "[202/244]: 4bi6.cif\n", 287 | "[203/244]: 2vem.cif\n", 288 | "[204/244]: 5zga.cif\n", 289 | "[205/244]: 1sq7.cif\n", 290 | "[206/244]: 5bmw.cif\n", 291 | "[207/244]: 5i3g.cif\n", 292 | "[208/244]: 3tao.cif\n", 293 | "[209/244]: 5i3f.cif\n", 294 | "[210/244]: 6ooi.cif\n", 295 | "[211/244]: 1ttj.cif\n", 296 | "[212/244]: 2vel.cif\n", 297 | "[213/244]: 4e41.cif\n", 298 | "[214/244]: 7n8u.cif\n", 299 | "[215/244]: 4bi7.cif\n", 300 | "[216/244]: 1tmh.cif\n", 301 | "[217/244]: 1m6j.cif\n", 302 | "[218/244]: 1mo0.cif\n", 303 | "[219/244]: 2v0t.cif\n", 304 | "[220/244]: 2vfd.cif\n", 305 | "[221/244]: 6r8h.cif\n", 306 | "[222/244]: 1kv5.cif\n", 307 | "[223/244]: 6up8.cif\n", 308 | "[224/244]: 1i45.cif\n", 309 | "[225/244]: 7rde.cif\n", 310 | "[226/244]: 2vfe.cif\n", 311 | "[227/244]: 2vei.cif\n", 312 | "[228/244]: 3pvf.cif\n", 313 | "[229/244]: 2x2g.cif\n", 314 | "[230/244]: 2vek.cif\n", 315 | "[231/244]: 1tcd.cif\n", 316 | "[232/244]: 2vfg.cif\n", 317 | "[233/244]: 2v2h.cif\n", 318 | "[234/244]: 4wje.cif\n", 319 | "[235/244]: 1qds.cif\n", 320 | "[236/244]: 4p61.cif\n", 321 | "[237/244]: 1w0m.cif\n", 322 | "[238/244]: 2btm.cif\n", 323 | "[239/244]: 1ypi.cif\n", 324 | "[240/244]: 5bnk.cif\n", 325 | "[241/244]: 4rcx.cif\n", 326 | "[242/244]: 2vff.cif\n", 327 | "[243/244]: 3pwa.cif\n", 328 | "[244/244]: 2h6r.cif\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "cleanutils.process(projdir=PROJDIR, step='clean', source='raw_bank', target='clean_bank')" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "af89bbd9", 339 | "metadata": {}, 340 | "source": [ 341 | "## Simplify and Split into Biological Assemblies " 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 5, 347 | "id": "aa8acf0b", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# Create directory to store new structures\n", 352 | "pdbclean_io.check_project(projdir=PROJDIR, level='simple_bank')" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 6, 358 | "id": "d481cd0f", 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "[1/244]: 7rpn.cif\n", 366 | "[2/244]: 2v2d.cif\n", 367 | "[3/244]: 4poc.cif\n", 368 | "[4/244]: 5zfx.cif\n", 369 | "[5/244]: 1o5x.cif\n", 370 | "[6/244]: 4gnj.cif\n", 371 | "[7/244]: 1ydv.cif\n", 372 | "[8/244]: 4zvj.cif\n", 373 | "[9/244]: 4ff7.cif\n", 374 | "[10/244]: 7qon.cif\n", 375 | "[11/244]: 1klu.cif\n", 376 | "[12/244]: 3qsr.cif\n", 377 | "[13/244]: 4o54.cif\n", 378 | "[14/244]: 2x1s.cif\n", 379 | "[15/244]: 3py2.cif\n", 380 | "[16/244]: 2vfh.cif\n", 381 | "[17/244]: 1hg3.cif\n", 382 | "[18/244]: 4obt.cif\n", 383 | "[19/244]: 6up5.cif\n", 384 | "[20/244]: 7sx1.cif\n", 385 | "[21/244]: 4hhp.cif\n", 386 | "[22/244]: 4o57.cif\n", 387 | "[23/244]: 1nf0.cif\n", 388 | "[24/244]: 4iot.cif\n", 389 | "[25/244]: 5tim.cif\n", 390 | "[26/244]: 1ml1.cif\n", 391 | "[27/244]: 2vfi.cif\n", 392 | "[28/244]: 2x1r.cif\n", 393 | "[29/244]: 3gvg.cif\n", 394 | "[30/244]: 1m7p.cif\n", 395 | "[31/244]: 1aw2.cif\n", 396 | "[32/244]: 7pek.cif\n", 397 | "[33/244]: 4zz9.cif\n", 398 | "[34/244]: 4o4v.cif\n", 399 | "[35/244]: 
4o53.cif\n", 400 | "[36/244]: 1ney.cif\n", 401 | "[37/244]: 6upf.cif\n", 402 | "[38/244]: 4mva.cif\n", 403 | "[39/244]: 2y63.cif\n", 404 | "[40/244]: 5i3k.cif\n", 405 | "[41/244]: 4jeq.cif\n", 406 | "[42/244]: 4owg.cif\n", 407 | "[43/244]: 3qst.cif\n", 408 | "[44/244]: 5i3j.cif\n", 409 | "[45/244]: 2y62.cif\n", 410 | "[46/244]: 7tim.cif\n", 411 | "[47/244]: 4o52.cif\n", 412 | "[48/244]: 6up1.cif\n", 413 | "[49/244]: 4o4w.cif\n", 414 | "[50/244]: 4pod.cif\n", 415 | "[51/244]: 1ssd.cif\n", 416 | "[52/244]: 4br1.cif\n", 417 | "[53/244]: 7pej.cif\n", 418 | "[54/244]: 2v2c.cif\n", 419 | "[55/244]: 2x16.cif\n", 420 | "[56/244]: 2x1u.cif\n", 421 | "[57/244]: 1aw1.cif\n", 422 | "[58/244]: 3kxq.cif\n", 423 | "[59/244]: 6oog.cif\n", 424 | "[60/244]: 4o50.cif\n", 425 | "[61/244]: 5bmx.cif\n", 426 | "[62/244]: 5i3h.cif\n", 427 | "[63/244]: 3m9y.cif\n", 428 | "[64/244]: 3ta6.cif\n", 429 | "[65/244]: 1klg.cif\n", 430 | "[66/244]: 5i3i.cif\n", 431 | "[67/244]: 2y61.cif\n", 432 | "[68/244]: 7t0q.cif\n", 433 | "[69/244]: 6nee.cif\n", 434 | "[70/244]: 7rgc.cif\n", 435 | "[71/244]: 1ssg.cif\n", 436 | "[72/244]: 2x1t.cif\n", 437 | "[73/244]: 2j27.cif\n", 438 | "[74/244]: 1vga.cif\n", 439 | "[75/244]: 2vxn.cif\n", 440 | "[76/244]: 1mss.cif\n", 441 | "[77/244]: 5ujw.cif\n", 442 | "[78/244]: 1b9b.cif\n", 443 | "[79/244]: 3tim.cif\n", 444 | "[80/244]: 4mkn.cif\n", 445 | "[81/244]: 2i9e.cif\n", 446 | "[82/244]: 6w4u.cif\n", 447 | "[83/244]: 2v5b.cif\n", 448 | "[84/244]: 1su5.cif\n", 449 | "[85/244]: 2j24.cif\n", 450 | "[86/244]: 3pf3.cif\n", 451 | "[87/244]: 5gzp.cif\n", 452 | "[88/244]: 2ypi.cif\n", 453 | "[89/244]: 5ibx.cif\n", 454 | "[90/244]: 7az3.cif\n", 455 | "[91/244]: 1btm.cif\n", 456 | "[92/244]: 1tph.cif\n", 457 | "[93/244]: 1ci1.cif\n", 458 | "[94/244]: 3psv.cif\n", 459 | "[95/244]: 4ywi.cif\n", 460 | "[96/244]: 1trd.cif\n", 461 | "[97/244]: 3uwz.cif\n", 462 | "[98/244]: 5vwn.cif\n", 463 | "[99/244]: 1iih.cif\n", 464 | "[100/244]: 7aza.cif\n", 465 | "[101/244]: 4ohq.cif\n", 466 | "[102/244]: 6nxy.cif\n", 467 | "[103/244]: 2ian.cif\n", 468 | "[104/244]: 3s6d.cif\n", 469 | "[105/244]: 4z0s.cif\n", 470 | "[106/244]: 6nxx.cif\n", 471 | "[107/244]: 3krs.cif\n", 472 | "[108/244]: 1tre.cif\n", 473 | "[109/244]: 3psw.cif\n", 474 | "[110/244]: 4yxg.cif\n", 475 | "[111/244]: 1tim.cif\n", 476 | "[112/244]: 4nvt.cif\n", 477 | "[113/244]: 3uwy.cif\n", 478 | "[114/244]: 1if2.cif\n", 479 | "[115/244]: 5gv4.cif\n", 480 | "[116/244]: 7az4.cif\n", 481 | "[117/244]: 6nlh.cif\n", 482 | "[118/244]: 2iam.cif\n", 483 | "[119/244]: 1lzo.cif\n", 484 | "[120/244]: 8tim.cif\n", 485 | "[121/244]: 2dp3.cif\n", 486 | "[122/244]: 7rmn.cif\n", 487 | "[123/244]: 1sw7.cif\n", 488 | "[124/244]: 3uwu.cif\n", 489 | "[125/244]: 1tpc.cif\n", 490 | "[126/244]: 1iig.cif\n", 491 | "[127/244]: 1yya.cif\n", 492 | "[128/244]: 3th6.cif\n", 493 | "[129/244]: 4tim.cif\n", 494 | "[130/244]: 5brb.cif\n", 495 | "[131/244]: 4z0j.cif\n", 496 | "[132/244]: 6nxw.cif\n", 497 | "[133/244]: 7az9.cif\n", 498 | "[134/244]: 1wob.cif\n", 499 | "[135/244]: 1lyx.cif\n", 500 | "[136/244]: 4y90.cif\n", 501 | "[137/244]: 1tpb.cif\n", 502 | "[138/244]: 1tpu.cif\n", 503 | "[139/244]: 6cg9.cif\n", 504 | "[140/244]: 2vom.cif\n", 505 | "[141/244]: 6jox.cif\n", 506 | "[142/244]: 1tpw.cif\n", 507 | "[143/244]: 3uwv.cif\n", 508 | "[144/244]: 7r9b.cif\n", 509 | "[145/244]: 7rcq.cif\n", 510 | "[146/244]: 4g1k.cif\n", 511 | "[147/244]: 5eyw.cif\n", 512 | "[148/244]: 1r2s.cif\n", 513 | "[149/244]: 1r2r.cif\n", 514 | "[150/244]: 5upr.cif\n", 515 | "[151/244]: 1woa.cif\n", 516 | 
"[152/244]: 6bve.cif\n", 517 | "[153/244]: 1ag1.cif\n", 518 | "[154/244]: 1tri.cif\n", 519 | "[155/244]: 1tpv.cif\n", 520 | "[156/244]: 3uww.cif\n", 521 | "[157/244]: 6c2g.cif\n", 522 | "[158/244]: 4unk.cif\n", 523 | "[159/244]: 6d43.cif\n", 524 | "[160/244]: 2v5l.cif\n", 525 | "[161/244]: 1sux.cif\n", 526 | "[162/244]: 1tpe.cif\n", 527 | "[163/244]: 1tsi.cif\n", 528 | "[164/244]: 4y9a.cif\n", 529 | "[165/244]: 1n55.cif\n", 530 | "[166/244]: 7qh0.cif\n", 531 | "[167/244]: 1wyi.cif\n", 532 | "[168/244]: 7abx.cif\n", 533 | "[169/244]: 6nxq.cif\n", 534 | "[170/244]: 4y96.cif\n", 535 | "[171/244]: 7r7m.cif\n", 536 | "[172/244]: 1sw0.cif\n", 537 | "[173/244]: 1tpd.cif\n", 538 | "[174/244]: 4unl.cif\n", 539 | "[175/244]: 6tim.cif\n", 540 | "[176/244]: 2oma.cif\n", 541 | "[177/244]: 1tpf.cif\n", 542 | "[178/244]: 2jgq.cif\n", 543 | "[179/244]: 4ymz.cif\n", 544 | "[180/244]: 4y8f.cif\n", 545 | "[181/244]: 6nxs.cif\n", 546 | "[182/244]: 1r2t.cif\n", 547 | "[183/244]: 6nxr.cif\n", 548 | "[184/244]: 7skj.cif\n", 549 | "[185/244]: 4x22.cif\n", 550 | "[186/244]: 2jk2.cif\n", 551 | "[187/244]: 1sw3.cif\n", 552 | "[188/244]: 1hti.cif\n", 553 | "[189/244]: 5csr.cif\n", 554 | "[190/244]: 4bi5.cif\n", 555 | "[191/244]: 2ven.cif\n", 556 | "[192/244]: 1spq.cif\n", 557 | "[193/244]: 5zg4.cif\n", 558 | "[194/244]: 5cg7.cif\n", 559 | "[195/244]: 5zg5.cif\n", 560 | "[196/244]: 1tti.cif\n", 561 | "[197/244]: 3ypi.cif\n", 562 | "[198/244]: 1dkw.cif\n", 563 | "[199/244]: 5css.cif\n", 564 | "[200/244]: 1m7o.cif\n", 565 | "[201/244]: 4k6a.cif\n", 566 | "[202/244]: 4bi6.cif\n", 567 | "[203/244]: 2vem.cif\n", 568 | "[204/244]: 5zga.cif\n", 569 | "[205/244]: 1sq7.cif\n", 570 | "[206/244]: 5bmw.cif\n", 571 | "[207/244]: 5i3g.cif\n", 572 | "[208/244]: 3tao.cif\n", 573 | "[209/244]: 5i3f.cif\n", 574 | "[210/244]: 6ooi.cif\n", 575 | "[211/244]: 1ttj.cif\n", 576 | "[212/244]: 2vel.cif\n", 577 | "[213/244]: 4e41.cif\n", 578 | "[214/244]: 7n8u.cif\n", 579 | "[215/244]: 4bi7.cif\n", 580 | "[216/244]: 1tmh.cif\n", 581 | "[217/244]: 1m6j.cif\n", 582 | "[218/244]: 1mo0.cif\n", 583 | "[219/244]: 2v0t.cif\n", 584 | "[220/244]: 2vfd.cif\n", 585 | "[221/244]: 6r8h.cif\n", 586 | "[222/244]: 1kv5.cif\n", 587 | "[223/244]: 6up8.cif\n", 588 | "[224/244]: 1i45.cif\n", 589 | "[225/244]: 7rde.cif\n", 590 | "[226/244]: 2vfe.cif\n", 591 | "[227/244]: 2vei.cif\n", 592 | "[228/244]: 3pvf.cif\n", 593 | "[229/244]: 2x2g.cif\n", 594 | "[230/244]: 2vek.cif\n", 595 | "[231/244]: 1tcd.cif\n", 596 | "[232/244]: 2vfg.cif\n", 597 | "[233/244]: 2v2h.cif\n", 598 | "[234/244]: 4wje.cif\n", 599 | "[235/244]: 1qds.cif\n", 600 | "[236/244]: 4p61.cif\n", 601 | "[237/244]: 1w0m.cif\n", 602 | "[238/244]: 2btm.cif\n", 603 | "[239/244]: 1ypi.cif\n", 604 | "[240/244]: 5bnk.cif\n", 605 | "[241/244]: 4rcx.cif\n", 606 | "[242/244]: 2vff.cif\n", 607 | "[243/244]: 3pwa.cif\n", 608 | "[244/244]: 2h6r.cif\n" 609 | ] 610 | } 611 | ], 612 | "source": [ 613 | "cleanutils.process(projdir=PROJDIR, step='simplify', source='clean_bank', target='simple_bank')" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "id": "6c5b8e0d", 619 | "metadata": {}, 620 | "source": [] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "756e588d", 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "13d466a2", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | } 638 | ], 639 | "metadata": { 640 | "kernelspec": { 641 | "display_name": 
"PDBCleanV2", 642 | "language": "python", 643 | "name": "pdbcleanv2" 644 | }, 645 | "language_info": { 646 | "codemirror_mode": { 647 | "name": "ipython", 648 | "version": 3 649 | }, 650 | "file_extension": ".py", 651 | "mimetype": "text/x-python", 652 | "name": "python", 653 | "nbconvert_exporter": "python", 654 | "pygments_lexer": "ipython3", 655 | "version": "3.10.5" 656 | } 657 | }, 658 | "nbformat": 4, 659 | "nbformat_minor": 5 660 | } 661 | -------------------------------------------------------------------------------- /src/pdbcleanresiduestandardizationutils.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | from Bio.PDB.MMCIFParser import FastMMCIFParser 4 | from PDBClean.alignmentutils import * 5 | from PDBClean.listutils import * 6 | # 7 | 8 | #################### 9 | # INITIALIZE STEPS # 10 | #################### 11 | 12 | def pdb_to_structurelists(filelist): 13 | """ 14 | Iterates through a list CIF(s) and retrieves structure IDs, chain IDs, and maps chain IDs to their sequences, 15 | and maps chain IDs to their residue numbers. 16 | 17 | Parameters: 18 | ----------- 19 | filelist : list of str 20 | list of file paths for all '.cif' files in specified directory 21 | 22 | Returns: 23 | -------- 24 | Structure_Sequences : dict 25 | Contains dictionary where chain ID is mapped to their sequence for each structure. 26 | ChID_ResiNum_Vector : list of dict 27 | Each dictionary maps the chain ID to their residue numbers for a structure 28 | structid_list : list of str 29 | List of unique structure identifiers for each CIF. Format is 'input directory / CIF' 30 | chid_list : list of str 31 | A list containing all the chain IDs from CIF(s) 32 | """ 33 | 34 | # Structure_Sequences is the master list of maps from chain IDs to their sequences 35 | Structure_Sequences = {} 36 | ChID_ResiNum_Vector = [] 37 | chid_list = [] 38 | structid_list = [] 39 | 40 | N = 0 41 | for my_file in filelist: 42 | N += 1 43 | print("Reading:" + ' ' + my_file + " (" + str(N) + " of " + str(len(filelist)) + ")") 44 | struct = FastMMCIFParser(auth_residues=False,QUIET=1).get_structure(str(my_file), my_file) 45 | structid_list.append(struct.get_id()) 46 | chid_seq_map = {} 47 | chid_resinum_map = {} 48 | # Only written for structures with only one model in them 49 | for chain in struct[0]: 50 | if (chain.get_id() not in chid_resinum_map): #FAPA: HERE WE NEED TO ADD IF TO CHECK IF pdbx_PDB_ins_code != '?' 
51 | chid_resinum_map[chain.get_id()] = [] 52 | key = str(struct.get_id()) + "_" + str(chain.get_id()) 53 | resinum_list = [] 54 | seq = "" 55 | for residue in chain: 56 | #print("printing residue ids: "+ str(residue.get_id()[2])) 57 | resinum_list.append(residue.get_id()[1]) 58 | #chid_resinum_map[chain.get_id()].append(residue.get_id()[1]) 59 | # For each residue we extract both the residue number and the associated "letter" (pdbx_PDB_ins_code) 60 | chid_resinum_map[chain.get_id()].append(str(residue.get_id()[1])+str(residue.get_id()[2])) #FAPA 17 oct 2024 61 | #resinum_list.append(residue.id()[1]) 62 | #chid_resinum_map[chain.get_id()].append(residue.id()[1]) 63 | seq += ResnConvert(residue.get_resname()) 64 | Structure_Sequences[key] = seq 65 | # chid_list is a master list of all chainIDs used 66 | chid_list.append(chain.get_id()) 67 | ChID_ResiNum_Vector.append(chid_resinum_map) 68 | chid_set = set(chid_list) 69 | chid_list = sorted(list(chid_set)) 70 | #print(ChID_ResiNum_Vector) 71 | return Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list 72 | 73 | ######################################### 74 | # INTERACTIVE STANDARDIZATION FUNCTIONS # 75 | ######################################### 76 | 77 | def perform_multiple_alignment(Structure_Sequences, ChID_ResiNum_Vector, structid_list, chid_list, check): 78 | """ 79 | Interactive user interface for performing multiple alignments 80 | 81 | Parameters: 82 | ----------- 83 | Structure_Sequences : dict 84 | Contains dictionary where chain ID is mapped to their sequence for each structure. 85 | ChID_ResiNum_Vector : list of dict 86 | Each dictionary maps the chain ID to their residue numbers for a structure 87 | structid_list : list of str 88 | List of unique structure identifiers for each CIF. Format is 'input directory / CIF' 89 | chid_list : list of str 90 | A list containing all the chain IDs from CIF(s) 91 | check : str 92 | Option chosen by user which opens the submenu 93 | 94 | Returns: 95 | -------- 96 | Structure_Sequences_Aligned : dict 97 | A dictionary where each key is a combination of structure identifier and chain ID, 98 | and the value is the aligned sequence for that chain. 99 | Structure_ConversionTemplate : dict 100 | A dictionary mapping each structure identifier to a conversion template, 101 | which contains mappings of residue numbers from the original sequence to the aligned sequence. 102 | chid_list : list of str 103 | Updated list of chain IDs where some may have been removed based on the user's options 104 | check : str 105 | Updated string representing the state of the main menu, set to '1' to indicate a state change. 
106 | """ 107 | Structure_Sequences_Aligned = {} 108 | Structure_ConversionTemplate = {} 109 | Structure_Sequences_GAPS = {} 110 | input_submenu = "" 111 | while(input_submenu != "QUIT"): 112 | print(" Perform multiple alignments to identify residues", 113 | " 1) Show list of chains to be standardized", 114 | " 2) Remove chain IDs from list of chains to be standardized", 115 | " 3) Input file of chain IDs to remove from list of chains to be standardized", 116 | " 4) Perform multiple alignments", 117 | sep="\n") 118 | input_submenu = input('Option Number: ') 119 | if (input_submenu == "1"): 120 | show_list(chid_list) 121 | elif (input_submenu == "2"): 122 | chid_list = remove_user_defined_chain_from_list(chid_list) 123 | elif (input_submenu == "3"): 124 | chid_list = remove_file_defined_chain_from_list(chid_list) 125 | elif (input_submenu == "4"): 126 | print(" Choose occupancy threshold for residue renumbering", 127 | " Input an integer number between 1 and 100", 128 | sep="\n") 129 | user_gap = input('Occupancy threshold: ') 130 | user_gap = int(user_gap) 131 | 132 | for chid in chid_list: 133 | this_chainsseq_list = [] 134 | this_chainsseq_list_ids = [] #FAPA 135 | this_chainsseq_aligned_list = [] 136 | for I in range(len(structid_list)): 137 | key = str(structid_list[I]) + "_" + chid 138 | if key in Structure_Sequences: 139 | this_chainsseq_list.append(Structure_Sequences[key]) 140 | this_chainsseq_list_ids.append(structid_list[I]) # FAPA 141 | #this_chainsseq_aligned_list_map = AlignSequences_v2(this_chainsseq_list, chid,this_chainsseq_list_ids ) 142 | #this_chainsseq_aligned_list_map = AlignSequences_v3(this_chainsseq_list, chid, this_chainsseq_list_ids) #FAPA MAY2024 143 | this_chainsseq_aligned_list_map, this_chainseq_gap_percentages = AlignSequences_v4(this_chainsseq_list, chid, 144 | this_chainsseq_list_ids) # FAPA JULY2024 145 | i = 0 146 | for I in range(len(structid_list)): 147 | key = str(structid_list[I]) + "_" + chid 148 | if key in Structure_Sequences: 149 | #Structure_Sequences_Aligned[key] = this_chainsseq_aligned_list[i] 150 | Structure_Sequences_Aligned[key] = this_chainsseq_aligned_list_map[str(structid_list[I])] 151 | Structure_Sequences_GAPS[key] = this_chainseq_gap_percentages #FAPA 152 | i += 1 153 | 154 | #THIS IS THE VERSION THAT WORKS, COMMENTED SO WE TRY SOMETHING NEW 155 | #for I in range(len(structid_list)): 156 | # conversion_template = {} 157 | # for chain in ChID_ResiNum_Vector[I]: 158 | # resinum_aligned_list = [] 159 | # key = str(structid_list[I]) + "_" + str(chain) 160 | # if key in Structure_Sequences_Aligned: 161 | # seq = Structure_Sequences_Aligned[key] 162 | # i = 0 163 | # for resn in seq: 164 | # i += 1 165 | # if (resn != "-"): 166 | # resinum_aligned_list.append(i) 167 | # i = 0 168 | # for residue in ChID_ResiNum_Vector[I][chain]: 169 | # key2 = chain + "_" + str(residue) 170 | # conversion_template[key2] = resinum_aligned_list[i] 171 | # i += 1 172 | # Structure_ConversionTemplate[structid_list[I]] = conversion_template 173 | 174 | # MY TEST STARTS HERE, WITH VARIATIONS OF THE CODE ABOVE 175 | 176 | for I in range(len(structid_list)): 177 | conversion_template = {} 178 | for chain in ChID_ResiNum_Vector[I]: 179 | #print(ChID_ResiNum_Vector[I]) 180 | #residue_numbers_users = [residue.get_id()[1] for residue in chain.get_residues()] 181 | #print(residue_numbers_users) 182 | resinum_aligned_list = [] 183 | key = str(structid_list[I]) + "_" + str(chain) 184 | if key in Structure_Sequences_Aligned: 185 | seq = Structure_Sequences_Aligned[key] 186 
| gaps = Structure_Sequences_GAPS[key] 187 | i = 0 188 | counter=1 189 | new_res_num=[] 190 | freq_tracker=1 191 | gap_tracker=0 192 | gap_letter=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'] 193 | for freq in gaps: 194 | #freq_tracker = 1 195 | #gap_tracker = 0 196 | #print(freq) 197 | if freq < 100-user_gap: # accepted gap percentage based on user defined occupancy threshold 198 | new_res_num.append(counter) 199 | counter += 1 200 | freq_tracker=freq 201 | gap_tracker=0 202 | #print("hello") 203 | else: 204 | #print(gap_tracker) 205 | #new_res_num.append(str(counter)+"_"+str(gap_tracker)) 206 | new_res_num.append(str(counter-1) +" "+str(gap_letter[gap_tracker])) 207 | gap_tracker+=1 208 | freq_tracker=freq 209 | 210 | #print(new_res_num) 211 | #print(len(new_res_num)) 212 | #print(len(seq)) 213 | 214 | 215 | for resn in seq: 216 | #print(new_res_num[i]) 217 | if (resn != "-"): 218 | resinum_aligned_list.append(new_res_num[i]) 219 | i += 1 220 | 221 | #print("resinum_aligned_list") 222 | #print(len(resinum_aligned_list)) 223 | #print(resinum_aligned_list) 224 | 225 | #i = 0 226 | 227 | #print("THIS IS THE CHAIN NUMBER TEST") 228 | #print(ChID_ResiNum_Vector[I][chain]) #FAPA 229 | 230 | for residue in range(len(resinum_aligned_list)): 231 | #i +=1 232 | #print("this is the value of i:", i) 233 | #print("this is resimnum_aligned_list[i]:", resinum_aligned_list[residue] ) 234 | #print("this is the value of residue:", residue) 235 | #print("this is what is in the structure:",ChID_ResiNum_Vector[I][chain][residue]) 236 | 237 | key2 = chain + "_" + str(ChID_ResiNum_Vector[I][chain][residue]) 238 | #print("key 2 is:",key2) 239 | #print(key2) #FAPA 240 | #print(resinum_aligned_list[i]) #FAPA 241 | conversion_template[key2] = resinum_aligned_list[residue] 242 | i += 1 243 | #print(conversion_template) 244 | 245 | 246 | 247 | #for residue in ChID_ResiNum_Vector[I][chain]: 248 | # #i +=1 249 | # print("this is the value of i:", i) 250 | # print("this is resimnum_aligned_list[i]:", resinum_aligned_list[i] ) 251 | # print("this is the value of residue:", residue) 252 | # if i == len(resinum_aligned_list)-1: 253 | # break 254 | # elif resinum_aligned_list[i] in ChID_ResiNum_Vector[I][chain]: 255 | # key2 = chain + "_" + str(residue) 256 | # #print(key2) #FAPA 257 | # #print(resinum_aligned_list[i]) #FAPA 258 | # conversion_template[key2] = resinum_aligned_list[i] 259 | # i += 1 260 | #print(conversion_template) 261 | 262 | Structure_ConversionTemplate[structid_list[I]] = conversion_template 263 | 264 | 265 | 266 | 267 | check = "1" 268 | input_submenu = "QUIT" 269 | return Structure_Sequences_Aligned, Structure_ConversionTemplate, chid_list, check 270 | 271 | def show_conversiontemplate(Structure_ConversionTemplate): 272 | """ 273 | Prints the conversion template to screen 274 | 275 | Paramters: 276 | ---------- 277 | Structure_ConversionTemplate : dict 278 | A dictionary mapping each structure identifier to a conversion template, 279 | which contains mappings of residue numbers from the original sequence to the aligned sequence. 
280 | 281 | Returns: 282 | -------- 283 | None 284 | """ 285 | 286 | for structid in Structure_ConversionTemplate: 287 | print(structid) 288 | for key in Structure_ConversionTemplate[structid]: 289 | print(key + ":" + str(Structure_ConversionTemplate[structid][key])) 290 | 291 | ## FAPA 292 | def write_and_show_conversiontemplate(Structure_ConversionTemplate, target_dir, write_csv=True): 293 | """ 294 | Writes and displays a mapping of old residue IDs to new residue IDs for each structure. 295 | 296 | This function prints the mapping to the console and optionally writes it to a CSV file 297 | in the specified target directory. 298 | 299 | Parameters: 300 | ----------- 301 | Structure_ConversionTemplate : dict 302 | A dictionary mapping each structure identifier to its old-to-new residue ID conversion template 303 | target_dir : str 304 | Directory where the new files will be saved 305 | write_csv : bool, optional 306 | Writes the mapping to a CSV file named 'OldResID_NewResID_Map.csv' if True. Default is True. 307 | 308 | Returns: 309 | -------- 310 | None 311 | """ 312 | 313 | if write_csv: 314 | with open(f'{target_dir}/OldResID_NewResID_Map.csv', 'w') as fout: 315 | fout.write('OldResID:NewResID:File\n') 316 | 317 | 318 | for structid in Structure_ConversionTemplate: 319 | #print(structid) 320 | for key in Structure_ConversionTemplate[structid]: 321 | #print(key + ":" + str(Structure_ConversionTemplate[structid][key])) 322 | #structid_for_print=[x.split("/")[-1] for x in structid] 323 | structid_for_print = structid.split("/")[-1] 324 | if write_csv: 325 | with open(f'{target_dir}/OldResID_NewResID_Map.csv', 'a') as fout: 326 | fout.write(f'{key}:{str(Structure_ConversionTemplate[structid][key])}:{structid_for_print}\n') 327 | 328 | # FAPA 329 | 330 | ################# 331 | # FINALIZE STEP # 332 | ################# 333 | 334 | def conversiontemplate_to_pdb(filelist, Structure_ConversionTemplate, target_dir=None): 335 | """ 336 | Saves the conversion template into re-written CIF(s) which are placed into the target directory 337 | 338 | Parameters: 339 | ----------- 340 | filelist : list of str 341 | list of file paths for all '.cif' files in specified directory 342 | Structure_ConversionTemplate : dict 343 | A dictionary mapping each structure identifier to a conversion template, 344 | which contains mappings of residue numbers from the original sequence to the aligned sequence. 345 | target_dir : str, optional 346 | Directory where the new files will be saved. If None, no files will be saved. 
347 | 348 | Returns: 349 | -------- 350 | None 351 | """ 352 | for my_files in filelist: 353 | newciffilename=target_dir+'/'+my_files.split('/')[-1] 354 | with open(my_files) as myfile: 355 | with open(newciffilename, 'w') as newciffile: 356 | # Now figure out which file is which template 357 | conversion_template = Structure_ConversionTemplate[myfile.name] 358 | for line in myfile: 359 | if (line[0:4] == "ATOM") or (line[0:6] == "HETATM"): 360 | # Chains outside map should not exist but just in case 361 | # (split() below already ignores surrounding whitespace, so no separate strip() is needed) 362 | line_split = line.split() 363 | key = line_split[17] + "_" + str(line_split[15]) 364 | if key in conversion_template: 365 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + line_split[6] + " " + line_split[7] + " " + line_split[8] + " " + line_split[9] + " " + line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + line_split[13] + " " + line_split[14] + " " + str(conversion_template[key]) + " " + line_split[16] + " " + line_split[17] + " " + line_split[18] + " " + line_split[19] + "\n" 366 | newciffile.write(newline) 367 | else: 368 | newciffile.write(line) 369 | else: 370 | newciffile.write(line) 371 | 372 | # FAPA MAY 2024 TEST STARTS 373 | 374 | def conversiontemplate_to_pdb_FAPA(filelist, Structure_ConversionTemplate, target_dir=None): 375 | """ 376 | Saves the conversion template into re-written CIF(s) which are placed into the target directory. 377 | This function considers cases where a residue number also includes a letter. 378 | 379 | Parameters: 380 | ----------- 381 | filelist : list of str 382 | list of file paths for all '.cif' files in specified directory 383 | Structure_ConversionTemplate : dict 384 | A dictionary mapping each structure identifier to a conversion template, 385 | which contains mappings of residue numbers from the original sequence to the aligned sequence. 386 | target_dir : str, optional 387 | Directory where the new files will be saved. If None, no files will be saved. 388 | 389 | Returns: 390 | -------- 391 | None 392 | """ 393 | for my_files in filelist: 394 | newciffilename=target_dir+'/'+my_files.split('/')[-1] 395 | with open(my_files) as myfile: 396 | with open(newciffilename, 'w') as newciffile: 397 | # Now figure out which file is which template 398 | conversion_template = Structure_ConversionTemplate[myfile.name] 399 | #print(conversion_template) 400 | 401 | 402 | for line in myfile: 403 | resnum=1 404 | #old_line_resnum=0 405 | if (line[0:4] == "ATOM") or (line[0:6] == "HETATM"): 406 | # Chains outside map should not exist but just in case 407 | #line_split = line.strip() 408 | line_split = line.split() 409 | #print(line_split[8]) 410 | #new_line_resnum=int(line_split[8]) 411 | 412 | #if new_line_resnum == start_line_resnum: 413 | 414 | # We need to consider the value of pdbx_PDB_ins_code, in column 9 415 | # This is considered in the key 416 | # and original value will be overwritten with '?' 
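# Hedged sketch (illustrative, not part of the original PDBClean code): the branches
# below unpack conversion_template[key] by hand. The same logic could be written once as:
#
#     parts = str(conversion_template[key]).split()
#     new_resnum = parts[0]
#     new_icode = parts[1] if len(parts) > 1 else '?'
#
# where new_resnum would replace the residue-number fields (positions 8 and 15 of the
# split line) and new_icode would fill position 9, the pdbx_PDB_ins_code column.
# The names parts/new_resnum/new_icode are illustrative assumptions, not the file's API.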
417 | # in next version, we will add the letter value to column 9 418 | 419 | if str(line_split[9]) == '?': 420 | key = line_split[6] + "_" + str(line_split[8]) + " " # FAPA: WE WANT CHAINID_RESID TO BE THE KEY 421 | else: 422 | key = line_split[6] + "_" + str(line_split[8]) + str(line_split[9]) 423 | 424 | 425 | #key = line_split[6] + "_" + str(line_split[8]) #FAPA: WE WANT CHAINID_RESID TO BE THE KEY 426 | #key = line_split[6] + "_" + str(resnum) # FAPA: WE WANT CHAINID_RESID TO BE THE KEY 427 | 428 | #print(len(line_split)) 429 | 430 | 431 | 432 | if key in conversion_template: 433 | #print(key, conversion_template[key]) 434 | if len(line_split) == 18: 435 | if len(str(conversion_template[key]).split()) < 2: 436 | #print(conversion_template[key]) 437 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + line_split[6] + " " + \ 438 | line_split[7] + " " + str(conversion_template[key]) + " " + "?" + " " + line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + line_split[13] + \ 439 | " " + line_split[14] + " " + str(conversion_template[key]) + " " + line_split[16] + " " + line_split[17] + " " + "\n" 440 | newciffile.write(newline) 441 | else: 442 | # the key already contains two columns (this is a test) 443 | #print("i am here: "+ str(conversion_template[key].split()[1])) 444 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \ 445 | line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \ 446 | line_split[6] + " " + \ 447 | line_split[7] + " " + str(conversion_template[key].split()[0]) + " " + \ 448 | str(conversion_template[key].split()[1])+ " " + \ 449 | line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \ 450 | line_split[13] + \ 451 | " " + line_split[14] + " " + str(conversion_template[key].split()[0]) + " " + \ 452 | line_split[16] + " " + line_split[17] + " " + "\n" 453 | newciffile.write(newline) 454 | else: 455 | if len(str(conversion_template[key]).split()) < 2: 456 | #print(conversion_template[key]) 457 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + line_split[6] + " " + \ 458 | line_split[7] + " " + str(conversion_template[key]) + " " + "?" 
+ " " + line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + line_split[13] + \ 459 | " " + line_split[14] + " " + str(conversion_template[key]) + " " + line_split[16] + " " + line_split[17] + " " + line_split[18] + " " + line_split[19] + "\n" 460 | newciffile.write(newline) 461 | else: 462 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \ 463 | line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \ 464 | line_split[6] + " " + \ 465 | line_split[7] + " " + str(conversion_template[key].split()[0]) + " " + \ 466 | str(conversion_template[key].split()[1]) + " " + \ 467 | line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \ 468 | line_split[13] + \ 469 | " " + line_split[14] + " " + str(conversion_template[key].split()[0]) + " " + \ 470 | line_split[16] + " " + line_split[17] + " " + line_split[18] + " " + line_split[19] + "\n" 471 | newciffile.write(newline) 472 | else: 473 | if len(line_split) == 18: 474 | newciffile.write(line) 475 | else: 476 | if line_split[8] == ".": 477 | newline = line_split[0] + " " + line_split[1] + " " + line_split[2] + " " + \ 478 | line_split[3] + " " + line_split[4] + " " + line_split[5] + " " + \ 479 | line_split[6] + " " + \ 480 | line_split[7] + " " + line_split[15] + " " + \ 481 | line_split[9] + " " + \ 482 | line_split[10] + " " + line_split[11] + " " + line_split[12] + " " + \ 483 | line_split[13] + \ 484 | " " + line_split[14] + " " + line_split[15] + " " + \ 485 | line_split[16] + " " + line_split[17] + " " + line_split[18] + " "+ line_split[19] +"\n" 486 | newciffile.write(newline) 487 | else: 488 | newciffile.write(line) 489 | else: 490 | newciffile.write(line) 491 | 492 | # FAPA MAY 2024 TEST ENDS -------------------------------------------------------------------------------- /Notebooks/Step3.2.AssignMolIDToEntitiesFoundInCIFfiles2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1a9e8a53", 6 | "metadata": {}, 7 | "source": [ 8 | "# Assign MolID to the entities found in the CIF files (2) " 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "bcec58a5", 14 | "metadata": {}, 15 | "source": [ 16 | "## What is the goal of this notebook?\n", 17 | "\n", 18 | "This is a continuation from `Assign MolID to the entities found in the CIF files (1)`.\n", 19 | "In this notebook we will show what happens when you assign the same name to different chains, because you want to concatenate them. For example, if you want to make all the waters or ions be in the same chain. \n", 20 | "\n", 21 | "**Note:** Make sure to run part 1 of this step in advance." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "3771f378", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "## First, import the library and set up directories" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "05789e90", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from PDBClean import pdbclean_io" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "ee726354", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "PROJDIR=\"./TIM/\"" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "id": "2fa40724", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='simple_bank_sub2')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "id": "968b5aae", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# Let's copy the same structures we selected in step 2.1 \n", 72 | "\n", 73 | "! cp $PROJDIR/simple_bank_sub/*cif $PROJDIR/simple_bank_sub2/" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 6, 79 | "id": "afbff1c4", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "pdbclean_io.check_project(projdir=PROJDIR, action='create', level='standard_MolID_bank2')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "abfe9d89", 89 | "metadata": {}, 90 | "source": [ 91 | "### Running PDBClean_MolID_CIF.py\n", 92 | "\n", 93 | "Remember that the way to run this script in the terminal is as follows:\n", 94 | "\n", 95 | "> PDBClean_MolID_CIF.py `{Input Directory}` `{Output Directory}`\n", 96 | "\n", 97 | "The input directory contains the structures that we generated in Step 1. The output directory is where the new structures will be stored. " 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "db035e81", 103 | "metadata": {}, 104 | "source": [ 105 | "### Renaming MolID, how to choose a name? \n", 106 | "\n", 107 | "This is a personal decision. You can decide how to name each entity. 
In part 2.1 we assigned a different MolID to each entity, as shown in the table below:\n", 108 | "\n", 109 | "| New MolID | ENTITIES |\n", 110 | "|---|:---|\n", 111 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n", 112 | "| B | 1:SN-GLYCEROL-3-PHOSPHATE: |\n", 113 | "| C | 1:SN-GLYCEROL-1-PHOSPHATE: |\n", 114 | "| D,E | 2:GLYCEROL: |\n", 115 | "| F,G | 2:WATER: |\n", 116 | "| H,I | 2:TRIOSEPHOSPHATE ISOMERASE: |\n", 117 | "| J | 1:PHOSPHATE ION: |\n", 118 | "| K,L | 2:2-PHOSPHOGLYCOLIC ACID: | \n", 119 | "\n", 120 | "\n", 121 | "For this example, let's try assigning the same MolID to different entities: \n", 122 | "\n", 123 | "| New MolID | ENTITIES |\n", 124 | "|---|:---|\n", 125 | "| A | 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM: |\n", 126 | "| D | 1:SN-GLYCEROL-3-PHOSPHATE: |\n", 127 | "| D | 1:SN-GLYCEROL-1-PHOSPHATE: |\n", 128 | "| D,D | 2:GLYCEROL: |\n", 129 | "| C,C | 2:WATER: |\n", 130 | "| A,B | 2:TRIOSEPHOSPHATE ISOMERASE: |\n", 131 | "| D | 1:PHOSPHATE ION: |\n", 132 | "| D,D | 2:2-PHOSPHOGLYCOLIC ACID: | \n", 133 | "\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "id": "6dcca91c", 140 | "metadata": { 141 | "scrolled": false 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Reading: ./TIM//simple_bank_sub2/2y62+00.cif (1 of 6)\n", 149 | "Reading: ./TIM//simple_bank_sub2/1ag1+00.cif (2 of 6)\n", 150 | "Reading: ./TIM//simple_bank_sub2/1aw1+04.cif (3 of 6)\n", 151 | "Reading: ./TIM//simple_bank_sub2/1aw1+02.cif (4 of 6)\n", 152 | "Reading: ./TIM//simple_bank_sub2/1aw1+03.cif (5 of 6)\n", 153 | "Reading: ./TIM//simple_bank_sub2/1aw1+01.cif (6 of 6)\n", 154 | "PDBClean MolID Conversion Build Menu\n", 155 | " Select one of the following options to proceed:\n", 156 | " 1) Show full conversion\n", 157 | " 2) Show only unassigned conversions\n", 158 | " 3) Enter input file\n", 159 | " 4) Search MolID to add chain ID conversion\n", 160 | " 5) Go entry by entry to add chain ID conversion\n", 161 | " 6) Remove a chain ID conversion\n", 162 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 163 | " \n", 164 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 165 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 166 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 167 | "2:GLYCEROL:\n", 168 | "2:WATER:\n", 169 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 170 | "1:PHOSPHATE ION:\n", 171 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 172 | "You need to accept 8 entity conversions\n", 173 | "You need to accept 12 total chain conversions\n", 174 | "PDBClean MolID Conversion Build Menu\n", 175 | " Select one of the following options to proceed:\n", 176 | " 1) Show full conversion\n", 177 | " 2) Show only unassigned conversions\n", 178 | " 3) Enter input file\n", 179 | " 4) Search MolID to add chain ID conversion\n", 180 | " 5) Go entry by entry to add chain ID conversion\n", 181 | " 6) Remove a chain ID conversion\n", 182 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 183 | " \n", 184 | "Option Number: Enter chain IDs for each of the following MolID.\n", 185 | "Comma separated, no spaces\n", 186 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n", 187 | " conversion templates. 
You can proceed to the next section\n", 188 | " by selection option 7 or, continue to edit your conversion\n", 189 | " template through this menu\n", 190 | " \n", 191 | "PDBClean MolID Conversion Build Menu\n", 192 | " Select one of the following options to proceed:\n", 193 | " 1) Show full conversion\n", 194 | " 2) Show only unassigned conversions\n", 195 | " 3) Enter input file\n", 196 | " 4) Search MolID to add chain ID conversion\n", 197 | " 5) Go entry by entry to add chain ID conversion\n", 198 | " 6) Remove a chain ID conversion\n", 199 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 200 | " \n", 201 | " 7) Continue to next step of curation\n", 202 | "Option Number: PDBClean Concatenations Menu\n", 203 | " -------------------------------\n", 204 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n", 205 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n", 206 | " or ACCEPT concatenations.\n", 207 | "\n", 208 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n", 209 | " be completed.\n", 210 | "\n", 211 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n", 212 | " been accepted. It will also give you the proper format of the input for option 3.\n", 213 | "\n", 214 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n", 215 | " will be printed to screen as they are being accepted. \n", 216 | "\n", 217 | " Select one of the following options to proceed:\n", 218 | " 1) Show all conversions\n", 219 | " 2) Show only unaccepted concatenations\n", 220 | " 3) Search and modify destination chainIDs of proposed concatenations\n", 221 | " 4) Accept proposed concatenation one by one\n", 222 | " (Repeat this step until finalizing option appears)\n", 223 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n", 224 | " \n", 225 | "Option Number: ./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n", 226 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n", 227 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n", 228 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n", 229 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n", 230 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n", 231 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n", 232 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n", 233 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n", 234 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n", 235 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n", 236 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n", 237 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n", 238 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n", 239 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n", 240 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n", 241 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n", 242 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n", 243 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n", 244 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n", 245 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n", 246 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n", 247 | "You need to 
accept 22 concatenations\n", 248 | "PDBClean Concatenations Menu\n", 249 | " -------------------------------\n", 250 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n", 251 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n", 252 | " or ACCEPT concatenations.\n", 253 | "\n", 254 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n", 255 | " be completed.\n", 256 | "\n", 257 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n", 258 | " been accepted. It will also give you the proper format of the input for option 3.\n", 259 | "\n", 260 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n", 261 | " will be printed to screen as they are being accepted. \n", 262 | "\n", 263 | " Select one of the following options to proceed:\n", 264 | " 1) Show all conversions\n", 265 | " 2) Show only unaccepted concatenations\n", 266 | " 3) Search and modify destination chainIDs of proposed concatenations\n", 267 | " 4) Accept proposed concatenation one by one\n", 268 | " (Repeat this step until finalizing option appears)\n", 269 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n", 270 | " \n", 271 | "Option Number: " 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "! echo '2\\n5\\nA\\nD\\nD\\nD,D\\nC,C\\nA,B\\nD\\nD,D\\n7\\n2\\nQUIT\\n' | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub2 $PROJDIR/standard_MolID_bank2" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "id": "dd0f37da", 282 | "metadata": {}, 283 | "source": [ 284 | "## A pause to explain what is going on:\n", 285 | "\n", 286 | "Notice that a new menu appears when we assign the same MolID to more than one entity. We need to either give a new MolID to the entities, or accept a concatenation. We want to guarantee that you did not assign the same MolID by mistake, so you need to approve each case one by one. \n", 287 | "\n", 288 | "In the cell above, we chose option `2) Show only unaccepted concatenations`. 
Let's take a look at the output:\n", 289 | "\n", 290 | "`\n", 291 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n", 292 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n", 293 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n", 294 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n", 295 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n", 296 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n", 297 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n", 298 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n", 299 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n", 300 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n", 301 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n", 302 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n", 303 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n", 304 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n", 305 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n", 306 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n", 307 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n", 308 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n", 309 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n", 310 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n", 311 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n", 312 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n", 313 | "You need to accept 22 concatenations\n", 314 | "`\n", 315 | "\n", 316 | "Notice that the format is: \n", 317 | "\n", 318 | "`file name` : `entity` : `original chain ID` : `new MolID we just assigned` : `order of entity with same MolID in CIF file`\n", 319 | "\n", 320 | "To continue running the script, you will need to accept each of these concatenations. For this notebook we show how to accept all of the proposed concatenations one by one. We recommend doing this step on the terminal, approving each concatenation one by one. \n", 321 | "\n", 322 | "Choosing menu option `4) Accept proposed concatenation one by one` will print one of the concatenations that still needs to be approved. A new menu will appear, and we need to choose option `2) Accept planned concatenation`. This will bring us back to the concatenation menu. We need to repeat this step (choose option 4, and then 2) until the finalize option appears. \n", 323 | "\n", 324 | "Even though it is beneficial to check each merge one by one, it can be very tedious (in the case of ions, users would have to approve hundreds of merges), so we also provide the option to accept all the merges automatically `5) Accept ALL`. 
PDBCleanV2 will still print all merges to the screen, so users can verify that everything is fine.\n", 325 | "\n", 326 | "Once all concatenations have been accepted, an option to finalize the curation will appear.\n" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 8, 332 | "id": "7821c381", 333 | "metadata": { 334 | "scrolled": false 335 | }, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "Reading: ./TIM//simple_bank_sub2/2y62+00.cif (1 of 6)\n", 342 | "Reading: ./TIM//simple_bank_sub2/1ag1+00.cif (2 of 6)\n", 343 | "Reading: ./TIM//simple_bank_sub2/1aw1+04.cif (3 of 6)\n", 344 | "Reading: ./TIM//simple_bank_sub2/1aw1+02.cif (4 of 6)\n", 345 | "Reading: ./TIM//simple_bank_sub2/1aw1+03.cif (5 of 6)\n", 346 | "Reading: ./TIM//simple_bank_sub2/1aw1+01.cif (6 of 6)\n", 347 | "PDBClean MolID Conversion Build Menu\n", 348 | " Select one of the following options to proceed:\n", 349 | " 1) Show full conversion\n", 350 | " 2) Show only unassigned conversions\n", 351 | " 3) Enter input file\n", 352 | " 4) Search MolID to add chain ID conversion\n", 353 | " 5) Go entry by entry to add chain ID conversion\n", 354 | " 6) Remove a chain ID conversion\n", 355 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 356 | " \n", 357 | "Option Number: 1:TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:\n", 358 | "1:SN-GLYCEROL-3-PHOSPHATE:\n", 359 | "1:SN-GLYCEROL-1-PHOSPHATE:\n", 360 | "2:GLYCEROL:\n", 361 | "2:WATER:\n", 362 | "2:TRIOSEPHOSPHATE ISOMERASE:\n", 363 | "1:PHOSPHATE ION:\n", 364 | "2:2-PHOSPHOGLYCOLIC ACID:\n", 365 | "You need to accept 8 entity conversions\n", 366 | "You need to accept 12 total chain conversions\n", 367 | "PDBClean MolID Conversion Build Menu\n", 368 | " Select one of the following options to proceed:\n", 369 | " 1) Show full conversion\n", 370 | " 2) Show only unassigned conversions\n", 371 | " 3) Enter input file\n", 372 | " 4) Search MolID to add chain ID conversion\n", 373 | " 5) Go entry by entry to add chain ID conversion\n", 374 | " 6) Remove a chain ID conversion\n", 375 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 376 | " \n", 377 | "Option Number: Enter chain IDs for each of the following MolID.\n", 378 | "Comma separated, no spaces\n", 379 | "TRIOSEPHOSPHATE ISOMERASE SYNONYM TRIOSE-PHOSPHATE ISOMERASE, TIM:SN-GLYCEROL-3-PHOSPHATE:SN-GLYCEROL-1-PHOSPHATE:GLYCEROL:WATER:TRIOSEPHOSPHATE ISOMERASE:PHOSPHATE ION:2-PHOSPHOGLYCOLIC ACID:Congratulations! You have successfully constructed your\n", 380 | " conversion templates. 
You can proceed to the next section\n", 381 | " by selection option 7 or, continue to edit your conversion\n", 382 | " template through this menu\n", 383 | " \n", 384 | "PDBClean MolID Conversion Build Menu\n", 385 | " Select one of the following options to proceed:\n", 386 | " 1) Show full conversion\n", 387 | " 2) Show only unassigned conversions\n", 388 | " 3) Enter input file\n", 389 | " 4) Search MolID to add chain ID conversion\n", 390 | " 5) Go entry by entry to add chain ID conversion\n", 391 | " 6) Remove a chain ID conversion\n", 392 | " A) Track changes (original_chain_name:new_chain:entity:file_name)\n", 393 | " \n", 394 | " 7) Continue to next step of curation\n", 395 | "Option Number: PDBClean Concatenations Menu\n", 396 | " -------------------------------\n", 397 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n", 398 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n", 399 | " or ACCEPT concatenations.\n", 400 | "\n", 401 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n", 402 | " be completed.\n", 403 | "\n", 404 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n", 405 | " been accepted. It will also give you the proper format of the input for option 3.\n", 406 | "\n", 407 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n", 408 | " will be printed to screen as they are being accepted. \n", 409 | "\n", 410 | " Select one of the following options to proceed:\n", 411 | " 1) Show all conversions\n", 412 | " 2) Show only unaccepted concatenations\n", 413 | " 3) Search and modify destination chainIDs of proposed concatenations\n", 414 | " 4) Accept proposed concatenation one by one\n", 415 | " (Repeat this step until finalizing option appears)\n", 416 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n", 417 | " \n", 418 | "Option Number: ./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-3-PHOSPHATE:B:D:1\n", 419 | "Select one of the following options to proceed:\n", 420 | " 1) Perform new search\n", 421 | " 2) Accept planned concatenation\n", 422 | " \n", 423 | "./TIM//simple_bank_sub2/2y62+00.cif:SN-GLYCEROL-1-PHOSPHATE:C:D:2\n", 424 | "Select one of the following options to proceed:\n", 425 | " 1) Perform new search\n", 426 | " 2) Accept planned concatenation\n", 427 | " \n", 428 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:D:D:3\n", 429 | "Select one of the following options to proceed:\n", 430 | " 1) Perform new search\n", 431 | " 2) Accept planned concatenation\n", 432 | " \n", 433 | "./TIM//simple_bank_sub2/2y62+00.cif:GLYCEROL:E:D:4\n", 434 | "Select one of the following options to proceed:\n", 435 | " 1) Perform new search\n", 436 | " 2) Accept planned concatenation\n", 437 | " \n", 438 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:D:C:1\n", 439 | "Select one of the following options to proceed:\n", 440 | " 1) Perform new search\n", 441 | " 2) Accept planned concatenation\n", 442 | " \n", 443 | "./TIM//simple_bank_sub2/1ag1+00.cif:WATER:E:C:2\n", 444 | "Select one of the following options to proceed:\n", 445 | " 1) Perform new search\n", 446 | " 2) Accept planned concatenation\n", 447 | " \n", 448 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:O:D:1\n", 449 | "Select one of the following options to proceed:\n", 450 | " 1) Perform new search\n", 451 | " 2) Accept planned 
concatenation\n", 452 | " \n", 453 | "./TIM//simple_bank_sub2/1aw1+04.cif:2-PHOSPHOGLYCOLIC ACID:P:D:2\n", 454 | "Select one of the following options to proceed:\n", 455 | " 1) Perform new search\n", 456 | " 2) Accept planned concatenation\n", 457 | " \n", 458 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:W:C:1\n", 459 | "Select one of the following options to proceed:\n", 460 | " 1) Perform new search\n", 461 | " 2) Accept planned concatenation\n", 462 | " \n", 463 | "./TIM//simple_bank_sub2/1aw1+04.cif:WATER:X:C:2\n", 464 | "Select one of the following options to proceed:\n", 465 | " 1) Perform new search\n", 466 | " 2) Accept planned concatenation\n", 467 | " \n", 468 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:K:D:1\n", 469 | "Select one of the following options to proceed:\n", 470 | " 1) Perform new search\n", 471 | " 2) Accept planned concatenation\n", 472 | " \n", 473 | "./TIM//simple_bank_sub2/1aw1+02.cif:2-PHOSPHOGLYCOLIC ACID:L:D:2\n", 474 | "Select one of the following options to proceed:\n", 475 | " 1) Perform new search\n", 476 | " 2) Accept planned concatenation\n", 477 | " \n", 478 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:S:C:1\n", 479 | "Select one of the following options to proceed:\n", 480 | " 1) Perform new search\n", 481 | " 2) Accept planned concatenation\n", 482 | " \n", 483 | "./TIM//simple_bank_sub2/1aw1+02.cif:WATER:T:C:2\n", 484 | "Select one of the following options to proceed:\n", 485 | " 1) Perform new search\n", 486 | " 2) Accept planned concatenation\n", 487 | " \n", 488 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:M:D:1\n", 489 | "Select one of the following options to proceed:\n", 490 | " 1) Perform new search\n", 491 | " 2) Accept planned concatenation\n", 492 | " \n", 493 | "./TIM//simple_bank_sub2/1aw1+03.cif:2-PHOSPHOGLYCOLIC ACID:N:D:2\n", 494 | "Select one of the following options to proceed:\n", 495 | " 1) Perform new search\n", 496 | " 2) Accept planned concatenation\n", 497 | " \n", 498 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:U:C:1\n", 499 | "Select one of the following options to proceed:\n", 500 | " 1) Perform new search\n", 501 | " 2) Accept planned concatenation\n", 502 | " \n", 503 | "./TIM//simple_bank_sub2/1aw1+03.cif:WATER:V:C:2\n", 504 | "Select one of the following options to proceed:\n", 505 | " 1) Perform new search\n", 506 | " 2) Accept planned concatenation\n", 507 | " \n", 508 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:I:D:1\n", 509 | "Select one of the following options to proceed:\n", 510 | " 1) Perform new search\n", 511 | " 2) Accept planned concatenation\n", 512 | " \n", 513 | "./TIM//simple_bank_sub2/1aw1+01.cif:2-PHOSPHOGLYCOLIC ACID:J:D:2\n", 514 | "Select one of the following options to proceed:\n", 515 | " 1) Perform new search\n", 516 | " 2) Accept planned concatenation\n", 517 | " \n", 518 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:Q:C:1\n", 519 | "Select one of the following options to proceed:\n", 520 | " 1) Perform new search\n", 521 | " 2) Accept planned concatenation\n", 522 | " \n", 523 | "./TIM//simple_bank_sub2/1aw1+01.cif:WATER:R:C:2\n", 524 | "Select one of the following options to proceed:\n", 525 | " 1) Perform new search\n", 526 | " 2) Accept planned concatenation\n", 527 | " \n", 528 | "0\n", 529 | "Congratulations! 
You have successfully constructed your\n", 530 | " conversion templates.You can proceed to the next section\n", 531 | " by selection option 6 or, continue to edit your conversion\n", 532 | " template through this menu\n", 533 | " \n", 534 | "PDBClean Concatenations Menu\n", 535 | " -------------------------------\n", 536 | " This menu appeared because you have assigned the same chain name to two (or more) entities.\n", 537 | " Note that this will concatenate the entities. So you need to either re-assign chain names,\n", 538 | " or ACCEPT concatenations.\n", 539 | "\n", 540 | " Note: All proposed concatenations must be accepted (by running option 4 or 5) before the curation can\n", 541 | " be completed.\n", 542 | "\n", 543 | " Before you do anything, we suggest to choose option 2, so you know which concatenations have not\n", 544 | " been accepted. It will also give you the proper format of the input for option 3.\n", 545 | "\n", 546 | " If you are sure that all the concatenations are correct. Option 5 will accept all of them. They \n", 547 | " will be printed to screen as they are being accepted. \n", 548 | "\n", 549 | " Select one of the following options to proceed:\n", 550 | " 1) Show all conversions\n", 551 | " 2) Show only unaccepted concatenations\n", 552 | " 3) Search and modify destination chainIDs of proposed concatenations\n", 553 | " 4) Accept proposed concatenation one by one\n", 554 | " (Repeat this step until finalizing option appears)\n", 555 | " 5) Accept ALL (BE CAREFUL, make sure you agree with all concatenations)\n", 556 | " \n", 557 | " 6) Finalize Curation\n", 558 | "Option Number: Finalizing Curation ...\n" 559 | ] 560 | } 561 | ], 562 | "source": [ 563 | "! echo \"2\\n5\\nA\\nD\\nD\\nD,D\\nC,C\\nA,B\\nD\\nD,D\\n7\\n5\\n6\" | PDBClean_MolID_CIF.py $PROJDIR/simple_bank_sub2 $PROJDIR/standard_MolID_bank2" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "a65e7eaa", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [] 573 | } 574 | ], 575 | "metadata": { 576 | "kernelspec": { 577 | "display_name": "PDBCleanV2", 578 | "language": "python", 579 | "name": "pdbcleanv2" 580 | }, 581 | "language_info": { 582 | "codemirror_mode": { 583 | "name": "ipython", 584 | "version": 3 585 | }, 586 | "file_extension": ".py", 587 | "mimetype": "text/x-python", 588 | "name": "python", 589 | "nbconvert_exporter": "python", 590 | "pygments_lexer": "ipython3", 591 | "version": "3.10.5" 592 | } 593 | }, 594 | "nbformat": 4, 595 | "nbformat_minor": 5 596 | } 597 | --------------------------------------------------------------------------------