├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── examples │ ├── ACRDIN11.json │ ├── ACRDIN12.json │ ├── ACSALA24.json │ ├── ACSALA32.json │ ├── ACSALA35.json │ └── example_structures_filter_data.json └── source │ ├── analysis.rst │ ├── conf.py │ ├── features.rst │ ├── index.rst │ ├── installation.rst │ ├── procedure.rst │ └── requirements.txt ├── input_files └── input_data_extraction.txt ├── source_code ├── Crystal_Math_CSA.ipynb ├── create_reference_fragments.py ├── csd_data_extraction.py ├── csd_operations.py ├── generate_molecule_fragments.py ├── get_analysis_data.py ├── get_structure_data.py ├── get_structures_list.py ├── input_checks.py ├── io_operations.py ├── maths.py ├── space_group_operations.py ├── structure_operations.py ├── utilities.py └── visualize.py └── source_data ├── atomic_properties.json ├── close_contacts_properties.json ├── fragment_list.json ├── fragments_geometry_data.txt ├── reference_fragment_list.json ├── space_group_properties.json └── variables.json /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for Sphinx projects 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the OS, Python version and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.9" 12 | 13 | # Build documentation in the "docs/" directory with Sphinx 14 | sphinx: 15 | configuration: docs/source/conf.py 16 | 17 | # Optionally build your docs in additional formats such as PDF and ePub 18 | formats: 19 | - pdf 20 | - epub 21 | 22 | # Python requirements 23 | python: 24 | install: 25 | - requirements: docs/source/requirements.txt 26 | 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, 
Nikolaos Galanakis 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CrystalMath 2 | 3 | ## Tools for systematic exploration of the molecular structures in the CSD database towards a topology-based CSP 4 | 5 | CrystalMath provides a comprehensive statistical analysis of molecular crystal structures from the CSD database and custom structures in the *.cif format. It offers deep insights into molecular packing trends, intermolecular interactions, and the topological nuances that dictate these patterns. 6 | 7 | The algorithm begins with a systematic exploration of the CSD, extracting and analyzing topological and geometrical data. This method integrates a fundamental understanding that molecular crystals conform to specific geometrical constraints and topological patterns. Through statistical analysis, CrystalMath derives logical rules and predictive models that enhance our understanding of molecular structures. 
8 | 9 | **For comprehensive documentation, including installation, usage, and examples, please visit the** [**CrystalMath documentation on Read the Docs**](https://crystal-math.readthedocs.io/en/latest/index.html) 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/examples/example_structures_filter_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "ACRDIN11": { 3 | "space_group": "P21/n", 4 | "z_crystal": 4.0, 5 | "z_prime": 1.0, 6 | "species": ["C","H","N"], 7 | "fragments": ["acridin"], 8 | "contact_pairs": [ 9 | ["C","C","vdW",true], 10 | ["C","H","vdW",true], 11 | ["H","C","vdW",true], 12 | ["H","H","vdW",false], 13 | ["C","N","vdW",false], 14 | ["N","C","vdW",false], 15 | ["C","H","vdW",false], 16 | ["H","C","vdW",false], 17 | ["H","N","vdW",true], 18 | ["N","H","vdW",true], 19 | ["C","C","vdW",false], 20 | ["H","H","vdW",true], 21 | ["C","N","vdW",true], 22 | ["N","C","vdW",true] 23 | ], 24 | "contact_central_fragments": [ 25 | ["acridin","vdW",true], 26 | ["acridin","vdW",false] 27 | ], 28 | "contact_fragment_pairs": [ 29 | ["acridin","acridin","vdW",true], 30 | ["acridin","acridin","vdW",false] 31 | ] 32 | }, 33 | "ACRDIN12": { 34 | "space_group": "P21/n", 35 | "z_crystal": 4.0, 36 | "z_prime": 1.0, 37 | "species": ["C","H","N"], 38 | "fragments": ["acridin"], 39 | "contact_pairs": [ 40 | ["C","C","vdW",true], 41 | ["C","C","vdW",false], 42 | ["H","C","vdW",true], 43 | ["C","H","vdW",true], 44 | ["H","C","vdW",false], 45 | ["C","H","vdW",false], 46 | ["H","H","vdW",false], 47 | ["H","H","vdW",true], 48 | ["N","C","vdW",false], 49 | ["C","N","vdW",false], 50 | ["N","H","vdW",true], 51 | ["H","N","vdW",true], 52 | ["N","C","vdW",true], 53 | ["C","N","vdW",true] 54 | ], 55 | "contact_central_fragments": [ 56 | ["acridin","vdW",true], 57 | ["acridin","vdW",false] 58 | ], 59 | "contact_fragment_pairs": [ 60 | ["acridin","acridin","vdW",true], 61 | 
["acridin","acridin","vdW",false] 62 | ] 63 | }, 64 | "ACSALA24": { 65 | "space_group": "P21/c", 66 | "z_crystal": 8.0, 67 | "z_prime": 2.0, 68 | "species": ["C","H","O"], 69 | "fragments": ["benzene","carboxylic_acid","ester_aromatic-aliphatic"], 70 | "contact_pairs": [ 71 | ["C","C","vdW",true], 72 | ["C","O","vdW",false], 73 | ["O","C","vdW",false], 74 | ["C","H","vdW",false], 75 | ["H","C","vdW",false], 76 | ["C","C","vdW",false], 77 | ["C","O","vdW",true], 78 | ["O","C","vdW",true], 79 | ["O","O","vdW",false], 80 | ["O","H","vdW",true], 81 | ["H","O","vdW",true], 82 | ["O","O","vdW",true], 83 | ["C","H","vdW",true], 84 | ["H","C","vdW",true], 85 | ["H","H","vdW",false], 86 | ["O","O","hbond",false], 87 | ["O","H","vdW",false], 88 | ["H","O","vdW",false], 89 | ["O","H","hbond",true], 90 | ["H","O","hbond",true], 91 | ["H","H","vdW",true] 92 | ], 93 | "contact_central_fragments": [ 94 | ["benzene","vdW",true], 95 | ["carboxylic_acid","vdW",true], 96 | ["ester_aromatic-aliphatic","vdW",true], 97 | ["benzene","vdW",false], 98 | ["carboxylic_acid","vdW",false], 99 | ["ester_aromatic-aliphatic","vdW",false], 100 | ["carboxylic_acid","hbond",false], 101 | ["carboxylic_acid","hbond",true] 102 | ], 103 | "contact_fragment_pairs": [ 104 | ["benzene","benzene","vdW",true], 105 | ["benzene","ester_aromatic-aliphatic","vdW",true], 106 | ["carboxylic_acid","benzene","vdW",true], 107 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",true], 108 | ["ester_aromatic-aliphatic","benzene","vdW",true], 109 | ["benzene","carboxylic_acid","vdW",true], 110 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",true], 111 | ["benzene","ester_aromatic-aliphatic","vdW",false], 112 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",false], 113 | ["ester_aromatic-aliphatic","benzene","vdW",false], 114 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",false], 115 | ["benzene","benzene","vdW",false], 116 | ["carboxylic_acid","benzene","vdW",false], 117 | 
["benzene","carboxylic_acid","vdW",false], 118 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",false], 119 | ["carboxylic_acid","carboxylic_acid","vdW",true], 120 | ["carboxylic_acid","carboxylic_acid","vdW",false], 121 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",true], 122 | ["carboxylic_acid","carboxylic_acid","hbond",false], 123 | ["carboxylic_acid","carboxylic_acid","hbond",true] 124 | ] 125 | }, 126 | "ACSALA32": { 127 | "space_group": "P21/c", 128 | "z_crystal": 4.0, 129 | "z_prime": 1.0, 130 | "species": ["C","H","O"], 131 | "fragments": ["benzene","carboxylic_acid","ester_aromatic-aliphatic"], 132 | "contact_pairs": [ 133 | ["O","C","vdW",true], 134 | ["C","O","vdW",true], 135 | ["O","H","vdW",false], 136 | ["H","O","vdW",false], 137 | ["H","C","vdW",false], 138 | ["C","H","vdW",false], 139 | ["O","C","vdW",false], 140 | ["C","O","vdW",false], 141 | ["O","H","vdW",true], 142 | ["H","O","vdW",true], 143 | ["C","C","vdW",false], 144 | ["C","C","vdW",true], 145 | ["C","H","vdW",true], 146 | ["H","C","vdW",true], 147 | ["H","H","vdW",false], 148 | ["H","H","vdW",true], 149 | ["O","O","vdW",false], 150 | ["O","O","hbond",false], 151 | ["H","O","hbond",true], 152 | ["O","H","hbond",true] 153 | ], 154 | "contact_central_fragments": [ 155 | ["carboxylic_acid","vdW",true], 156 | ["benzene","vdW",true], 157 | ["carboxylic_acid","vdW",false], 158 | ["benzene","vdW",false], 159 | ["ester_aromatic-aliphatic","vdW",false], 160 | ["ester_aromatic-aliphatic","vdW",true], 161 | ["carboxylic_acid","hbond",false], 162 | ["carboxylic_acid","hbond",true] 163 | ], 164 | "contact_fragment_pairs": [ 165 | ["carboxylic_acid","benzene","vdW",true], 166 | ["benzene","carboxylic_acid","vdW",true], 167 | ["carboxylic_acid","benzene","vdW",false], 168 | ["benzene","carboxylic_acid","vdW",false], 169 | ["ester_aromatic-aliphatic","benzene","vdW",false], 170 | ["benzene","ester_aromatic-aliphatic","vdW",false], 171 | 
["ester_aromatic-aliphatic","benzene","vdW",true], 172 | ["benzene","ester_aromatic-aliphatic","vdW",true], 173 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",true], 174 | ["benzene","benzene","vdW",false], 175 | ["benzene","benzene","vdW",true], 176 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",false], 177 | ["carboxylic_acid","carboxylic_acid","vdW",false], 178 | ["carboxylic_acid","carboxylic_acid","hbond",false], 179 | ["carboxylic_acid","carboxylic_acid","vdW",true], 180 | ["carboxylic_acid","carboxylic_acid","hbond",true], 181 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",true], 182 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",true], 183 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",false], 184 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",false] 185 | ] 186 | }, 187 | "ACSALA35": { 188 | "space_group": "P21/c", 189 | "z_crystal": 4.0, 190 | "z_prime": 1.0, 191 | "species": ["C","H","O"], 192 | "fragments": ["benzene","carboxylic_acid","ester_aromatic-aliphatic"], 193 | "contact_pairs": [ 194 | ["C","O","vdW",false], 195 | ["O","C","vdW",false], 196 | ["C","H","vdW",false], 197 | ["H","C","vdW",false], 198 | ["C","C","vdW",true], 199 | ["H","O","vdW",true], 200 | ["O","H","vdW",true], 201 | ["C","O","vdW",true], 202 | ["O","C","vdW",true], 203 | ["C","H","vdW",true], 204 | ["H","C","vdW",true], 205 | ["C","C","vdW",false], 206 | ["H","O","vdW",false], 207 | ["O","H","vdW",false], 208 | ["H","H","vdW",true], 209 | ["O","O","vdW",false], 210 | ["O","O","hbond",false], 211 | ["H","O","hbond",true], 212 | ["O","H","hbond",true] 213 | ], 214 | "contact_central_fragments": [ 215 | ["benzene","vdW",false], 216 | ["carboxylic_acid","vdW",false], 217 | ["benzene","vdW",true], 218 | ["carboxylic_acid","vdW",true], 219 | ["ester_aromatic-aliphatic","vdW",false], 220 | ["ester_aromatic-aliphatic","vdW",true], 221 | ["carboxylic_acid","hbond",false], 222 | ["carboxylic_acid","hbond",true] 223 | ], 224 | 
"contact_fragment_pairs": [ 225 | ["benzene","carboxylic_acid","vdW",false], 226 | ["carboxylic_acid","benzene","vdW",false], 227 | ["benzene","carboxylic_acid","vdW",true], 228 | ["carboxylic_acid","benzene","vdW",true], 229 | ["benzene","ester_aromatic-aliphatic","vdW",false], 230 | ["ester_aromatic-aliphatic","benzene","vdW",false], 231 | ["benzene","ester_aromatic-aliphatic","vdW",true], 232 | ["ester_aromatic-aliphatic","benzene","vdW",true], 233 | ["benzene","benzene","vdW",false], 234 | ["benzene","benzene","vdW",true], 235 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",false], 236 | ["ester_aromatic-aliphatic","ester_aromatic-aliphatic","vdW",true], 237 | ["carboxylic_acid","carboxylic_acid","vdW",true], 238 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",true], 239 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",true], 240 | ["carboxylic_acid","carboxylic_acid","vdW",false], 241 | ["carboxylic_acid","carboxylic_acid","hbond",false], 242 | ["carboxylic_acid","carboxylic_acid","hbond",true], 243 | ["ester_aromatic-aliphatic","carboxylic_acid","vdW",false], 244 | ["carboxylic_acid","ester_aromatic-aliphatic","vdW",false] 245 | ] 246 | } 247 | } -------------------------------------------------------------------------------- /docs/source/analysis.rst: -------------------------------------------------------------------------------- 1 | Post extraction analysis 2 | ======================== 3 | This section outlines the default post-extraction analysis tools. 4 | The purpose of this tool is to perform qualitative and quantitative analysis of the structure, fragment, contact and hydrogen bond data for the selected group of structures. 5 | The tool is designed to create scatter plots for pairs of parameters and histograms for the parameters extracted during the data extraction process. 
6 | 7 | For the scatter plots, the algorithm calculates the correlation coefficients for the selected set of variables, while for the histograms, it offers the option to fit distributions to the selected data, and report the characteristics of the fitted curve. 8 | 9 | The Data Analysis Input File 10 | ---------------------------- 11 | The first step is to modify the ``input_data_analysis.txt`` file based on the required criteria. The general format of the file and descriptions of each parameter are as follows: 12 | 13 | Input File Format 14 | ^^^^^^^^^^^^^^^^^ 15 | The configuration should be specified in JSON format as shown below: 16 | 17 | .. code-block:: json 18 | 19 | { 20 | "plots_directory": "../csd_db_analysis/visualize/", 21 | "data_directory": "../csd_db_analysis/db_data/", 22 | "data_prefix": "homomolecular", 23 | "folder": "contacts_carboxylic-acid_carboxylic-acid_OH_hb_Zprime_1", 24 | "figure_size": [5,3.75], 25 | "save_figs": false, 26 | "data_filters": { 27 | "space_group": { 28 | "is_active": false, 29 | "type": "single", 30 | "values": ["P21/c","P21/n"], 31 | "operator": "or", 32 | "refine_data": false 33 | }, 34 | "z_crystal": { 35 | "is_active": false, 36 | "type": "single", 37 | "values": [4,8], 38 | "operator": "or", 39 | "refine_data": false 40 | }, 41 | "z_prime": { 42 | "is_active": true, 43 | "type": "single", 44 | "values": [1], 45 | "operator": "or", 46 | "refine_data": false 47 | }, 48 | "species": { 49 | "is_active": false, 50 | "type": "multiple", 51 | "values": ["C","H","N","O"], 52 | "operator": "or", 53 | "refine_data": false 54 | }, 55 | "fragments": { 56 | "is_active": true, 57 | "type": "multiple", 58 | "values": [ 59 | "carboxylic_acid", 60 | // ... 61 | ], 62 | "operator": "and", 63 | "refine_data": true 64 | }, 65 | "contact_pairs": { 66 | "is_active": true, 67 | "type": "multiple_lists", 68 | "values": [ 69 | ["O","H","hbond",true], 70 | // ... 
71 | ], 72 | "operator": "or", 73 | "refine_data": true 74 | }, 75 | "contact_central_fragments": { 76 | "is_active": true, 77 | "type": "multiple_lists", 78 | "values": [ 79 | ["carboxylic_acid","hbond",true] 80 | // ... 81 | ], 82 | "operator": "or", 83 | "refine_data": true 84 | }, 85 | "contact_fragment_pairs": { 86 | "is_active": true, 87 | "type": "multiple_lists", 88 | "values": [ 89 | ["carboxylic_acid","carboxylic_acid","hbond",true], 90 | // ... 91 | ], 92 | "operator": "and", 93 | "refine_data": true 94 | } 95 | }, 96 | "plot_data_options": { 97 | "individual_space_groups_plots": true, 98 | "interactive": true, 99 | "percentiles": [[10,25,50,75,90],true,true,true], 100 | "2D_scatter": [ 101 | ["cell_length_b_sc","cell_length_c_sc",null], 102 | // ... 103 | ], 104 | "2D_scatter_marker": "o", 105 | "2D_scatter_facecolor": "whitesmoke", 106 | "2D_scatter_edgecolor": "black", 107 | "2D_scatter_opacity": 1.0, 108 | "3D_scatter": [ 109 | ["cc_contact_atom_ref_bv_x","cc_contact_atom_ref_bv_y","cc_contact_atom_ref_bv_z",null], 110 | // ... 111 | ], 112 | "3D_scatter_marker": "o", 113 | "3D_scatter_facecolor": "whitesmoke", 114 | "3D_scatter_edgecolor": "black", 115 | "3D_scatter_opacity": 1.0, 116 | "histogram": [ 117 | ["cc_length",null,false], 118 | // ... 119 | ], 120 | "histogram_density": false, 121 | "titles": false 122 | } 123 | } 124 | 125 | 126 | Key Descriptions 127 | ^^^^^^^^^^^^^^^^ 128 | - ``plots_directory`` 129 | Specifies the directory where plots will be saved. Using the default option is recommended. 130 | - ``data_directory`` 131 | The directory where the extracted data is stored. It must match the ``"save_directory"`` specified in the ``input_data_extraction.json`` file. 132 | - ``data_prefix`` 133 | A prefix applied to output files to facilitate their identification. This must be consistent with the ``"data_prefix"`` in the ``input_data_extraction.json`` file. 
134 | - ``figure_size`` 135 | Defines the dimensions of exported figures in inches, formatted as :math:`(W \times H)`. The default Matplotlib size is :math:`(6.4 \times 4.8)`. To place two figures side by side in a 12-inch wide document using an 11pt font, the optimal size is :math:`(5.0 \times 3.75)`. Adjust dimensions according to your document's specific requirements. 136 | - ``data_filters`` 137 | Details for filtering structures for the analysis. Structures can be filtered based on: 138 | 139 | - **Space group** 140 | The space group of the structure. 141 | - :math:`Z` **value** 142 | The total number of molecules in the unit cell (Number of symmetry operations) :math:`\times` (Number of molecules in the asymmetric unit). 143 | - :math:`Z^{\prime}` **value** 144 | The number of molecules in the asymmetric unit. 145 | - **Atomic species** 146 | The different atomic species found in the structure. 147 | - **Fragments** 148 | The different fragments found in the structure. 149 | - **Contact atomic pairs** 150 | The different atomic pairs found for the contacts in the structure. 151 | - **Contact central fragments** 152 | The different central fragments for the contacts in the structure. 153 | - **Contact fragment pairs** 154 | The different fragment pairs found for the contacts in the structure. 155 | 156 | Each filter has 5 options: 157 | 158 | - ``is_active`` 159 | Set to ``true`` to activate the filter. Setting to ``false`` will deactivate the filter. 160 | - ``type`` 161 | The type of the filter. The available options are 162 | 163 | - ``single`` 164 | A structure is characterized by a single specific value for the variable (for example the space group). 165 | - ``multiple`` 166 | A structure is characterized by a list of values for the specific variable (for example the atomic species in the structure). 
167 | - ``multiple_lists`` 168 | A structure is characterized by a list of values for the specific variable, but each value is now a list (for example the contact pairs in the structure, where each contact pair is characterized by the species of the central atom, the species of the contact atom, the type of the contact and a boolean that states if the contact is in line of sight). 169 | 170 | - ``values`` 171 | A list (or a list of lists) for the allowed values. 172 | - ``operator`` 173 | The available options are 174 | 175 | - ``"or"`` 176 | The filter will check for structures that have **any** of the declared values, 177 | - ``"and"`` 178 | The filter will check for structures that have **all** the declared values, 179 | 180 | - ``refine_data`` 181 | Set to ``true`` to refine the data for all the components in the structure based on the values of the filter. 182 | 183 | - ``plot_data_options`` 184 | Details the plotting options: 185 | 186 | - ``individual_space_groups_plots`` 187 | Set to ``true`` to create plots across all space groups and for each space group separately. 188 | 189 | - ``interactive`` 190 | Set to ``true`` to create interactive ``*.html`` plots with the plotly package. (Currently this is the only option supported. Currently developing a routine to generate publication-ready ``*.png`` plots). 191 | 192 | - ``percentiles`` 193 | The options to calculate the kde density for the 2D and 3D scatter plots. The format for the values includes a list of integers (or floats) representing the desired percentiles followed by 3 booleans. Each boolean activates the creation of the lowest percentile (in the example the 10%), the middle percentiles (25%, 50%, 75%), and the top percentile (90%). For the interactive ``*.html`` plots, it is recommended to set all options to ``true`` as the interactive plots allow to toggle on/off the different percentiles. For static ``*.png`` images, the booleans should be adjusted to include the desired percentiles in the plots. 
194 | 195 | - ``2D_scatter``/``3D_scatter`` 196 | A list of the requested 2D/3D scatter plots to be generated. Each entry has the format ``[variable_1, variable_2, group_variable]``/``[variable_1, variable_2, variable_3, group_variable]``. The ``variable_1``, ``variable_2`` and ``variable_3`` are the variables used for the scatter plots. The entry ``group_variable`` declares the variable to group data and plot them separately based on the values of the group variable. Setting ``group_variable`` to ``null`` generates a single plot for the full set of selected data. The group variable can take different values depending on the nature of ``variable_1``, ``variable_2``, ``variable_3``. 197 | 198 | - ``2D_scatter_marker``/``3D_scatter_marker`` 199 | The marker for the data points (static images only). For the available options please refer to the `official matplotlib documentation `_. 200 | 201 | - ``2D_scatter_facecolor``/``3D_scatter_facecolor`` 202 | The marker face color for the data points (static images only). For the available options please refer to the `official matplotlib documentation `_. 203 | 204 | - ``2D_scatter_edgecolor``/``3D_scatter_edgecolor`` 205 | The marker edge color for the data points (static images only). For the available options please refer to the `official matplotlib documentation `_. 206 | 207 | - ``2D_scatter_opacity``/``3D_scatter_opacity`` 208 | The marker opacity for the data points (static images only). Can take a value in the range :math:`[0,1]`. 209 | 210 | - ``histogram`` 211 | A list of the requested histograms to be generated. Each entry has the format ``[variable, group_variable, fit_kde_curve]``. The ``group_variable`` works in a similar way as for the 2D/3D scatter plots. The ``fit_kde_curve`` can be set to ``true`` when we require to fit a kde curve to the histogram data. 212 | 213 | - ``histogram_density`` 214 | Setting to ``false`` will plot on the ``y`` axis the occurrences. Setting to ``true`` will plot the frequency. 
215 | 216 | List of available variables 217 | --------------------------- 218 | 219 | The available variables are included in the file ``variables.json`` located in the ``source_data`` folder. Currently, the algorithm supports 127 different variables grouped into 5 families (See details below). Details for each variable can be found in the `Data Extraction Procedure section `_. Each variable is described using a dictionary entry in the following format. 220 | 221 | .. code-block :: json 222 | 223 | "variable_name": { 224 | "latex_name": string, 225 | "html_name": string, 226 | "family": string, 227 | "path": [list of strings], 228 | "position_symmetry": [boolean,boolean,boolean,integer] 229 | } 230 | 231 | 232 | Key Descriptions 233 | ^^^^^^^^^^^^^^^^ 234 | 235 | - ``variable_name`` 236 | The name of the variable. Currently 127 variables are supported. 237 | 238 | - ``latex_name`` 239 | The name of the variable in LaTeX format used to render static ``*.png`` images. 240 | 241 | - ``html_name`` 242 | The name of the variable in html format used to render interactive ``*.html`` plots. 243 | 244 | - ``family`` 245 | The family of the variable. Currently the available variables are grouped into 5 different families based on the nature of the variable: 246 | 247 | - ``structure`` variable family (27 variables) 248 | 249 | Includes all the variables related to the general characteristics of the structure. 
250 | 251 | - ``str_id`` 252 | - ``space_group`` 253 | - ``z_crystal`` 254 | - ``z_prime`` 255 | - ``formula`` 256 | - ``species`` 257 | - ``cell_length_a`` 258 | - ``cell_length_b`` 259 | - ``cell_length_c`` 260 | - ``cell_length_a_sc`` 261 | - ``cell_length_b_sc`` 262 | - ``cell_length_c_sc`` 263 | - ``cell_angle_alpha`` 264 | - ``cell_angle_beta`` 265 | - ``cell_angle_gamma`` 266 | - ``cell_volume`` 267 | - ``cell_density`` 268 | - ``vdWFV`` 269 | - ``SAS`` 270 | - ``E_tot`` 271 | - ``E_el`` 272 | - ``E_vdW`` 273 | - ``E_vdW_at`` 274 | - ``E_vdW_rep`` 275 | - ``E_hb`` 276 | - ``E_hb_at`` 277 | - ``E_hb_rep`` 278 | 279 | - ``fragment`` variable family (52 variables) 280 | 281 | Includes all the variables related to the general characteristics of the fragments in the structure. 282 | 283 | - ``fragment`` 284 | - ``fragment_x`` 285 | - ``fragment_y`` 286 | - ``fragment_z`` 287 | - ``fragment_u`` 288 | - ``fragment_v`` 289 | - ``fragment_w`` 290 | - ``fragment_e1_x`` 291 | - ``fragment_e1_y`` 292 | - ``fragment_e1_z`` 293 | - ``fragment_e1_u`` 294 | - ``fragment_e1_v`` 295 | - ``fragment_e1_w`` 296 | - ``fragment_w11_u`` 297 | - ``fragment_w11_v`` 298 | - ``fragment_w11_w`` 299 | - ``fragment_w12_u`` 300 | - ``fragment_w12_v`` 301 | - ``fragment_w12_w`` 302 | - ``fragment_w1_angle_1`` 303 | - ``fragment_w1_angle_2`` 304 | - ``fragment_e1_d_min`` 305 | - ``fragment_e2_x`` 306 | - ``fragment_e2_y`` 307 | - ``fragment_e2_z`` 308 | - ``fragment_e2_u`` 309 | - ``fragment_e2_v`` 310 | - ``fragment_e2_w`` 311 | - ``fragment_w21_u`` 312 | - ``fragment_w21_v`` 313 | - ``fragment_w21_w`` 314 | - ``fragment_w22_u`` 315 | - ``fragment_w22_v`` 316 | - ``fragment_w22_w`` 317 | - ``fragment_w2_angle_1`` 318 | - ``fragment_w2_angle_2`` 319 | - ``fragment_e2_d_min`` 320 | - ``fragment_e3_x`` 321 | - ``fragment_e3_y`` 322 | - ``fragment_e3_z`` 323 | - ``fragment_e3_u`` 324 | - ``fragment_e3_v`` 325 | - ``fragment_e3_w`` 326 | - ``fragment_w31_u`` 327 | - ``fragment_w31_v`` 328 | - 
``fragment_w31_w`` 329 | - ``fragment_w32_u`` 330 | - ``fragment_w32_v`` 331 | - ``fragment_w32_w`` 332 | - ``fragment_w3_angle_1`` 333 | - ``fragment_w3_angle_2`` 334 | - ``fragment_e3_d_min`` 335 | 336 | - ``fragment_atom`` variable family (14 variables) 337 | 338 | Includes all the variables related to the characteristics of the atoms in each fragment. 339 | 340 | - ``fragment_atom_species`` 341 | - ``fragment_atom_x`` 342 | - ``fragment_atom_y`` 343 | - ``fragment_atom_z`` 344 | - ``fragment_atom_u`` 345 | - ``fragment_atom_v`` 346 | - ``fragment_atom_w`` 347 | - ``fragment_atom_bv_x`` 348 | - ``fragment_atom_bv_y`` 349 | - ``fragment_atom_bv_z`` 350 | - ``fragment_atom_bv_u`` 351 | - ``fragment_atom_bv_v`` 352 | - ``fragment_atom_bv_w`` 353 | - ``fragment_atom_dzzp_min`` 354 | 355 | - ``contact`` variable family (3 variables) 356 | 357 | Includes all the variables related to the general characteristics of the close contacts in the structure. 358 | 359 | - ``cc_length`` 360 | - ``cc_type`` 361 | - ``cc_is_in_los`` 362 | 363 | - ``contact_atom`` variable family (31 variables) 364 | 365 | Includes all the variables related to the atoms forming the close contacts in the structure. 
366 | 367 | - ``cc_central_atom_species`` 368 | - ``cc_central_atom_fragment`` 369 | - ``cc_central_atom_x`` 370 | - ``cc_central_atom_y`` 371 | - ``cc_central_atom_z`` 372 | - ``cc_central_atom_u`` 373 | - ``cc_central_atom_v`` 374 | - ``cc_central_atom_w`` 375 | - ``cc_central_atom_bv_x`` 376 | - ``cc_central_atom_bv_y`` 377 | - ``cc_central_atom_bv_z`` 378 | - ``cc_central_atom_ref_bv_x`` 379 | - ``cc_central_atom_ref_bv_y`` 380 | - ``cc_central_atom_ref_bv_z`` 381 | - ``cc_contact_atom_species`` 382 | - ``cc_contact_atom_fragment`` 383 | - ``cc_contact_atom_x`` 384 | - ``cc_contact_atom_y`` 385 | - ``cc_contact_atom_z`` 386 | - ``cc_contact_atom_u`` 387 | - ``cc_contact_atom_v`` 388 | - ``cc_contact_atom_w`` 389 | - ``cc_contact_atom_bv_x`` 390 | - ``cc_contact_atom_bv_y`` 391 | - ``cc_contact_atom_bv_z`` 392 | - ``cc_contact_atom_ref_bv_x`` 393 | - ``cc_contact_atom_ref_bv_y`` 394 | - ``cc_contact_atom_ref_bv_z`` 395 | - ``cc_contact_atom_ref_bv_r`` 396 | - ``cc_contact_atom_ref_bv_theta`` 397 | - ``cc_contact_atom_ref_bv_phi`` 398 | - ``path`` 399 | List of strings pointing to the location of the value for each variable within each structure dictionary. 400 | 401 | - ``position_symmetry`` 402 | The symmetry operations that are applied to get the complete set of values for a crystal. The first boolean declares if a rotation operation is applied to the variable and is ``true`` only for :math:`(x,y,z)` or :math:`(u,v,w)` related coordinates. The second boolean is ``true`` when translational symmetry is applied and the third is ``true`` for variables that are restricted within the limits of the unit cell (such as the fractional atomic coordinates). The fourth entry in the list, is an integer declaring the group ID for each variable. If set to ``-1`` the variable is not part of a group. 
If set to ``0`` the variable is a member of the structure geometry variables :math:`(a,b,c,\alpha,\beta,\gamma,\Omega)` that are required to apply coordinate transformations to any positional variable. If set to an integer :math:`>0`, the variable is part of a specific group of connected positional variables, such as the coordinates of an atom. There are 24 groups of variables: 403 | 404 | - ``1``. ``['cc_central_atom_x', 'cc_central_atom_y', 'cc_central_atom_z']`` 405 | - ``2``. ``['cc_central_atom_u', 'cc_central_atom_v', 'cc_central_atom_w']`` 406 | - ``3``. ``['cc_central_atom_bv_x', 'cc_central_atom_bv_y', 'cc_central_atom_bv_z']`` 407 | - ``4``. ``['cc_contact_atom_x', 'cc_contact_atom_y', 'cc_contact_atom_z']`` 408 | - ``5``. ``['cc_contact_atom_u', 'cc_contact_atom_v', 'cc_contact_atom_w']`` 409 | - ``6``. ``['cc_contact_atom_bv_x', 'cc_contact_atom_bv_y', 'cc_contact_atom_bv_z']`` 410 | - ``7``. ``['fragment_x', 'fragment_y', 'fragment_z']`` 411 | - ``8``. ``['fragment_u', 'fragment_v', 'fragment_w']`` 412 | - ``9``. ``['fragment_e1_x', 'fragment_e1_y', 'fragment_e1_z']`` 413 | - ``10``. ``['fragment_e1_u', 'fragment_e1_v', 'fragment_e1_w']`` 414 | - ``11``. ``['fragment_w11_u', 'fragment_w11_v', 'fragment_w11_w']`` 415 | - ``12``. ``['fragment_w12_u', 'fragment_w12_v', 'fragment_w12_w']`` 416 | - ``13``. ``['fragment_e2_x', 'fragment_e2_y', 'fragment_e2_z']`` 417 | - ``14``. ``['fragment_e2_u', 'fragment_e2_v', 'fragment_e2_w']`` 418 | - ``15``. ``['fragment_w21_u', 'fragment_w21_v', 'fragment_w21_w']`` 419 | - ``16``. ``['fragment_w22_u', 'fragment_w22_v', 'fragment_w22_w']`` 420 | - ``17``. ``['fragment_e3_x', 'fragment_e3_y', 'fragment_e3_z']`` 421 | - ``18``. ``['fragment_e3_u', 'fragment_e3_v', 'fragment_e3_w']`` 422 | - ``19``. ``['fragment_w31_u', 'fragment_w31_v', 'fragment_w31_w']`` 423 | - ``20``. ``['fragment_w32_u', 'fragment_w32_v', 'fragment_w32_w']`` 424 | - ``21``. 
``['fragment_atom_x', 'fragment_atom_y', 'fragment_atom_z']`` 425 | - ``22``. ``['fragment_atom_u', 'fragment_atom_v', 'fragment_atom_w']`` 426 | - ``23``. ``['fragment_atom_bv_x', 'fragment_atom_bv_y', 'fragment_atom_bv_z']`` 427 | - ``24``. ``['fragment_atom_bv_u', 'fragment_atom_bv_v', 'fragment_atom_bv_w']`` 428 | 429 | In case a positional variable from the above lists is selected to be displayed in any 2D/3D scatter plot, the algorithm adds the values for all the variables in the same group as well as the variables in group ``0`` to the analysis data to be able to perform the necessary coordinate transformations. 430 | 431 | Example usage of the filters 432 | ---------------------------- 433 | 434 | The filters for the analysis are designed in a way to facilitate detailed analysis of any of the available variables in refined sets of data consistent with the needs of every user. The correct combination of the filters is crucial in order to analyze the correct set of data. Below we provide examples on how to use the filters in different scenarios: 435 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'CrystalMath' 21 | copyright = '2024, Nikolaos Galanakis' 22 | author = 'Nikolaos Galanakis' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx.ext.mathjax' 32 | ] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # List of patterns, relative to source directory, that match files and 38 | # directories to ignore when looking for source files. 39 | # This pattern also affects html_static_path and html_extra_path. 40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 41 | 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | 50 | # Add any paths that contain custom static files (such as style sheets) here, 51 | # relative to this directory. They are copied after the builtin static files, 52 | # so a file named "default.css" will overwrite the builtin "default.css". 53 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/source/features.rst: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | CrystalMath provides a comprehensive statistical analysis of molecular crystal structures from the CSD database and custom structures in the *.cif format. 
It offers deep insights into molecular packing trends, intermolecular interactions, and the topological nuances that dictate these patterns. 5 | 6 | The algorithm begins with a systematic exploration of the CSD, extracting and analyzing topological and geometrical data. This method integrates a fundamental understanding that molecular crystals conform to specific geometrical constraints and topological patterns. Through statistical analysis, CrystalMath derives logical rules and predictive models that enhance our understanding of molecular structures. 7 | 8 | This section outlines the main features of the Crystal Math software, which include analysis of existing structures within the CSD and predictions of molecular crystal structures. 9 | 10 | Analysis of Existing Structures within the CSD 11 | ---------------------------------------------- 12 | This feature represents the algorithm's investigative aspect, wherein it meticulously explores the repository of the CSD to extract and analyze structural data. The process employs a sophisticated fragment-based approach to assess molecular geometry, allowing it to discern subtle nuances and patterns within the crystal structures. 13 | 14 | This computational process is not merely a data retrieval mechanism. It involves the calculation of crucial geometrical and topological properties, including: 15 | 16 | - **Relative orientation** 17 | - **Plane intersections with unit cell vertices** 18 | - **Close contacts** 19 | - **Hydrogen bonds** 20 | - **Void analysis in the unit cell** 21 | 22 | These computations are invaluable, forming the bedrock of the dataset that the subsequent predictive stage will utilize. The adaptability of this feature allows researchers to set specific criteria, enabling the algorithm to target structures that bear direct relevance to their studies, thereby ensuring a customized, relevant, and rich analytical output. 
23 | 24 | Prediction of Molecular Crystal Structures 25 | ------------------------------------------ 26 | Building upon the robust foundation laid by the analytical phase, the prediction feature marks the algorithm’s leap into the realm of prospective crystallography. This innovative function does not merely extrapolate from existing data but employs a rigorous mathematical, geometrical, and topological framework to envision and predict feasible crystal structures. 27 | 28 | Bypassing traditional methods that rely heavily on force fields and energy calculations, this feature stands out due to its unique approach, essentially rewriting the rules of crystal structure prediction. By utilizing the detailed insights gleaned from the analysis of existing CSD structures, the algorithm assesses countless possibilities and predicts structures that are not just theoretically plausible but ripe for synthesis and experimental verification. 29 | 30 | Detailed Analysis of Existing CSD Structures 31 | -------------------------------------------- 32 | The algorithm delves into the CSD, applying user-defined criteria to identify and analyze structures pertinent to your research. These criteria could range from the atomic species present in the crystal to more complex attributes such as: 33 | 34 | - Space group 35 | - :math:`Z^{\prime}` value 36 | - Molecular weight for the components within the asymmetric unit 37 | 38 | Fragment-Based Analysis 39 | ^^^^^^^^^^^^^^^^^^^^^^^ 40 | The script communicates with the CSD database, seeking structures that align with specific user-defined rules. Upon identifying the relevant structures, the algorithm proceeds to extract critical data, focusing particularly on geometric and topological properties that inform the subsequent prediction phase. 
41 | 42 | A pivotal aspect of the CSP Algorithm's analytical prowess hinges on its geometric interpretation of intermolecular forces, by extracting properties for the close contacts and hydrogen bonding within crystal structures. These interactions are not merely physical constraints but are insightful topological and energetic indicators that guide the strategic assembly of molecular crystals. 43 | 44 | Geometrical and Topological Properties Analysis 45 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 46 | 47 | The extracted data encompasses several key molecular aspects, with calculations and analyses including, but not limited to: 48 | 49 | - **Orientation Relative to Inertia Frame**: Assessing molecular and fragmentary orientation within the unit cell, referenced against their inertia frames. This analysis goes beyond simple spatial representation; it is a profound exploration of the positional relationship between molecular fragments and the encompassing lattice geometry. The algorithm calculates the orientations by establishing a molecule's inertia frame, a defined coordinate system based on the molecule's moment of inertia. This frame serves as a reference point, allowing for a standardized comparison of molecular orientations. With this approach, the algorithm can systematically analyze how different fragments within a molecule orient themselves relative to each other and their collective orientation within the unit cell. 50 | - **Relative positions of principal planes of inertia**: The algorithm computes the distances of certain points in a unit cell to the principal planes of inertia (planes perpendicular to the principal axes of inertia, passing through the center of mass of each fragment). This calculation is instrumental in understanding the molecule's spatial orientation and placement. 
51 | - **Inter-Fragment Correlations**: By observing the relative orientations of fragments within a molecule, the algorithm unveils potential correlations in geometric conformations. These insights are crucial for understanding the molecule's structural dynamics, offering clues about its stability, reactivity, or interactions with neighboring entities. 52 | - **Molecule-Unit Cell Interplay**: Expand the analysis to explore how the molecule fits and orients itself within the unit cell. This exploration can reveal critical insights into whether the molecule's orientation is influenced by the unit cell's geometric constraints, contributing to a deeper understanding of the crystal packing phenomena. 53 | - **Predictive Insights for New Structures**: By identifying trends and correlations between molecular orientation and unit cell geometry, the algorithm can hypothesize about probable orientations for molecules in novel crystal structures, providing a reliable foundation for anticipating the behavior of molecules in uncharted configurations. 54 | 55 | In essence, the orientation analysis relative to the inertia frame is not a mere calculation but a holistic examination of the molecule's spatial narrative. It provides contextual insights that are indispensable for predicting how new molecular assemblies might accommodate themselves within various lattice frameworks, essentially influencing the design strategy for new materials with desired properties. 56 | 57 | - **Close Contacts**: Traditional analysis of close contacts often stops at identifying distances shorter than the sum of van der Waals radii. However, the CSP Algorithm delves deeper, recognizing that the strength of these contacts is an extremely important topological property, intimately tied to the interaction energy's minimum. By examining a comprehensive matrix of atomic species pairs and their distribution across various space groups, the algorithm calculates the optimal strength of close contacts. 
In addition, it analyzes the spatial distribution of the contacts with respect to the center of mass for each fragment. This analysis provides a benchmark for constructing molecular crystals with judicious interatomic interactions, ensuring structural stability without compromising the lattice's integrity. These calculated parameters are instrumental during the prediction phase, where the algorithm utilizes this statistical backbone to forecast interaction energies, guiding the assembly of molecules within the crystal lattice in a manner that's energetically favorable. 58 | - **Hydrogen Bonds**: The analysis of the hydrogen bonds within the crystal matrix provides insights into their geometric configuration which is tied to their energetic profile. This understanding is crucial because hydrogen bonds impart significant directional character to molecular arrangements in crystal lattices, influencing both structure and properties. The CSP Algorithm evaluates the geometry of potential hydrogen bonds, ensuring not only geometric precision but also the right balance of strength and directionality in these interactions. This information is vital for constructing viable hydrogen-bonded networks, especially in complex molecular crystals where these interactions dictate structural feasibility and stability. 59 | - **Voids in Unit Cell**: Analyzing the van der Waals free volume and solvent-accessible surface within the crystal lattice provides insights into the potential for molecular movement, stability under pressure, or where guest molecules might reside. 60 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Crystal_Math documentation master file, created by 2 | sphinx-quickstart on Tue Apr 16 08:52:30 2024. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | Welcome to Crystal_Math's documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | features 14 | installation 15 | procedure 16 | analysis 17 | 18 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | We highly recommend using **Anaconda** for its ease of package management and environment handling, as it includes numerous scientific computing packages that facilitate a smoother setup process. 5 | 6 | Download and Install Anaconda 7 | ----------------------------- 8 | 9 | Visit the `Anaconda Distribution page `_ to download and install the distribution. Please ensure you download the version that includes ``Python 3.9`` or higher. 10 | 11 | Required Python Packages 12 | ------------------------ 13 | The following Python packages are necessary for running Crystal Math: 14 | 15 | - ``ast`` 16 | - ``datetime`` 17 | - ``itertools`` 18 | - ``json`` 19 | - ``matplotlib`` 20 | - ``networkx`` 21 | - ``numpy`` 22 | - ``os`` 23 | - ``scipy`` 24 | - ``re`` 25 | - ``time`` 26 | 27 | These can be installed using the following command: 28 | 29 | .. code-block:: bash 30 | 31 | pip install matplotlib networkx numpy scipy 32 | 33 | Note that some packages (``ast``, ``datetime``, ``itertools``, ``json``, ``os``, ``re``, ``time``) are part of the Python Standard Library and do not need installation via pip. 34 | 35 | Installing the CSD Python API 36 | ----------------------------- 37 | The current version requires the installation of the CSD Python API, which is crucial for the statistical analysis phase and for retrieving molecular structure data. Due to specific installation instructions and licensing, please refer to the `official installation notes `_ for detailed guidance. 
Adhere strictly to their guidelines to ensure full functionality within the CSP algorithm environment. 38 | 39 | Installing the code 40 | ------------------- 41 | The code itself requires **no installation** of additional software packages or libraries, other than Git for obtaining the code. Simply follow the steps below to clone the repository to your local machine and run the code directly. 42 | 43 | #. **Git**: Git is a version control system that lets you manage and keep track of your source code history. If you don't already have Git installed, you can download it from `the Git website `_. 44 | 45 | Cloning the Repository 46 | ^^^^^^^^^^^^^^^^^^^^^^ 47 | 48 | Cloning a repository means making a copy of the code on your local machine. This is done via Git. To clone the repository, follow these steps: 49 | 50 | 1. Open a terminal window. On Windows, you can search for ``CMD`` or ``Command Prompt`` in your start menu. On macOS, you can open the Terminal from your Applications folder under Utilities. 51 | 52 | 2. Use the following command to clone the repository: 53 | 54 | .. code-block:: bash 55 | 56 | git clone https://github.com/nigalanakis/Crystal_Math 57 | 58 | 3. After the cloning process is complete, navigate to the newly created directory: 59 | 60 | .. 
code-block:: bash 61 | 62 | cd Crystal_Math -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=5.0 2 | sphinx_rtd_theme==1.0.0 3 | 4 | -------------------------------------------------------------------------------- /input_files/input_data_extraction.txt: -------------------------------------------------------------------------------- 1 | { 2 | "data_directory": "../csd_db_analysis/db_data/", 3 | "data_prefix": "homomolecular", 4 | "get_refcode_families": false, 5 | "cluster_refcode_families": false, 6 | "get_unique_structures": false, 7 | "get_structure_data": false, 8 | "get_structure_filter_data": false, 9 | "unique_structures_clustering_method": "vdWFV", 10 | "structure_list": ["csd-unique", "all"], 11 | "structures_to_exclude": ["BALDUP", "CEMVAS", "DAGRIN", "DAHKUV", "FADGEW", "HUPCUT", "JIKXOT", "LUQDAE", "PEVLOR", "TEVYAV", "VIRLOY", "ZEPDAZ04"], 12 | "crystal_type": ["homomolecular"], 13 | "target_species": ["C", "H", "N", "O", "F", "Cl", "Br", "I", "P", "S"], 14 | "add_symmetric_positions": true, 15 | "target_space_groups": ["P1", "P-1", "P21", "C2", "Pc", "Cc", "P21/m", "C2/m", "P2/c", "P21/c", "P21/n", "C2/c", "P21212", "P212121", "Pca21", "Pna21", "Pbcn", "Pbca", "Pnma", "R-3", "I41/a"], 16 | "target_z_prime_values": [1, 2, 3, 4, 5], 17 | "target_fragments": [], 18 | "molecule_weight_limit": 500.0, 19 | "molecule_formal_charges": [0], 20 | "center_molecule": true, 21 | "add_full_component": true, 22 | "fragments_to_check_alignment": [], 23 | "proposed_vectors_n_max": 5 24 | } 25 | -------------------------------------------------------------------------------- /source_code/create_reference_fragments.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | from maths import calculate_inertia 5 | from maths import 
def create_reference_fragments():
    """
    Builds the space fixed reference fragments and stores them on disk.

    Every fragment found in ``../source_data/fragment_list.json`` is
    translated by the value returned from ``center_of_mass`` and projected
    on the inertia eigenvectors (after they have been passed through
    ``sort_eigenvectors`` and ``ensure_right_handed_coordinate_system``).
    The resulting dictionary is written to
    ``../source_data/reference_fragment_list.json``.

    Parameters
    ----------

    Returns
    -------
    reference_fragment_list : dict
        A dictionary with the space fixed reference fragments.
    """
    with open("../source_data/fragment_list.json", "r") as source_file:
        fragment_list = json.load(source_file)

    reference_fragment_list = {}
    for name, entry in fragment_list.items():
        masses = np.array(entry["mass"])
        positions = np.array(entry["coordinates"])

        # Atomic positions relative to the fragment center of mass.
        bond_vectors = positions - center_of_mass(masses, positions)

        # Inertia eigen-decomposition, then ordering and handedness fix.
        eigenvalues, eigenvectors = calculate_inertia(masses, bond_vectors)
        eigenvalues, eigenvectors = sort_eigenvectors(eigenvalues, eigenvectors)
        eigenvectors = ensure_right_handed_coordinate_system(eigenvectors)

        # Coordinates expressed in the inertia frame, rounded for storage.
        space_fixed = np.round(np.matmul(bond_vectors, eigenvectors), decimals=4)

        reference_fragment_list[name] = {
            "smarts": entry["smarts"],
            "species": entry["species"],
            "coordinates_sf": space_fixed.tolist(),
            "mass": entry["mass"],
            "atoms_to_align": entry["atoms_to_align"],
        }

    # Write the reference fragments to the json file
    with open('../source_data/reference_fragment_list.json', 'w') as target_file:
        json.dump(reference_fragment_list, target_file, indent=4)

    return reference_fragment_list
def main(input_file):
    """
    Runs the data extraction steps enabled in the input file.

    The input file is a JSON document. It is looked up as
    ``input_files/<input_file>`` first; when that path does not exist the
    sibling location ``../input_files/<input_file>`` is tried, because the
    other modules of the project use paths relative to the ``source_code``
    directory (e.g. ``../source_data/``).

    Parameters
    ----------
    input_file : str
        Name of the input file inside the ``input_files`` directory.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the input file exists in neither location.
    KeyError
        If one of the step flags is missing from the input file.
    """
    # Local import: this module does not import os at file level.
    import os

    # Load execution parameters
    input_path = 'input_files/' + input_file
    if not os.path.exists(input_path):
        fallback_path = os.path.join('..', 'input_files', input_file)
        if os.path.exists(fallback_path):
            input_path = fallback_path
    with open(input_path) as f:
        input_parameters = json.load(f)

    # Get the refcode families
    if input_parameters["get_refcode_families"]:
        print('Getting the CSD refcode families and the structures in each family.')
        get_refcode_families(input_parameters)

    # Cluster refcode families based on structure similarity.
    if input_parameters["cluster_refcode_families"]:
        print('Filter structures based on the user defined criteria and clustering refcode families members based on packing similarity.')
        cluster_refcode_families(input_parameters)

    # Get unique structures
    if input_parameters["get_unique_structures"]:
        print('Getting unique structures.')
        get_unique_structures(input_parameters)

    # Get structure data
    if input_parameters["get_structure_data"]:
        print('Getting structure data.')
        get_structure_data(input_parameters)

    # Get structure filter data
    if input_parameters["get_structure_filter_data"]:
        print('Getting structure filter data.')
        get_structure_filter_data(input_parameters)


if __name__ == "__main__":
    input_file = "input_data_extraction.txt"

    # Banner with the start time of the run.
    now = datetime.now()
    print('#' * 80)
    print('Crystal Math')
    print('A Mathematical and Geometrical Crystal Structure Analyis Protocol')
    print('-' * 80)
    print('Nikos Galanakis')
    print('Research Scientist')
    print('The Tuckerman Group')
    print('New York University')
    print('ng1807@nyu.edu')
    print('=' * 80)
    print("Process started at ", now.strftime("%Y-%m-%d %H:%M:%S"))
    print('-' * 80)

    # process_time measures CPU time of this process, not wall-clock time.
    start = timer()
    main(input_file)

    cpu_time = timer() - start
    hours, minutes, seconds = convert_seconds_to_hms(cpu_time)
    now = datetime.now()
    print("Process completed at ", now.strftime("%Y-%m-%d %H:%M:%S"))
    print(f"Total computation time: {hours}h {minutes}m {seconds:.2f}s")
def structure_check(input_parameters, crystal, molecule):
    '''
    Performs a check to see if a structure is consistent with the user
    defined requirements.

    The checks are applied in the following order: Z prime value, space
    group, assignment of bond types/hydrogens/partial charges, presence of
    atoms and of their coordinates, crystal type, formal charges (only for
    homomolecular crystals), molecular weight and atomic species.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the input parameters for the search. The keys
        read here are 'target_z_prime_values', 'target_space_groups',
        'crystal_type', 'molecule_formal_charges', 'molecule_weight_limit'
        and 'target_species'.
    crystal : csd object
        The csd crystal for the structure.
    molecule : csd_object
        The csd molecule for the structure. It is modified in place by the
        bond type, hydrogen and partial charge assignment.

    Returns
    -------
    True if a structure is accepted, None otherwise.
    '''
    # Discard structures based on the Z prime value
    if crystal.z_prime not in input_parameters['target_z_prime_values']:
        return None

    # Discard structures with unwanted space group (an empty list of target
    # space groups disables this filter)
    target_space_groups = input_parameters['target_space_groups']
    if target_space_groups and crystal.spacegroup_symbol not in target_space_groups:
        return None

    # Assign unknown bond types, add missing hydrogens, assign partial
    # charges to atoms and generate the atoms. Any failure rejects the
    # structure.
    try:
        molecule.assign_bond_types()
        molecule.add_hydrogens(mode='missing')
        molecule.assign_partial_charges()
        atoms = molecule.atoms
    except Exception:
        return None

    # Discard structures with no atoms in the crystal
    if len(atoms) == 0:
        return None

    # Discard structures with missing coordinates
    if any(at.coordinates is None for at in atoms):
        return None

    # Discard structures based on their type (homomolecular, co-crystals,
    # hydrates)
    components = molecule.components
    formulas = [c.formula for c in components]
    if all(item == formulas[0] for item in formulas):
        crystal_type = 'homomolecular'
    elif 'H2 O1' in formulas:
        crystal_type = 'hydrate'
    else:
        crystal_type = 'co-crystal'
    if crystal_type not in input_parameters['crystal_type']:
        return None

    # Discard structures based on formal charge of molecules. The charge
    # filter is only applied to homomolecular crystals.
    if crystal_type == 'homomolecular':
        for component in components:
            if component.formal_charge not in input_parameters['molecule_formal_charges']:
                return None

    # Discard structures with out-of-range molecular weight
    for component in components:
        if component.molecular_weight > input_parameters['molecule_weight_limit']:
            return None

    # Discard structures with unwanted atomic species (an empty list of
    # target species disables this filter)
    target_species = input_parameters['target_species']
    if target_species:
        for s in get_unique_species(crystal.formula):
            if s not in target_species:
                return None

    return True


def get_refcode_families(input_parameters):
    '''
    Reads the CSD database and returns the refcode families and the
    structures for each family.

    The family name is given by the first six characters of the entry
    identifier. The result is also written to
    '../csd_db_analysis/db_data/<data_prefix>_csd_refcode_families.json'.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the user defined input parameters. Only the key
        'data_prefix' is read here.

    Returns
    -------
    refcode_families : dict
        A dictionary with the refcode families and the structures for each
        family.
    '''
    # Initialize the reader for the CSD
    reader = io.EntryReader('CSD')

    # Group the identifiers by family. setdefault keeps the members already
    # collected even if the entries of a family are not contiguous in the
    # reader.
    refcode_families = {}
    for entry in reader:
        refcode_families.setdefault(entry.identifier[:6], []).append(entry.identifier)

    # Specify the filename you want to write to
    filename = '../csd_db_analysis/db_data/' + input_parameters['data_prefix'] + '_csd_refcode_families.json'

    # Writing the dictionary to a file in JSON format
    with open(filename, 'w') as f:
        json.dump(refcode_families, f, indent=4)

    return refcode_families


def cluster_refcode_families(input_parameters):
    '''
    Reads the csd families dictionary and returns a new dictionary for the
    refcode families where the structures are grouped based on their
    similarity. Only structures consistent with the user defined criteria
    are included in the clustered refcode families.

    The result is also written to
    '../csd_db_analysis/db_data/<data_prefix>_csd_refcode_families_clustered.json'.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the input parameters for the search.

    Returns
    -------
    families_clustered : dict
        A dictionary with the refcode families and the structures for each
        family grouped based on similarity.

    Raises
    ------
    FileNotFoundError
        If the refcode families file has not been generated yet.
    '''
    # Open the refcode families file and read data.
    refcode_families_f = '../csd_db_analysis/db_data/' + input_parameters['data_prefix'] + '_csd_refcode_families.json'
    if not os.path.exists(refcode_families_f):
        # If the file does not exist, raise an exception
        raise FileNotFoundError(f'The file {refcode_families_f} does not exist.')

    # Set the checking similarity engine
    similarity_engine = PackingSimilarity()
    similarity_engine.settings.distance_tolerance = 0.2
    similarity_engine.settings.angle_tolerance = 20.
    similarity_engine.settings.ignore_bond_types = True
    similarity_engine.settings.ignore_hydrogen_counts = True
    similarity_engine.settings.ignore_hydrogen_positions = True
    similarity_engine.settings.packing_shell_size = 15

    # Get the families and member structures. The file is written with
    # json.dump, so it is read back with the json parser.
    with open(refcode_families_f) as f:
        refcode_families = json.load(f)

    # Families with more than one structure are clustered by similarity.
    # For families with one structure, the structure is added directly.
    csd_entries = io.EntryReader('CSD')
    families_clustered = {}
    for family, members in refcode_families.items():
        if not members:
            # Nothing to check for an empty family.
            continue

        if len(members) > 1:
            structures_to_check = []
            for structure in members:
                entry = csd_entries.entry(structure)
                crystal = entry.crystal
                molecule = entry.molecule

                # Check if structure is valid according to search criteria.
                if structure_check(input_parameters, crystal, molecule) is None:
                    continue

                structures_to_check.append([structure, crystal])

            # Get similar structures
            similar_structure_groups = similarity_check(structures_to_check, similarity_engine)

            # Store the groups of similar structures
            if len(similar_structure_groups) > 0:
                families_clustered[family] = [sorted(group) for group in similar_structure_groups]
        else:
            entry = csd_entries.entry(members[0])
            crystal = entry.crystal
            molecule = entry.molecule

            # Check if structure is valid according to search criteria.
            if structure_check(input_parameters, crystal, molecule) is None:
                continue

            families_clustered[family] = [[members[0]]]

    # Specify the filename for the clustered families
    filename = '../csd_db_analysis/db_data/' + input_parameters['data_prefix'] + '_csd_refcode_families_clustered.json'

    # Writing the dictionary to a file in JSON format
    with open(filename, 'w') as f:
        json.dump(families_clustered, f, indent=4)

    return families_clustered


def get_unique_structures(input_parameters):
    '''
    Goes through the clustered refcode families and returns a single
    structure for each group of similar structures. The resulting structure
    is based on user defined criteria: the structure with the lowest total
    lattice energy ('energy') or with the lowest value of
    1 - packing coefficient ('vdWFV').

    The result is also written to
    '../csd_db_analysis/db_data/<data_prefix>_csd_refcode_families_unique_structures.json'.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the input parameters for the search. The keys
        read here are 'unique_structures_clustering_method', 'data_prefix'
        and 'structures_to_exclude'.

    Returns
    -------
    unique_structures : dict
        A dictionary with the unique polymorphs for each refcode family.

    Raises
    ------
    FileNotFoundError
        If the clustered refcode families file has not been generated yet.
    '''
    # Set the unique structures clustering method
    unique_structures_clustering_method = input_parameters['unique_structures_clustering_method']
    if unique_structures_clustering_method == 'energy':
        visualhabit_settings = VisualHabit.Settings()

    # Open the refcode families clusters file and read data.
    refcode_families_clusters_f = '../csd_db_analysis/db_data/' + input_parameters['data_prefix'] + '_csd_refcode_families_clustered.json'
    if not os.path.exists(refcode_families_clusters_f):
        # If the file does not exist, raise an exception
        raise FileNotFoundError(f'The file {refcode_families_clusters_f} does not exist.')

    # Get the families and member structures. The file is written with
    # json.dump, so it is read back with the json parser.
    with open(refcode_families_clusters_f) as f:
        families_clustered = json.load(f)

    structures_to_exclude = input_parameters['structures_to_exclude']

    # Loop over refcode family clusters.
    csd_entries = io.EntryReader('CSD')
    unique_structures = {}
    for family in families_clustered:
        # Loop over the polymorphs of the family
        unique_structures[family] = []
        for similar_structures in families_clustered[family]:
            # If the polymorph has only one structure deposited, add
            # structure to the dictionary. Else, rank similar structures.
            if len(similar_structures) == 1:
                if similar_structures[0] in structures_to_exclude:
                    continue
                unique_structures[family].append(similar_structures[0])
                continue

            # Set the minimum value for the ranking
            minimum_value = np.inf
            minimum_value_structure = ''
            for structure in similar_structures:
                if structure in structures_to_exclude:
                    continue

                entry = csd_entries.entry(structure)
                crystal = entry.crystal

                if unique_structures_clustering_method == 'energy':
                    # Structures for which the calculation fails are
                    # left out of the ranking.
                    try:
                        results = VisualHabit(settings=visualhabit_settings).calculate(crystal)
                    except Exception:
                        continue
                    lattice_energy = results.lattice_energy.total

                    if lattice_energy < minimum_value:
                        minimum_value = lattice_energy
                        minimum_value_structure = structure

                if unique_structures_clustering_method == 'vdWFV':
                    vdWFV = 1.0 - crystal.packing_coefficient

                    if vdWFV < minimum_value:
                        minimum_value = vdWFV
                        minimum_value_structure = structure

            if minimum_value_structure != '':
                unique_structures[family].append(minimum_value_structure)

        # NOTE(review): the flattened source does not show the indentation
        # level of this statement; it is applied once per family here -
        # confirm against the original file.
        unique_structures[family] = sorted(unique_structures[family])

    # Specify the filename for the unique structures
    filename = '../csd_db_analysis/db_data/' + input_parameters['data_prefix'] + '_csd_refcode_families_unique_structures.json'

    # Writing the dictionary to a file in JSON format
    with open(filename, 'w') as f:
        json.dump(unique_structures, f, indent=4)

    return unique_structures


def check_for_target_fragments(input_parameters, molecule):
    '''
    Checks if a molecule contains all the target fragments.

    Only the fragments that are both listed in
    input_parameters['target_fragments'] and present in the reference
    fragment list are searched for.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the input parameters for the search. Only the key
        'target_fragments' is read here.
    molecule : csd_object
        The csd molecule for the structure.

    Returns
    -------
    True if every searched fragment is found in the molecule, None
    otherwise.
    '''
    # NOTE: create_reference_fragments() rebuilds the reference fragments
    # and rewrites the reference json file on every call.
    fragment_list = create_reference_fragments()

    # Check for target fragments
    for fragment in fragment_list:
        if fragment not in input_parameters['target_fragments']:
            continue

        csd_fragment = ccdc.search.SMARTSSubstructure(fragment_list[fragment]['smarts'])
        fragment_search = ccdc.search.SubstructureSearch()
        fragment_search.add_substructure(csd_fragment)
        hits = fragment_search.search(molecule)

        # len() works for any sequence type returned by the search.
        if len(hits) == 0:
            return None

    return True
def check_for_target_fragments(input_parameters,molecule):
    '''
    Checks if a molecule contains all the target fragments.

    Only the reference fragments that are listed in
    input_parameters['target_fragments'] are searched for.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the user defined input parameters.
    molecule : csd obj
        The CSD molecule object of the structure.

    Returns
    -------
    True or None
        True if every searched fragment gives at least one hit in the
        molecule, None as soon as a searched fragment gives no hit.
    '''
    fragment_list = create_reference_fragments()

    # Check for target fragments
    for fragment in fragment_list:
        if fragment not in input_parameters['target_fragments']:
            continue

        csd_fragment = ccdc.search.SMARTSSubstructure(fragment_list[fragment]['smarts'])
        fragment_search = ccdc.search.SubstructureSearch()
        fragment_search.add_substructure(csd_fragment)
        hits = fragment_search.search(molecule)

        # Truthiness works for any empty sequence, not only for an empty list.
        if not hits:
            return None

    return True

def get_csd_atom_and_molecule_properties(crystal,molecule,atoms):
    '''
    Extracts and returns the atomic and molecular properties for a CSD entry.

    Parameters
    ----------
    crystal : csd obj
        The CSD crystal object of the structure (not used by this function).
    molecule : csd obj
        The CSD molecule object of the structure.
    atoms : csd obj
        The CSD atoms object of the structure.

    Returns
    -------
    structure_molecule : dict
        A single dictionary with the per-atom properties (keys prefixed with
        'atoms_'), the number of atoms, the mass-weighted center of the atoms
        in fractional ('coordinates_f') and cartesian ('coordinates_c')
        coordinates, the molecular volume and the list of bonds given as
        pairs of atom labels.
    '''
    n_atoms = len(atoms)
    atoms_mass = np.round(np.array([at.atomic_weight for at in atoms]),4)
    atoms_coordinates_f = np.round(np.array([[at.fractional_coordinates[i] for i in (0,1,2)] for at in atoms]),4)
    atoms_coordinates_c = np.round(np.array([[at.coordinates[i] for i in (0,1,2)] for at in atoms]),4)

    # Mass-weighted centers, calculated from the rounded masses and positions
    mass_column = atoms_mass.reshape(n_atoms,1)
    total_mass = np.sum(atoms_mass)
    coordinates_f = np.round(np.sum(mass_column * atoms_coordinates_f,axis=0) / total_mass,4)
    coordinates_c = np.round(np.sum(mass_column * atoms_coordinates_c,axis=0) / total_mass,4)

    structure_molecule = {}
    structure_molecule['atoms_charge'] = np.array([at.partial_charge for at in atoms])
    structure_molecule['atoms_labels'] = [at.label for at in atoms]
    structure_molecule['atoms_mass'] = atoms_mass
    structure_molecule['atoms_species'] = [at.atomic_symbol for at in atoms]
    structure_molecule['atoms_vdW_radius'] = np.round(np.array([at.vdw_radius for at in atoms]),4)
    structure_molecule['atoms_coordinates_f'] = atoms_coordinates_f
    structure_molecule['atoms_coordinates_c'] = atoms_coordinates_c
    structure_molecule['n_atoms'] = n_atoms
    structure_molecule['coordinates_f'] = coordinates_f
    structure_molecule['coordinates_c'] = coordinates_c
    structure_molecule['volume'] = np.round(molecule.molecular_volume,4)
    # Positions of the atoms relative to the mass-weighted center
    structure_molecule['atoms_bond_vectors_f'] = np.round(atoms_coordinates_f - coordinates_f,4)
    structure_molecule['atoms_bond_vectors_c'] = np.round(atoms_coordinates_c - coordinates_c,4)
    structure_molecule['bonds'] = [[bond.atoms[0].label, bond.atoms[1].label] for bond in molecule.bonds]

    return structure_molecule
def get_csd_crystal_properties(crystal):
    '''
    Extracts and returns the crystal properties for a CSD entry.

    Parameters
    ----------
    crystal : csd obj
        The CSD crystal object of the structure.

    Returns
    -------
    crystal_properties : dict
        A dictionary with the crystal properties. The 'lattice_energy' entry
        holds the energy components calculated with the 'gavezzotti'
        potential; all components are set to 0.0 when the energy calculation
        fails.
    '''
    # Set the engine for energy calculation
    visualhabit_settings = VisualHabit.Settings()
    visualhabit_settings.potential = 'gavezzotti'
    try:
        energy = VisualHabit(settings=visualhabit_settings).calculate(crystal)
    except Exception:
        # Best effort: the structure is kept without a lattice energy.
        energy = None
    lattice_energy = energy.lattice_energy if energy is not None else None

    # Output key -> attribute of the lattice energy object
    lattice_energy_terms = {
        'total': 'total',
        'electrostatic': 'electrostatic',
        'vdW': 'vdw',
        'vdW_attraction': 'vdw_attraction',
        'vdW_repulsion': 'vdw_repulsion',
        'h-bond': 'h_bond',
        'h-bond_attraction': 'h_bond_attraction',
        'h-bond_repulsion': 'h_bond_repulsion'}

    cell_lengths = np.array(list(crystal.cell_lengths))

    crystal_properties = {}
    crystal_properties['ID'] = crystal.identifier
    crystal_properties['formula'] = crystal.formula
    crystal_properties['species'] = get_unique_species(crystal.formula)
    crystal_properties['space_group'] = crystal.spacegroup_symbol
    crystal_properties['z_crystal'] = crystal.z_value
    crystal_properties['z_prime'] = crystal.z_prime
    crystal_properties['cell_lengths'] = np.round(cell_lengths,4)
    # Cell lengths in units of the first cell length
    crystal_properties['scaled_cell_lengths'] = np.round(cell_lengths/crystal.cell_lengths[0],4)
    crystal_properties['cell_angles'] = np.round(np.array(list(crystal.cell_angles)),2)
    crystal_properties['cell_volume'] = np.round(crystal.cell_volume,4)
    crystal_properties['cell_density'] = np.round(crystal.calculated_density,4)
    crystal_properties['vdWFV'] = np.round(1.0 - crystal.packing_coefficient,4)
    crystal_properties['SAS'] = np.round(crystal.void_volume(probe_radius=1.2,grid_spacing=0.2,mode='accessible'),4)
    crystal_properties['lattice_vectors'] = np.round(get_lattice_vectors(crystal_properties['cell_lengths'],crystal_properties['cell_angles'],crystal_properties['cell_volume']),4)
    crystal_properties['inverse_lattice_vectors'] = np.round(get_lattice_vectors(crystal_properties['cell_lengths'],crystal_properties['cell_angles'],crystal_properties['cell_volume'],inverse=True),4)
    crystal_properties['close_contacts'] = crystal.contacts(intermolecular='Intermolecular',distance_range=(-3.0, 0.50))
    crystal_properties['hbonds'] = crystal.hbonds(intermolecular='Intermolecular')
    if lattice_energy is not None:
        crystal_properties['lattice_energy'] = {
            key: np.round(getattr(lattice_energy, attribute),4)
            for key, attribute in lattice_energy_terms.items()}
    else:
        # NOTE: 0.0 marks a failed energy calculation, not a calculated value.
        crystal_properties['lattice_energy'] = {key: 0.0 for key in lattice_energy_terms}
    return crystal_properties
def get_csd_structure_fragments(input_parameters,structure,molecule):
    '''
    Identifies and returns the fragments in a molecule.

    Every reference fragment is searched for in the molecule with a SMARTS
    substructure search. Hits whose atoms are a subset of the atoms of a hit
    of a different fragment are discarded and the remaining hits are
    renumbered. Optionally, each full component of the molecule is added as
    an extra fragment.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the user defined input parameters. The key
        'add_full_component' is read.
    structure : dict
        A dictionary with the data for the structure. The 'molecule' entry
        ('atoms_labels', 'atoms_coordinates_c', 'atoms_coordinates_f') is read.
    molecule : object
        The csd molecule object for the structure.

    Returns
    -------
    str_fragments : dict
        A dictionary with the identified fragments in the molecule. The keys
        are 'FNN.<fragment name>' for the reference fragments and
        'FMC.component_N' for the full components.
    '''
    def mass_weighted_center(masses, coordinates):
        # Mass-weighted mean position of a set of atoms (rounded to 4 decimals)
        return np.round(np.sum(masses.reshape(len(masses),1) * coordinates,axis=0) / np.sum(masses),4)

    # Update the reference fragment list
    fragment_list = create_reference_fragments()

    molecule_data = structure['molecule']

    # Get the fragments for the structure
    fragments = {}
    i_hit = 0
    for fragment in fragment_list:
        reference = fragment_list[fragment]
        csd_fragment = ccdc.search.SMARTSSubstructure(reference['smarts'])
        fragment_search = ccdc.search.SubstructureSearch()
        fragment_search.add_substructure(csd_fragment)
        for hit in fragment_search.search(molecule):
            i_hit += 1
            key = 'F' + str(i_hit).zfill(2) + '.' + fragment
            hit_atoms = []
            hit_atoms_species = []
            hit_atoms_labels = []
            for at in hit.match_atoms():
                # Index of the matched atom in the atoms list of the molecule
                hit_atoms.append(molecule_data['atoms_labels'].index(at.label))
                hit_atoms_species.append(at.atomic_symbol)
                hit_atoms_labels.append(at.label)

            entry = {}
            entry['smarts'] = reference['smarts']
            entry['atoms'] = hit_atoms
            entry['atoms_species'] = hit_atoms_species
            entry['atoms_labels'] = hit_atoms_labels
            entry['atoms_mass'] = np.round(np.array(reference['mass']),4)
            entry['n_atoms'] = len(hit_atoms)
            entry['atoms_coordinates_c'] = np.round(np.array(molecule_data['atoms_coordinates_c'][hit_atoms]),4)
            entry['atoms_coordinates_f'] = np.round(np.array(molecule_data['atoms_coordinates_f'][hit_atoms]),4)
            entry['atoms_coordinates_sf'] = np.round(np.array(reference['coordinates_sf']),4)
            entry['atoms_to_align'] = reference['atoms_to_align']
            entry['coordinates_c'] = mass_weighted_center(entry['atoms_mass'],entry['atoms_coordinates_c'])
            entry['coordinates_f'] = mass_weighted_center(entry['atoms_mass'],entry['atoms_coordinates_f'])
            entry['atoms_bond_vectors_c'] = np.round(entry['atoms_coordinates_c'] - entry['coordinates_c'],4)
            entry['atoms_bond_vectors_f'] = np.round(entry['atoms_coordinates_f'] - entry['coordinates_f'],4)
            fragments[key] = entry

    # Remove subsets (sub-fragments)
    entries_to_remove = set()

    # Compare all pairs of keys
    for key1 in fragments:
        for key2 in fragments:
            if key1 == key2 or key1 in entries_to_remove or key2 in entries_to_remove:
                continue

            # Hits of the same fragment are never compared with each other
            if fragments[key1]['smarts'] == fragments[key2]['smarts']:
                continue

            atoms1 = set(fragments[key1]['atoms_labels'])
            atoms2 = set(fragments[key2]['atoms_labels'])

            # Discard the hit whose atoms are contained in the other hit
            if atoms1.issubset(atoms2):
                entries_to_remove.add(key1)
            elif atoms2.issubset(atoms1):
                entries_to_remove.add(key2)

    # Remove identified keys from the dictionary
    for key in entries_to_remove:
        del fragments[key]

    # Add a fragment number ID
    str_fragments = {}
    for i, key in enumerate(fragments):
        # Split at the first '.' instead of slicing a fixed number of
        # characters, so the name survives hit counters with 3+ digits.
        fragment_name = key.split('.', 1)[1]
        new_key = 'F' + str(i + 1).zfill(2) + '.' + fragment_name
        str_fragments[new_key] = fragments[key]

    # Add fragments for full components
    if input_parameters['add_full_component']:
        for i, component in enumerate(molecule.components):
            key = 'FMC.component_' + str(i + 1)
            entry = {}
            entry['atoms_labels'] = [at.label for at in component.atoms]
            entry['atoms'] = [molecule_data['atoms_labels'].index(at.label) for at in component.atoms]
            entry['atoms_species'] = [at.atomic_symbol for at in component.atoms]
            entry['atoms_mass'] = np.round(np.array([at.atomic_weight for at in component.atoms]),4)
            entry['n_atoms'] = len(component.atoms)
            entry['atoms_coordinates_c'] = np.round(np.array([at.coordinates for at in component.atoms]),4)
            entry['atoms_coordinates_f'] = np.round(np.array([at.fractional_coordinates for at in component.atoms]),4)
            entry['atoms_to_align'] = 'all'
            entry['coordinates_c'] = mass_weighted_center(entry['atoms_mass'],entry['atoms_coordinates_c'])
            entry['coordinates_f'] = mass_weighted_center(entry['atoms_mass'],entry['atoms_coordinates_f'])
            entry['atoms_bond_vectors_c'] = np.round(entry['atoms_coordinates_c'] - entry['coordinates_c'],4)
            # Fractional bond vectors are built from the fractional data
            entry['atoms_bond_vectors_f'] = np.round(entry['atoms_coordinates_f'] - entry['coordinates_f'],4)

            # Set the rotation of the full component
            inertia_eigenvalues, inertia_eigenvectors = calculate_inertia(entry['atoms_mass'],entry['atoms_bond_vectors_c'])
            inertia_eigenvalues, inertia_eigenvectors = sort_eigenvectors(inertia_eigenvalues,inertia_eigenvectors)
            inertia_eigenvectors = ensure_right_handed_coordinate_system(inertia_eigenvectors)

            # Cartesian bond vectors expressed in the inertia frame
            entry['atoms_coordinates_sf'] = np.round(np.matmul(entry['atoms_bond_vectors_c'],inertia_eigenvectors),4)
            str_fragments[key] = entry

    return str_fragments
def create_reference_fragments():
    """
    Converts the input fragment list into a space fixed list of fragments.

    The fragments are read from '../source_data/fragment_list.json'. For each
    fragment the atomic positions are expressed relative to the center of
    mass and projected on the sorted, right-handed inertia eigenvectors. The
    result is written to '../source_data/reference_fragment_list.json'.

    Parameters
    ----------

    Returns
    -------
    reference_fragment_list : dict
        A dictionary with the space fixed reference fragments.
    """
    with open("../source_data/fragment_list.json","r") as f:
        fragment_list = json.load(f)

    reference_fragment_list = {}
    for fragment, properties in fragment_list.items():
        fragment_atoms_mass = np.array(properties["mass"])
        fragment_atoms_pos = np.array(properties["coordinates"])

        # Positions of the atoms relative to the center of mass
        fragment_com = center_of_mass(fragment_atoms_mass,fragment_atoms_pos)
        fragment_atoms_bv = fragment_atoms_pos - fragment_com

        inertia_eigenvalues, inertia_eigenvectors = calculate_inertia(fragment_atoms_mass,
                                                                      fragment_atoms_bv)

        inertia_eigenvalues, inertia_eigenvectors = sort_eigenvectors(inertia_eigenvalues,
                                                                      inertia_eigenvectors)

        inertia_eigenvectors = ensure_right_handed_coordinate_system(inertia_eigenvectors)

        # Space fixed coordinates: bond vectors in the inertia frame
        fragment_atoms_sfc = np.matmul(fragment_atoms_bv,
                                       inertia_eigenvectors)

        fragment_atoms_sfc = np.round(fragment_atoms_sfc, decimals=4)

        reference_fragment_list[fragment] = {"smarts": properties["smarts"],
                                             "species": properties["species"],
                                             "coordinates_sf": fragment_atoms_sfc.tolist(),
                                             "mass": properties["mass"],
                                             "atoms_to_align": properties["atoms_to_align"]}

    # Write the reference fragment to json file
    with open('../source_data/reference_fragment_list.json', 'w') as f:
        json.dump(reference_fragment_list, f, indent=4)

    # Return the dictionary as documented (callers that ignore the return
    # value are not affected).
    return reference_fragment_list

def get_reference_fragment_list():
    '''
    Returns the reference fragment list.

    Returns
    -------
    reference_fragment_list : dict
        The content of '../source_data/reference_fragment_list.json'.
    '''
    with open('../source_data/reference_fragment_list.json','r') as f:
        return json.load(f)
def get_molecule_fragments(input_fragments,reference_fragment_list):
    '''
    Collects the reference fragments that build up the reference molecule.

    The first occurrence of a fragment is stored under its own name; every
    further occurrence is stored under '<name>_<occurrence number>'.

    Parameters
    ----------
    input_fragments : list
        A list with the names of the fragments for the compound.
    reference_fragment_list : dict
        A dictionary with the reference fragments used to build the molecules.

    Returns
    -------
    int
        The number of fragments.
    molecule_fragments : dict
        A dictionary with the properties of the fragments.

    '''
    occurrences = {}  # Number of times each fragment name has been seen
    molecule_fragments = {}

    for name in input_fragments:
        seen = occurrences.get(name, 0) + 1
        occurrences[name] = seen

        # Only repeated fragments get a numbered key
        label = name if seen == 1 else f"{name}_{seen}"
        molecule_fragments[label] = reference_fragment_list[name]

    return len(molecule_fragments), molecule_fragments
def calculate_molecular_volume(formula,compound_rings,atomic_properties):
    '''
    Calculates the molecular vdW volume from the molecular formula and the
    number of rings.
    J. Org. Chem. 2003, 68, 19, 7368–7373

    Parameters
    ----------
    formula : str
        The molecular formula.
    compound_rings : dictionary
        A dictionary with the number of 'aromatic' and 'aliphatic' rings.
    atomic_properties : dict
        A dictionary containing the atomic properties. The
        'van_der_waals_radius' of each species in the formula is read.

    Returns
    -------
    molecular_volume : float
        The molecular vdW volume.
    '''
    # Number of aromatic and aliphatic rings
    n_aromatic = compound_rings['aromatic']
    n_aliphatic = compound_rings['aliphatic']

    # Get the count of atoms for each species
    species_counts, n_atoms, _ = get_atoms_from_formula(formula)

    # Number of bonds
    n_bonds = n_atoms - 1 + n_aromatic + n_aliphatic

    # Sum of the atomic sphere volumes
    sphere_volumes = []
    for species in species_counts:
        radius = atomic_properties[species]['van_der_waals_radius']
        sphere_volumes.append(species_counts[species] * (4.0 * np.pi * radius**3 / 3))
    atomic_vdW_volume = np.sum(sphere_volumes)

    # Correct the atomic volume for the bonds and the rings
    return atomic_vdW_volume - 5.92 * n_bonds - 14.7 * n_aromatic - 3.8 * n_aliphatic

def calculate_molecular_weight(formula,atomic_properties):
    '''
    Calculates the molecular weight from the molecular formula.

    Parameters
    ----------
    formula : str
        The molecular formula.
    atomic_properties : dict
        A dictionary containing the atomic properties. The 'atomic_mass' of
        each species in the formula is read.

    Returns
    -------
    molecular_weight : float
        The molecular weight
    '''
    # Only the per-species counts are needed here
    species_counts = get_atoms_from_formula(formula)[0]

    # Sum of the atomic masses weighted by the number of atoms
    weights = []
    for species in species_counts:
        weights.append(species_counts[species]*atomic_properties[species]['atomic_mass'])

    return np.sum(weights)
def generate_fragments(input_parameters,atomic_properties):
    '''
    Builds the fragments and the reference molecule properties for the
    compound described in the input parameters.

    Parameters
    ----------
    input_parameters : dict
        A dictionary containing the input parameters. The keys 'fragments',
        'formula' and 'rings' are read.
    atomic_properties : dict
        A dictionary containing the atomic properties.

    Returns
    -------
    n_fragments : int
        The number of fragments in the molecule.
    fragments : dict
        A dictionary with the fragment properties.
    reference_molecule : dict
        A dictionary containing the reference molecule properties
        ('formula', 'volume' and 'weight').
    '''
    formula = input_parameters['formula']

    # Get the fragments for the molecule from the reference fragment list
    n_fragments, fragments = get_molecule_fragments(input_parameters['fragments'],
                                                    get_reference_fragment_list())

    # Reference molecule: formula, vdW volume and molecular weight
    reference_molecule = {'formula': formula}
    reference_molecule['volume'] = calculate_molecular_volume(formula,
                                                              input_parameters['rings'],
                                                              atomic_properties)
    reference_molecule['weight'] = calculate_molecular_weight(formula,
                                                              atomic_properties)

    return n_fragments, fragments, reference_molecule
def _all_variable_names(variables):
    '''
    Returns the names of all the available variables.

    An entry that holds a 'family' key is treated as a variable definition
    and contributes its own key. Any other entry is treated as a group of
    variables and contributes the names it contains.
    '''
    names = []
    for key, entry in variables.items():
        if isinstance(entry, dict) and 'family' in entry:
            names.append(key)
        else:
            names.extend(entry)
    return names

def get_user_variables(input_parameters,variables):
    '''
    Gets the variables for the analysis/plotting

    The variables are collected from the histograms, 2D scatter plots and
    3D scatter plots options. For each plot type the 'variables' option is
    either the string 'all' (every available variable is used) or a list of
    entries whose first element holds the variable names of the plot
    (1, 2 and 3 names respectively).

    Parameters
    ----------
    input_parameters : dict
        The user defined input file
    variables : dict
        A dictionary with the available variables

    Returns
    -------
    A sorted list with the unique variables for the analysis/plotting
    '''
    # Plot options key and number of variables per plot entry
    plot_types = (('histograms_options', 1),
                  ('2D_scatter_plots_options', 2),
                  ('3D_scatter_plots_options', 3))

    user_variables = set()
    for options_key, n_axes in plot_types:
        requested = input_parameters[options_key]['variables']
        if requested == 'all':
            user_variables.update(_all_variable_names(variables))
            continue
        for axis in range(n_axes):
            user_variables.update(entry[0][axis] for entry in requested)

    return sorted(user_variables)

def get_value(data, path):
    '''
    Gets value for a specific variable

    Parameters
    ----------
    data : dict
        A dictionary with the structure data.
    path : list
        The sequence of keys/indices to follow in the structure data
        dictionary to reach the variable value.

    Returns
    -------
    data : float, str or bool
        The value for the specific variable in the structure. The input
        itself is returned for an empty path.
    '''
    value = data
    for key in path:
        value = value[key]
    return value
def _combination_filter_accepts(selection, available, as_tuples=False):
    '''
    Evaluates an [items, operator] selection against the available items.

    With the 'or' operator at least one item must be available, with the
    'and' operator all of them. Any other operator accepts the structure.
    With as_tuples the items are sequences that are compared as tuples.
    '''
    wanted, operator = selection[0], selection[1]
    if operator not in ('or', 'and'):
        return True
    if as_tuples:
        wanted = [tuple(item) for item in wanted]
        available = [tuple(item) for item in available]
    available = set(available)
    if operator == 'or':
        return any(item in available for item in wanted)
    return set(wanted).issubset(available)

def get_analysis_structures_list(input_parameters):
    '''
    Gets the structures for the analysis based on the user defined filters.

    The filter data for the structures are read from
    '<data_directory><data_prefix>_structures_filter_data.json'. A filter
    that is set to None in the input parameters is not applied.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the user defined input parameters. The keys
        'data_directory', 'data_prefix', 'target_families',
        'target_structures' and the 'target_*' filters listed in the body
        are read.

    Returns
    -------
    structures_list : dict
        A dictionary with the accepted structures grouped by space group.
    '''
    structures_filter_data_filename = input_parameters['data_directory'] + input_parameters['data_prefix'] + '_structures_filter_data.json'

    # Input parameter -> property of the structure in the filter data
    # Property value must be one of the allowed values
    single_filters = {
        'target_space_groups': 'space_group',
        'target_z_crystal_values': 'z_crystal',
        'target_z_prime_values': 'z_prime',
        'target_species': 'species'}
    # Property is a list of items, filter is [items, 'or'/'and']
    single_combination_filters = {
        'target_structure_fragments': 'fragments',
        'target_contact_central_fragments': 'contact_central_fragments'}
    # Property is a list of pairs, filter is [pairs, 'or'/'and']
    multiple_combination_filters = {
        'target_contact_pairs': 'contact_pairs',
        'target_contact_fragment_pairs': 'contact_fragment_pairs'}

    with open(structures_filter_data_filename) as f:
        structures_filter_data = json.load(f)

    structures_list = {}
    for structure, values in structures_filter_data.items():
        # The first 6 characters of the structure name are matched against
        # the target families.
        target_families = input_parameters['target_families']
        if target_families is not None and structure[:6] not in target_families:
            continue

        target_structures = input_parameters['target_structures']
        if target_structures is not None and structure not in target_structures:
            continue

        accept_structure = True

        for parameter, property_name in single_filters.items():
            allowed = input_parameters[parameter]
            if allowed is not None and values[property_name] not in allowed:
                accept_structure = False

        for parameter, property_name in single_combination_filters.items():
            selection = input_parameters[parameter]
            if selection is not None and not _combination_filter_accepts(selection, values[property_name]):
                accept_structure = False

        for parameter, property_name in multiple_combination_filters.items():
            selection = input_parameters[parameter]
            if selection is not None and not _combination_filter_accepts(selection, values[property_name], as_tuples=True):
                accept_structure = False

        if not accept_structure:
            continue

        space_group = values['space_group']
        # NOTE(review): structures in space group 'R-3' are always dropped;
        # the reason is not visible here - confirm.
        if space_group != 'R-3':
            structures_list.setdefault(space_group, []).append(structure)
    return structures_list
150 | 151 | ''' 152 | 153 | # Get the variables for the analysis 154 | user_variables = get_user_variables(input_parameters,variables) 155 | 156 | # Get the structures list for the analysis based on the used defined filters 157 | structures_list = get_analysis_structures_list(input_parameters) 158 | 159 | # Set the structures folder 160 | structure_files_folder = input_parameters['data_directory'] + input_parameters['data_prefix'] + '_structures/' 161 | 162 | # Get the user variables families 163 | variable_families = [variables[var]['family'] for var in user_variables] 164 | variable_families = sorted(list(set(variable_families))) 165 | 166 | # Initialize the data dictionary for the analysis 167 | data = {key: {space_group: {} for space_group in structures_list} for key in variable_families} 168 | 169 | # Set the data filter dependencies (variables that should be added for filtering data) 170 | filter_dependencies = { 171 | 'structure': ['z_crystal','z_prime'], 172 | 'fragment': ['z_crystal','z_prime','fragment'], 173 | 'contact': ['z_crystal','z_prime','cc_central_atom_fragment','cc_contact_atom_fragment','cc_central_atom_species','cc_contact_atom_species','cc_type','cc_is_in_los'], 174 | 'fragment_atom': ['z_crystal','z_prime','fragment','fragment_atom_species'], 175 | 'contact_atom': ['z_crystal','z_prime','cc_central_atom_fragment','cc_contact_atom_fragment','cc_central_atom_species','cc_contact_atom_species','cc_type','cc_is_in_los','cc_length']} 176 | 177 | # Add variables to the data ditionary for data filter dependencies 178 | for variable_family in variable_families: 179 | for space_group in structures_list: 180 | for denepdency in filter_dependencies[variable_family]: 181 | data[variable_family][space_group][denepdency] = [] 182 | 183 | # Add user variables to data dictionary 184 | for var in user_variables: 185 | for space_group in structures_list: 186 | data[variables[var]['family']][space_group][var] = [] 187 | 188 | # Check if additional positional 
variables should be added to calculate coordinate transformations from fractional to cartesian and via versa. 189 | variable_groups = {} 190 | for var in variables: 191 | variable_group = variables[var]['position_symmetry'][3] 192 | if str(variable_group) not in variable_groups: 193 | variable_groups[str(variable_group)] = [] 194 | 195 | variable_groups[str(variable_group)].append(var) 196 | 197 | for variable_family in data: 198 | for space_group in structures_list: 199 | add_zero = False 200 | family_variables = [key for key in data[variable_family][space_group]] 201 | for var in family_variables: 202 | variable_group = variables[var]['position_symmetry'][3] 203 | if variable_group > 0 and var[-2:] in ['_x','_y','_z','_u','_v','_w']: 204 | add_zero = True 205 | for group_var in variable_groups[str(variable_group)]: 206 | if group_var not in data[variable_family][space_group]: 207 | data[variable_family][space_group][group_var] = [] 208 | if add_zero: 209 | for var in variable_groups['0']: 210 | data[variable_family][space_group][var] = [] 211 | 212 | # Get data from indivirdual structure files grouped by space group 213 | for space_group in structures_list: 214 | for structure in structures_list[space_group]: 215 | with open(structure_files_folder + structure + '.json') as f: 216 | # Read the current structure data 217 | structure_data = json.load(f) 218 | 219 | # Add structure data to the data dictionary for the analysis 220 | for variable_family in variable_families: 221 | for var in data[variable_family][space_group]: 222 | 223 | if variable_family == 'structure': 224 | path = copy.deepcopy(variables[var]['path']) 225 | value = get_value(structure_data, path) 226 | data[variable_family][space_group][var].append(value) 227 | 228 | if variable_family == 'fragment': 229 | if variables[var]['family'] == 'fragment': 230 | for fragment_key in structure_data['fragments']: 231 | path = copy.deepcopy(variables[var]['path']) 232 | path[1] = fragment_key 233 | value = 
get_value(structure_data, path) 234 | data[variable_family][space_group][var].append(value) 235 | if variables[var]['family'] == 'structure': 236 | for fragment_key in structure_data['fragments']: 237 | path = copy.deepcopy(variables[var]['path']) 238 | value = get_value(structure_data, path) 239 | data[variable_family][space_group][var].append(value) 240 | 241 | if variable_family == 'fragment_atom': 242 | if variables[var]['family'] == 'fragment_atom': 243 | for fragment_key in structure_data['fragments']: 244 | path = copy.deepcopy(variables[var]['path']) 245 | path[1] = fragment_key 246 | for atom_key in structure_data['fragments'][fragment_key]['atoms']: 247 | path[3] = atom_key 248 | value = get_value(structure_data, path) 249 | data[variable_family][space_group][var].append(value) 250 | if variables[var]['family'] == 'fragment': 251 | for fragment_key in structure_data['fragments']: 252 | for atom_key in structure_data['fragments'][fragment_key]['atoms']: 253 | path = copy.deepcopy(variables[var]['path']) 254 | path[1] = fragment_key 255 | value = get_value(structure_data, path) 256 | data[variable_family][space_group][var].append(value) 257 | if variables[var]['family'] == 'structure': 258 | for fragment_key in structure_data['fragments']: 259 | for atom_key in structure_data['fragments'][fragment_key]['atoms']: 260 | path = copy.deepcopy(variables[var]['path']) 261 | value = get_value(structure_data, path) 262 | data[variable_family][space_group][var].append(value) 263 | 264 | if variable_family == 'contact': 265 | if variables[var]['family'] == 'contact': 266 | for pair_key in structure_data['crystal']['close_contacts']: 267 | path = copy.deepcopy(variables[var]['path']) 268 | path[2] = pair_key 269 | value = get_value(structure_data, path) 270 | data[variable_family][space_group][var].append(value) 271 | if variables[var]['family'] == 'contact_atom' and var in 
['cc_central_atom_fragment','cc_contact_atom_fragment','cc_central_atom_species','cc_contact_atom_species']: 272 | for pair_key in structure_data['crystal']['close_contacts']: 273 | path = copy.deepcopy(variables[var]['path']) 274 | path[2] = pair_key 275 | path[3] = var[:15] 276 | value = get_value(structure_data, path) 277 | data[variable_family][space_group][var].append(value) 278 | if variables[var]['family'] == 'structure': 279 | for pair_key in structure_data['crystal']['close_contacts']: 280 | path = copy.deepcopy(variables[var]['path']) 281 | value = get_value(structure_data, path) 282 | data[variable_family][space_group][var].append(value) 283 | 284 | if variable_family == 'contact_atom': 285 | if variables[var]['family'] == 'contact_atom': 286 | for pair_key in structure_data['crystal']['close_contacts']: 287 | path = copy.deepcopy(variables[var]['path']) 288 | path[2] = pair_key 289 | path[3] = var[:15] 290 | value = get_value(structure_data, path) 291 | data[variable_family][space_group][var].append(value) 292 | if variables[var]['family'] == 'contact': 293 | for pair_key in structure_data['crystal']['close_contacts']: 294 | path = copy.deepcopy(variables[var]['path']) 295 | path[2] = pair_key 296 | value = get_value(structure_data, path) 297 | data[variable_family][space_group][var].append(value) 298 | if variables[var]['family'] == 'structure': 299 | for pair_key in structure_data['crystal']['close_contacts']: 300 | path = copy.deepcopy(variables[var]['path']) 301 | value = get_value(structure_data, path) 302 | data[variable_family][space_group][var].append(value) 303 | 304 | # Apply symmetry operations to atomic coordinates 305 | # Get the space group properties 306 | with open('../source_data/space_group_properties.json') as f: 307 | space_group_properties = json.load(f) 308 | 309 | for variable_family in data: 310 | for space_group in data[variable_family]: 311 | # Sort data based on variable name for the correct application of symmetry operations 
312 | data[variable_family][space_group] = OrderedDict(sorted(data[variable_family][space_group].items())) 313 | 314 | # Load symmetry operations for the current space group 315 | symmetry_operations = space_group_properties[space_group]['symmetry_operations'] 316 | 317 | # Get the groups of variables 318 | groups = [] 319 | for var in data[variable_family][space_group].keys(): 320 | group = variables[var]['position_symmetry'] 321 | if group[3] > 0 and group not in groups: 322 | groups.append(group) 323 | 324 | # Get the symmetry groups 325 | symmetry_groups = [] 326 | for group in groups: 327 | group_variables = [var for var in variables if variables[var]['position_symmetry'][3] == group[3]] 328 | symmetry_groups.append([group[0],group[1],group_variables]) 329 | 330 | # Get the coordinates for the symemtric atoms 331 | symmetric_variables = [] 332 | for rotation, translation, group_variables in symmetry_groups: 333 | symmetric_variables.extend([var for var in group_variables]) 334 | if group_variables[0][-2:] == '_u': 335 | fractional_positions = np.transpose([data[variable_family][space_group][group_variables[0]], 336 | data[variable_family][space_group][group_variables[1]], 337 | data[variable_family][space_group][group_variables[2]]]) 338 | 339 | elif group_variables[0][-2:] == '_x': 340 | cartesian_positions = np.transpose([data[variable_family][space_group][group_variables[0]], 341 | data[variable_family][space_group][group_variables[1]], 342 | data[variable_family][space_group][group_variables[2]]]) 343 | cell_parameters = np.transpose([data[variable_family][space_group]['cell_length_a'], 344 | data[variable_family][space_group]['cell_length_b'], 345 | data[variable_family][space_group]['cell_length_c'], 346 | data[variable_family][space_group]['cell_angle_alpha'], 347 | data[variable_family][space_group]['cell_angle_beta'], 348 | data[variable_family][space_group]['cell_angle_gamma'], 349 | data[variable_family][space_group]['cell_volume']]) 350 | 351 | 
fractional_positions = [] 352 | lattice_vectors = [] 353 | for (x, y, z), (a, b, c, alpha, beta, gamma, omega) in zip(cartesian_positions,cell_parameters): 354 | lattice_vectors.append(get_lattice_vectors(np.array([a,b,c]),np.array([alpha,beta,gamma]),omega,inverse=False)) 355 | inverse_lattice_vectors = get_lattice_vectors(np.array([a,b,c]),np.array([alpha,beta,gamma]),omega,inverse=True) 356 | fractional_positions.append(np.dot([x,y,z],inverse_lattice_vectors).tolist()) 357 | 358 | for op in symmetry_operations[1:]: 359 | if group_variables[0][-2:] == '_u': 360 | symmetric_positions = apply_symmetry_operations(fractional_positions, op, translation) 361 | if group_variables[0][-2:] == '_x': 362 | symmetric_positions = [] 363 | for pos, vec in zip(fractional_positions,lattice_vectors): 364 | symmetric_positions.append(np.dot(apply_symmetry_operations([pos], op, translation), vec)[0]) 365 | 366 | for x, y, z in symmetric_positions: 367 | data[variable_family][space_group][group_variables[0]].append(x) 368 | data[variable_family][space_group][group_variables[1]].append(y) 369 | data[variable_family][space_group][group_variables[2]].append(z) 370 | 371 | # Extend data for symmetric positions 372 | if symmetric_variables != []: 373 | for var in data[variable_family][space_group]: 374 | if var not in symmetric_variables: 375 | extend_data = copy.deepcopy(data[variable_family][space_group][var]) 376 | for i in range(len(symmetry_operations) - 1): 377 | data[variable_family][space_group][var].extend(extend_data) 378 | 379 | # Move fractional coordinates of atoms in unit cell if necessary 380 | for var in data[variable_family][space_group]: 381 | if variables[var]['position_symmetry'][2]: 382 | data[variable_family][space_group][var] = [x % 1 for x in data[variable_family][space_group][var]] 383 | 384 | return data -------------------------------------------------------------------------------- /source_code/get_structure_data.py: 
class NumpyArrayEncoder(json.JSONEncoder):
    """ Custom encoder for numpy data types """
    def default(self, obj):
        """
        Return a JSON serializable replacement for obj.

        Arrays are converted to (nested) lists and numpy scalars to the
        equivalent Python scalar. Any other type is delegated to the base
        class, which raises TypeError.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # numpy scalars that do not subclass a Python number (np.int64,
            # np.bool_, ...) are not handled by the standard encoder
            return obj.item()
        return super().default(obj)


# A complete JSON string literal (escape sequences included)
_JSON_STRING = r'"(?:\\.|[^"\\])*"'

# Either a string literal outside a list (matched only so that brackets inside
# it are skipped) or an innermost list, i.e. a bracketed span whose content has
# no further brackets apart from those inside string literals
_STRING_OR_INNERMOST_LIST = re.compile(
    _JSON_STRING + r'|\[((?:' + _JSON_STRING + r'|[^\[\]"])+)\]'
)

# Inside the content of a list: a string literal (kept) or a run of spaces and
# newlines (dropped)
_STRING_OR_BLANKS = re.compile('(' + _JSON_STRING + r')|[ \n]+')


def format_lists(json_str):
    """
    Formats lists in the JSON string to remove unnecessary whitespace and
    newlines.

    Only the innermost lists are compacted to a single line. Spaces and
    newlines that belong to string values are preserved and text in square
    brackets inside a string value is left untouched.

    Parameters
    ----------
    json_str : str
        The JSON document, typically created with json.dumps and indent.

    Returns
    -------
    str
        The JSON document with every innermost list written on one line.
    """
    def compact(match):
        content = match.group(1)
        if content is None:
            # A string literal outside any innermost list: keep it verbatim
            return match.group(0)
        content = _STRING_OR_BLANKS.sub(lambda m: m.group(1) or '', content.strip())
        return '[' + content + ']'

    return _STRING_OR_INNERMOST_LIST.sub(compact, json_str)


def convert_to_json(data):
    """
    Converts Python dictionary to formatted JSON string.

    Parameters
    ----------
    data : dict
        The data to serialize. Numpy arrays and numpy scalars are supported
        through NumpyArrayEncoder.

    Returns
    -------
    str
        The JSON string with an indentation of 4 spaces and the innermost
        lists compacted by format_lists.
    """
    json_str = json.dumps(data, cls=NumpyArrayEncoder, indent=4)
    return format_lists(json_str)
50 | 51 | Returns 52 | ------- 53 | ''' 54 | # Set the files to write data 55 | db_folder = "../csd_db_analysis/db_data/" 56 | prefix = input_parameters["data_prefix"] 57 | db_structures_folder = db_folder + "_".join([prefix,"structures"]) + "/" 58 | 59 | # Create the structures folder 60 | os.makedirs(db_structures_folder, exist_ok=True) 61 | 62 | # Get the reference structures dictionary. 63 | if input_parameters["structure_list"][0] == "csd-all": 64 | reference_structures_f = '../csd_db_analysis/db_data/' + input_parameters["data_prefix"] + '_csd_refcode_families_clustered.json' 65 | elif input_parameters["structure_list"][0] == "csd-unique": 66 | reference_structures_f = '../csd_db_analysis/db_data/' + input_parameters["data_prefix"] + '_csd_refcode_families_unique_structures.json' 67 | elif input_parameters["structure_list"][0] == "cif": 68 | cif_files_f = '../source_data/cif_files/' 69 | reference_structures_f = cif_files_f + 'cif_structures_list.json' 70 | 71 | # Check if the dictionary exists. 72 | if not os.path.exists(reference_structures_f): 73 | # If the file does not exist, raise an exception 74 | raise FileNotFoundError(f"The file {reference_structures_f} does not exist.") 75 | else: 76 | # Get the families and member structures. 
77 | with open(reference_structures_f) as f: 78 | data = f.read() 79 | reference_structures = ast.literal_eval(data) 80 | 81 | # Get the structures list for the analysis 82 | structures_list = get_structures_list(input_parameters,reference_structures) 83 | 84 | # Get the csd entries if necessary 85 | if input_parameters["structure_list"][0] in ["csd-all","csd-unique"]: 86 | csd_entries = io.EntryReader("CSD") 87 | 88 | # Initialize cell reference points 89 | cell_reference_points = get_reference_cell_points(-1, 2.5, 0.5) 90 | 91 | # Set the ZZP planes 92 | zzp_planes = set_zzp_planes() 93 | 94 | # Loop over the structures in the list 95 | for structure_name in structures_list: 96 | # Set the csd_crystal and csd_molecule objects 97 | if input_parameters["structure_list"][0] in ["csd-all","csd-unique"]: 98 | entry = csd_entries.entry(structure_name) 99 | crystal = entry.crystal 100 | if input_parameters["center_molecule"]: 101 | crystal.centre_molecule() # Move molecule inside unit cell 102 | molecule = entry.molecule 103 | elif input_parameters["structure_list"][0] == "cif": 104 | crystal = io.CrystalReader(cif_files_f + structure_name) 105 | crystal = crystal[0] 106 | if input_parameters["center_molecule"]: 107 | crystal.centre_molecule() # Move molecule inside unit cell 108 | molecule = io.MoleculeReader(cif_files_f + structure_name) 109 | molecule = molecule[0] 110 | 111 | # Exclude structures 112 | if crystal.identifier in input_parameters["structures_to_exclude"]: 113 | continue 114 | 115 | # Add missing hydrogen atoms 116 | try: 117 | molecule.assign_bond_types() 118 | molecule.add_hydrogens(mode='missing') 119 | molecule.assign_partial_charges() 120 | except Exception: 121 | continue 122 | 123 | # Set the atoms for the reference molecule 124 | try: 125 | atoms = molecule.atoms 126 | except Exception: 127 | continue 128 | 129 | # Check for unnatural atoms with no coordinates 130 | discard = False 131 | for at in atoms: 132 | if at.coordinates == None: 133 | 
discard = True 134 | break 135 | 136 | if discard: 137 | continue 138 | 139 | # Check for target fragments 140 | if check_for_target_fragments(input_parameters,molecule) == None: 141 | continue 142 | 143 | # Initialize structure 144 | structure = {} 145 | 146 | # Get crystal, molecule and atom properties 147 | structure["crystal"] = get_csd_crystal_properties(crystal) 148 | structure["molecule"] = get_csd_atom_and_molecule_properties(crystal,molecule,atoms) 149 | 150 | # Get the fragments for the structure 151 | structure["fragments"] = get_csd_structure_fragments(input_parameters,structure,molecule) 152 | 153 | # Discard structures with none of the desired substructures 154 | if not bool(structure["fragments"]): 155 | continue 156 | 157 | # Calculate the structure specific cell reference points in cartesian 158 | # coordinates 159 | cell_points = np.dot(cell_reference_points, structure["crystal"]["lattice_vectors"]) 160 | 161 | # Loop over all fragments to calculate the fragment orientation 162 | # print('Analyzing structure ' + structure["crystal"]["ID"]) 163 | for fragment in structure["fragments"]: 164 | current_fragment = structure["fragments"][fragment] 165 | # Get the list of atoms that are used for the aligmnent 166 | if current_fragment["atoms_to_align"] == "all": 167 | atoms_to_align = list(range(current_fragment["n_atoms"])) 168 | else: 169 | atoms_to_align = current_fragment["atoms_to_align"] 170 | 171 | # Get the rotation matrix 172 | current_fragment["rotation_matrix"] = np.round(kabsch_rotation_matrix(current_fragment["atoms_coordinates_sf"][atoms_to_align], 173 | current_fragment["atoms_bond_vectors_c"][atoms_to_align]),4) 174 | current_fragment["inverse_rotation_matrix"] = np.round(current_fragment["rotation_matrix"].T,4) 175 | 176 | # Filter unwanted fragments in case of identical smarts representation 177 | if fragment[4:] in input_parameters["fragments_to_check_alignment"]: 178 | rmsd = align_structures(current_fragment["rotation_matrix"], 179 | 
current_fragment["atoms_coordinates_sf"][atoms_to_align], 180 | current_fragment["atoms_bond_vectors_c"][atoms_to_align]) 181 | 182 | if rmsd > input_parameters["alignment_tolerance"]: 183 | continue 184 | 185 | # Calculate the normalized vectors perpendicular to the 186 | # principal inertia planes in the crystallographic coordinates 187 | # system 188 | current_fragment["principal_inertia_planes_f"] = np.dot(current_fragment["rotation_matrix"], structure["crystal"]["lattice_vectors"].T) 189 | current_fragment["principal_inertia_planes_f"] = np.round(current_fragment["principal_inertia_planes_f"] / np.linalg.norm(current_fragment["principal_inertia_planes_f"], axis=1, keepdims=True),4) 190 | 191 | # Identify for each eigenvector the proposed vectors that are 192 | # closest to be perpendicular and the respective angle 193 | current_fragment["n_max_vectors"] = vectors_closest_to_perpendicular(current_fragment["principal_inertia_planes_f"], 194 | input_parameters["proposed_vectors_n_max"]) 195 | 196 | # Calculate minimum distances of pripcipal inertia planes to the 197 | # corners of all the points of a 3x3x3 supercell in the form 198 | # (0.5k1, 0.5k2, 0.5k3), k1, k2, k3 = -2, -1, ..., 4 199 | minimum_distances_to_planes = [] 200 | for plane in current_fragment["rotation_matrix"]: 201 | d_min = np.inf 202 | for point in cell_points: 203 | d = distance_to_plane(point,plane,current_fragment["coordinates_c"],normal=False) 204 | if d < d_min: 205 | d_min = d 206 | minimum_distances_to_planes.append(d_min) 207 | current_fragment["principal_inertia_planes_distances_to_cell_points"] = np.round(minimum_distances_to_planes,4) 208 | 209 | # Calculate minimum distance of non-hydeogen atoms to ZZP planes 210 | minimum_distances_to_zzp_planes = [] 211 | for point in current_fragment["atoms_coordinates_f"]: 212 | d_min = np.inf 213 | for plane_normal, plane_norm in zzp_planes: 214 | d = distance_to_zzp_planes_family(point, plane_normal, plane_norm) 215 | if d < d_min: 216 | 
d_min = d 217 | minimum_distances_to_zzp_planes.append(d_min) 218 | current_fragment["minimum_atom_distances_to_zzp_planes"] = np.round(minimum_distances_to_zzp_planes,4) 219 | 220 | # Add hydrogen atoms to fragmetnts 221 | if fragment[:3] != "FMC": 222 | for atom, atom_label in zip(structure["molecule"]["atoms_species"],structure["molecule"]["atoms_labels"]): 223 | if atom == "H": 224 | for at1, at2 in structure["molecule"]["bonds"]: 225 | if at1 == atom_label: 226 | bonded_atom = at2 227 | if at2 == atom_label: 228 | bonded_atom = at1 229 | if bonded_atom in current_fragment["atoms_labels"]: 230 | current_fragment["atoms_species"].append(atom) 231 | current_fragment["atoms_labels"].append(atom_label) 232 | 233 | # Create the contacts dictionary 234 | structure_contacts = {} 235 | for contact in structure["crystal"]["close_contacts"]: 236 | # Check if the contact is part of an h-bond 237 | is_hbond = False 238 | for hbond in structure["crystal"]["hbonds"]: 239 | hbond_atom_labels = [atom.label for atom in hbond.atoms] 240 | if (contact.atoms[0].label, contact.atoms[1].label) in list(itertools.permutations(hbond_atom_labels,2)): 241 | if [contact.atoms[0].label, contact.atoms[1].label] not in structure["molecule"]["bonds"] and [contact.atoms[1].label, contact.atoms[0].label] not in structure["molecule"]["bonds"]: 242 | is_hbond = True 243 | break 244 | 245 | # Get the central and contact groups (fragments) 246 | central_group = [fragment for fragment in structure["fragments"] if contact.atoms[0].label in structure["fragments"][fragment]["atoms_labels"] if fragment[:3] != "FMC"] 247 | contact_group = [fragment for fragment in structure["fragments"] if contact.atoms[1].label in structure["fragments"][fragment]["atoms_labels"] if fragment[:3] != "FMC"] 248 | for i in [0, 1]: 249 | for fragment1 in central_group: 250 | for fragment2 in contact_group: 251 | at1, at2 = 0, 1 252 | central_fragment, contact_fragment = fragment1, fragment2 253 | 254 | # Get the bond vectors 
of the contact atoms to the central 255 | # fragment 256 | central_bond_vector = contact.atoms[at1].coordinates - structure["fragments"][central_fragment]["coordinates_c"] 257 | contact_bond_vector = contact.atoms[at2].coordinates - structure["fragments"][central_fragment]["coordinates_c"] 258 | 259 | # Rotate them to the central fragment's reference system 260 | central_bond_vector_r = np.dot(central_bond_vector,structure["fragments"][central_fragment]["inverse_rotation_matrix"]) 261 | contact_bond_vector_r = np.dot(contact_bond_vector,structure["fragments"][central_fragment]["inverse_rotation_matrix"]) 262 | 263 | # Convert contact bond vector to spherical coodinates 264 | contact_bond_vector_spherical = cartesian_to_spherical(contact_bond_vector_r) 265 | 266 | # Get the contact type 267 | contact_type = "hbond" if is_hbond else "vdW" 268 | 269 | # Add contact data to list 270 | structure_contacts['_'.join([contact.atoms[at1].label,fragment1,contact.atoms[at2].label,fragment2])] = { 271 | "cc_length": np.round(contact.length,4), 272 | "cc_type": contact_type, 273 | "cc_is_in_los": contact.is_in_line_of_sight, 274 | "cc_central_atom": { 275 | "atom": contact.atoms[at1].atomic_symbol, 276 | "fragment": central_fragment[4:], 277 | "coordinates": { 278 | "cartesian": np.round(contact.atoms[at1].coordinates,4), 279 | "fractional": np.round(contact.atoms[at1].fractional_coordinates,4) 280 | }, 281 | "bond_vectors": np.round(central_bond_vector,4), 282 | "reference_bond_vectors": np.round(central_bond_vector_r,4) 283 | }, 284 | "cc_contact_atom": { 285 | "atom": contact.atoms[at2].atomic_symbol, 286 | "fragment": contact_fragment[4:], 287 | "coordinates": { 288 | "cartesian": np.round(contact.atoms[at2].coordinates,4), 289 | "fractional": np.round(contact.atoms[at2].fractional_coordinates,4) 290 | }, 291 | "bond_vectors": np.round(contact_bond_vector,4), 292 | "reference_bond_vectors": np.round(contact_bond_vector_r,4), 293 | "reference_bond_vectors_spherical": 
np.round(contact_bond_vector_spherical,4) 294 | }, 295 | } 296 | structure["crystal"]["close_contacts"] = structure_contacts 297 | 298 | # Create the hydrogen bonds dictionary 299 | structure_hbonds = {} 300 | for hbond in structure["crystal"]["hbonds"]: 301 | # Get the donor atom 302 | hbond_atom_labels = [atom.label for atom in hbond.atoms] 303 | for bond in structure["molecule"]["bonds"]: 304 | if hbond_atom_labels[1] in bond: 305 | if hbond_atom_labels[0] in bond: 306 | hbond_donor = 0 307 | hbond_acceptor = 2 308 | if hbond_atom_labels[2] in bond: 309 | hbond_donor = 2 310 | hbond_acceptor = 0 311 | 312 | structure_hbonds['_'.join([hbond.atoms[hbond_donor].label,hbond.atoms[1].label,hbond.atoms[hbond_acceptor].label])] = { 313 | "hb_atoms": (hbond.atoms[hbond_donor].atomic_symbol,hbond.atoms[1].atomic_symbol,hbond.atoms[hbond_acceptor].atomic_symbol), 314 | "hb_length": np.round(hbond.length,4), 315 | "hb_da_distance": np.round(hbond.da_distance,4), 316 | "hb_angle": np.round(hbond.angle,4), 317 | "hb_is_in_los": hbond.is_in_line_of_sight, 318 | "hb_donor_coordinates": np.round(hbond.atoms[hbond_donor].coordinates,4), 319 | "hb_h_coordinates": np.round(hbond.atoms[1].coordinates,4), 320 | "hb_acceptor_coordinates": np.round(hbond.atoms[hbond_acceptor].coordinates,4), 321 | } 322 | structure["crystal"]["hbonds"] = structure_hbonds 323 | 324 | # Create the crystal dictionary 325 | structure_crystal = { 326 | "str_id": structure["crystal"]["ID"], 327 | "space_group": structure["crystal"]["space_group"], 328 | "z_crystal": structure["crystal"]["z_crystal"], 329 | "z_prime": structure["crystal"]["z_prime"], 330 | "formula": structure["crystal"]["formula"], 331 | "species": structure["crystal"]["species"], 332 | "cell_lengths": structure["crystal"]["cell_lengths"], 333 | "scaled_cell_lengths": structure["crystal"]["scaled_cell_lengths"], 334 | "cell_angles": structure["crystal"]["cell_angles"], 335 | "cell_volume": structure["crystal"]["cell_volume"], 336 | 
"cell_density": structure["crystal"]["cell_density"], 337 | "vdWFV": structure["crystal"]["vdWFV"], 338 | "SAS": structure["crystal"]["SAS"], 339 | "lattice_vectors": structure["crystal"]["lattice_vectors"], 340 | "lattice_energy": structure["crystal"]["lattice_energy"], 341 | "close_contacts": structure["crystal"]["close_contacts"], 342 | "hbonds": structure["crystal"]["hbonds"], 343 | } 344 | 345 | # Create the fragments dictionary 346 | structure_fragments = {} 347 | for fragment in structure["fragments"]: 348 | # Get the data for the atoms 349 | at_labels = structure["fragments"][fragment]["atoms_labels"] 350 | at_species = structure["fragments"][fragment]["atoms_species"] 351 | at_coordinates_c = structure["fragments"][fragment]["atoms_coordinates_c"] 352 | at_coordinates_f = structure["fragments"][fragment]["atoms_coordinates_f"] 353 | at_bond_vectors_c = structure["fragments"][fragment]["atoms_bond_vectors_c"] 354 | at_bond_vectors_f = structure["fragments"][fragment]["atoms_bond_vectors_f"] 355 | min_distance_to_zzp = structure["fragments"][fragment]["minimum_atom_distances_to_zzp_planes"] 356 | fragment_atoms = {} 357 | for label, species, coor_c, coor_f, bv_c, bv_f, d_min in zip(at_labels,at_species,at_coordinates_c,at_coordinates_f,at_bond_vectors_c,at_bond_vectors_f,min_distance_to_zzp): 358 | fragment_atoms[label] = { 359 | "species": species, 360 | "coordinates": { 361 | "cartesian": coor_c, 362 | "fractional": coor_f 363 | }, 364 | "bond_vectors": { 365 | "cartesian": bv_c, 366 | "fractional": bv_f 367 | }, 368 | "dzzp_min": d_min 369 | } 370 | 371 | # Get the data for the inertia planes 372 | fragment_inertia_planes = {} 373 | eigvecs_c = structure["fragments"][fragment]["rotation_matrix"] 374 | eigvecs_f = structure["fragments"][fragment]["principal_inertia_planes_f"] 375 | n_max_vectors = structure["fragments"][fragment]["n_max_vectors"] 376 | eigvecs_dmin = structure["fragments"][fragment]["principal_inertia_planes_distances_to_cell_points"] 377 
def get_structure_filter_data(input_parameters):
    """
    Creates a dictionary with structure information that can be used to rapidly
    filter structures for analysis and writes it to file.

    For every structure file in the structures folder the space group,
    z_crystal, z_prime, species, the distinct fragment names and the distinct
    close contact descriptors (atom pairs, central fragments and fragment
    pairs, each together with the contact type and the line of sight flag)
    are collected under the structure id.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the input parameters for the search. Only the
        "data_prefix" entry is read here; the complete dictionary is passed
        to io_operations.write_structures_filter_data.

    Returns
    -------
    None.

    """
    def add_unique(items, item):
        # Keep the order of first appearance, lists are not hashable
        if item not in items:
            items.append(item)

    # Set the files to read and write data
    db_folder = "../csd_db_analysis/db_data/"
    prefix = input_parameters["data_prefix"]
    db_structures_folder = db_folder + "_".join([prefix,"structures"]) + "/"

    # Read the structures list. os.listdir returns the entries in arbitrary
    # order, so they are sorted to get a reproducible output. Entries that are
    # not json files (e.g. hidden system files) are ignored because json.load
    # would fail on them.
    structures_list = sorted(name for name in os.listdir(db_structures_folder) if name.endswith(".json"))

    # Get the structure filter data
    structures_filter_data = {}
    for structure in structures_list:
        with open(os.path.join(db_structures_folder, structure), "r") as f:
            structure_data = json.load(f)

        structure_crystal = structure_data["crystal"]
        structure_fragments = structure_data["fragments"]
        structure_contacts = structure_crystal["close_contacts"]

        # Distinct fragment names in the structure
        fragments = []
        for fragment in structure_fragments:
            add_unique(fragments, structure_fragments[fragment]["fragment"])

        # Distinct contact descriptors in the structure
        contact_pairs = []
        contact_central_fragments = []
        contact_fragment_pairs = []
        for contact in structure_contacts:
            contact_data = structure_contacts[contact]
            central_atom = contact_data["cc_central_atom"]
            contact_atom = contact_data["cc_contact_atom"]
            contact_properties = [contact_data["cc_type"], contact_data["cc_is_in_los"]]

            add_unique(contact_pairs, [central_atom["atom"], contact_atom["atom"]] + contact_properties)
            add_unique(contact_central_fragments, [central_atom["fragment"]] + contact_properties)
            add_unique(contact_fragment_pairs, [central_atom["fragment"], contact_atom["fragment"]] + contact_properties)

        structures_filter_data[structure_crystal["str_id"]] = {
            "space_group": structure_crystal["space_group"],
            "z_crystal": structure_crystal["z_crystal"],
            "z_prime": structure_crystal["z_prime"],
            "species": structure_crystal["species"],
            "fragments": fragments,
            "contact_pairs": contact_pairs,
            "contact_central_fragments": contact_central_fragments,
            "contact_fragment_pairs": contact_fragment_pairs
            }

    # Convert data to json format
    structures_filter_data = convert_to_json(structures_filter_data)

    # Write data to file
    io_operations.write_structures_filter_data(input_parameters,structures_filter_data)

    return
9 | reference_structures : dict 10 | The unique reference structures calculated based on the user defined 11 | criteria 12 | 13 | Returns 14 | ------- 15 | structures_list : dict 16 | A dictionary with the structures to analyze 17 | ''' 18 | # Create the structures list 19 | structures_list = {} 20 | if input_parameters["structure_list"][0] in ["csd-all","csd-unique"]: 21 | if input_parameters["structure_list"][0] == "csd-all": 22 | if input_parameters["structure_list"][1] == "all": 23 | for family in reference_structures: 24 | for group in reference_structures[family]: 25 | for structure in group: 26 | structures_list[structure] = {} 27 | else: 28 | target_families = [families[0] for families in input_parameters["structure_list"][1]] 29 | target_families_structures = [families[1] for families in input_parameters["structure_list"][1]] 30 | for target_family, target_structures in zip(target_families, target_families_structures): 31 | if target_structures == "all": 32 | for group in reference_structures[target_family]: 33 | for structure in group: 34 | structures_list[structure] = {} 35 | 36 | else: 37 | structure_indices = [str(target_structure).zfill(2) if target_structure != 0 else '' for target_structure in target_structures ] 38 | for index in structure_indices: 39 | if target_family + index not in [structure for target_family in reference_structures for group in reference_structures[target_family] for structure in group]: 40 | print(f'Structure {target_family + index} is not found in reference structures and will be excluded from the data extraction process.') 41 | continue 42 | structures_list[target_family + index] = {} 43 | 44 | if input_parameters["structure_list"][0] == "csd-unique": 45 | if input_parameters["structure_list"][1] == "all": 46 | for family in reference_structures: 47 | for structure in reference_structures[family]: 48 | structures_list[structure] = {} 49 | else: 50 | target_families = [families[0] for families in 
input_parameters["structure_list"][1]] 51 | target_families_structures = [families[1] for families in input_parameters["structure_list"][1]] 52 | for target_family, target_structures in zip(target_families, target_families_structures): 53 | if target_structures == "all": 54 | for structure in reference_structures[target_family]: 55 | structures_list[structure] = {} 56 | 57 | else: 58 | structure_indices = [str(target_structure).zfill(2) if target_structure != 0 else '' for target_structure in target_structures ] 59 | for index in structure_indices: 60 | if target_family + index not in reference_structures[target_family]: 61 | print(f'Structure {target_family + index} is not found in reference structures and will be excluded from the data extraction process.') 62 | continue 63 | structures_list[target_family + index] = {} 64 | 65 | elif input_parameters["structure_list"][0] == "cif": 66 | for family in reference_structures: 67 | for structure in reference_structures[family]: 68 | structures_list[structure] = {} 69 | 70 | return structures_list -------------------------------------------------------------------------------- /source_code/input_checks.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # Define a function to check if the given variables are boolean 4 | def check_boolean_variables(variables): 5 | for var_name, value in variables.items(): 6 | if not isinstance(value, bool): 7 | print(f"Error: The variable '{var_name}' must be a boolean (True or False).") 8 | sys.exit(1) 9 | 10 | # Define a function to check if the given variables are integers 11 | def check_integer_variables(variables): 12 | for var_name, value in variables.items(): 13 | if not isinstance(value, int): 14 | print(f"Error: The variable '{var_name}' must be an integer (True or False).") 15 | sys.exit(1) 16 | 17 | # Define functions to check if the single value variables get values from their respective lists 18 | def 
# Define functions to check if the list variables get values from their respective lists
def check_list_variables(variables):
    """
    Verify that every entry of each list variable is an allowed value.

    Parameters
    ----------
    variables : dict
        Mapping of variable names to a pair whose first item holds the
        values given by the user and whose second item holds the allowed
        values.

    Returns
    -------
    None
        Prints an error message and terminates the program with exit
        status 1 as soon as a variable holds a value that is not allowed.
    """
    for var_name, values in variables.items():
        given, allowed = values[0], values[1]

        # Nothing to report when every given entry is an allowed one
        if all(entry in allowed for entry in given):
            continue

        print(f"Error: The variable '{var_name}' has an invalid value '{values[0]}'. Allowed values are {values[1]}.")
        sys.exit(1)
def check_for_file(save_dir,filename):
    """
    Open a file for writing, asking for confirmation before an existing
    file is overwritten.

    Parameters
    ----------
    save_dir : str
        The directory of the file. It is concatenated with ``filename`` as
        is, so it has to end with a path separator.
    filename : str
        The name of the file.

    Returns
    -------
    file object or None
        The file opened in write mode, or None when the user declines to
        overwrite an existing file or when the file cannot be opened. The
        caller is responsible for closing the returned file.
    """
    file_path = save_dir + filename

    if os.path.exists(file_path):
        # If the file exists, ask for confirmation to overwrite
        user_input = input(f'File {filename} already exists. Do you want to overwrite it? (yes/no): ').strip().lower()

        # The answer is lower-cased above, so only lower-case options can match
        if user_input not in ('yes', 'y'):
            print(f'\tWARNING! Exiting without overwriting the file.\n\tNo data will be written in file: {filename}')
            return None

    # If the file doesn't exist or if overwrite is confirmed, open and return the file object
    try:
        # Writing mode also creates the file if it doesn't exist
        return open(file_path, 'w')
    except (OSError, ValueError) as e:
        print(f'An error occurred: {e}')
        return None
def write_structures_filter_data(input_parameters,structures_filter_data):
    """
    Writes compact structure data for the filtering step.

    Parameters
    ----------
    input_parameters : dict
        A dictionary with the user defined input data. Only the
        "data_prefix" entry is read here.
    structures_filter_data : str
        The compact structure data, already serialized to a string (it is
        passed directly to the ``write`` method of a text file).

    Returns
    -------
    None

    """
    # Set the file name
    db_folder = "../csd_db_analysis/db_data/"
    prefix = input_parameters["data_prefix"]
    structures_filter_data_file = check_for_file(db_folder, prefix + "_structures_filter_data.json")

    # check_for_file returns None when the user declines to overwrite an
    # existing file or when the file cannot be opened; nothing to write then.
    if structures_filter_data_file is None:
        return

    # Write data; the context manager closes the file even if writing fails
    with structures_filter_data_file:
        structures_filter_data_file.write(structures_filter_data)

    return
def align_structures(R, A, B):
    """
    Rotate structure A with R and measure how far it ends up from B.

    Parameters
    ----------
    R: 3x3 rotation matrix, applied to A from the right (A @ R)
    A: Nx3 matrix of coordinates for molecule A
    B: Nx3 matrix of coordinates for molecule B

    Returns
    -------
    rmsd: root mean square deviation between the rotated A and B
    """
    # Bring A into the orientation of B
    rotated = A @ R

    # Squared displacement of every atom, then the root of their mean
    squared_displacements = np.sum((rotated - B)**2, axis=1)
    return np.sqrt(np.mean(squared_displacements))
### DISTANCE OPERATIONS #######################################################
def distance_to_plane(point,plane_normal,plane_point,normal=False):
    """
    Calculate the distance from a point to a plane defined by
    a normal vector and a point on the plane.

    Parameters
    ----------
    point : numpy.ndarray
        The point whose distance to the plane is requested.
    plane_normal : numpy.ndarray
        A vector normal to the plane.
    plane_point : numpy.ndarray
        Any point lying on the plane.
    normal : bool
        When True the projection on plane_normal is returned as is (no
        division by the length of plane_normal); when False the projection
        is divided by the length of plane_normal.

    Returns
    -------
    The non-negative distance of the point to the plane.
    """
    projection = np.abs(np.dot(plane_normal, point - plane_point))
    if normal:
        return projection
    return projection / np.linalg.norm(plane_normal)

def distance_to_zzp_planes_family(point,plane_normal,plane_norm):
    """
    Calculate the distance from a point to the nearest plane of a family of
    zzp planes defined by a normal vector.

    Parameters
    ----------
    point : numpy.ndarray
        The point whose distance to the family of planes is requested.
    plane_normal : numpy.ndarray
        A vector normal to the planes of the family.
    plane_norm : float
        The value used to scale the plane spacing (0.25 / plane_norm).

    Returns
    -------
    The distance to the closest plane, between 0 and 0.125 / plane_norm.
    """
    spacing = 0.25 / plane_norm

    # Fold the distance to the plane through the origin into a single spacing
    offset = distance_to_plane(point,plane_normal,np.array([0.0,0.0,0.0])) % spacing

    # Past the midpoint the next plane of the family is the closer one
    if offset > 0.125 / plane_norm:
        return spacing - offset
    return offset
def ensure_right_handed_coordinate_system(vectors):
    """
    Make a set of three column vectors right-handed.

    Parameters
    ----------
    vectors : numpy.ndarray
        2D array where each column is an eigenvector. The array is
        modified in place when the system is left-handed.

    Returns
    -------
    vectors : numpy.ndarray
        The same array, with the third column negated if the input formed
        a left-handed system.
    """
    first, second, third = vectors[:, 0], vectors[:, 1], vectors[:, 2]

    # A negative scalar triple product marks a left-handed system
    handedness = np.dot(first, np.cross(second, third))
    if handedness < 0:
        # Flipping the third vector switches the handedness
        vectors[:, 2] = -vectors[:, 2]

    return vectors
def center_of_mass(mass,pos):
    '''
    Computes the mass-weighted average position of a configuration of atoms

    Parameters
    ----------
    mass : numpy.ndarray
        An array with the mass of the atoms
    pos : numpy.ndarray
        An array with the positions of the atoms (one row per atom)

    Returns
    -------
    numpy.ndarray
        The center of mass of the configuration (3)
    '''
    # Weight every position by the mass of its atom
    weighted_positions = mass[:,np.newaxis] * pos
    total_mass = np.sum(mass)

    return np.sum(weighted_positions, axis = 0) / total_mass
def vectors_closest_to_perpendicular(I, n_max):
    """
    For each vector v in I, find the two proposed vectors that are closest
    to being perpendicular to v.

    Parameters
    ----------
    I : list
        List of inquiry vectors.
    n_max : int
        The maximum absolute value for the components of the proposed
        vectors; it is passed to generate_proposed_eigenvectors.

    Returns
    -------
    A list of tuples (v_i, [(w_i, a_i), (w_j, a_j)]), where v_i is a vector
    from I, and w_i, w_j are the proposed vectors that are closest to be
    perpendicular to v_i, with a_i, a_j the respective angles between v_i
    and w_i, w_j in degrees (rounded to two decimals).
    """
    # Generate the set of the proposed vectors
    proposed_vectors = generate_proposed_eigenvectors(n_max)

    # Convert the list of vectors into numpy arrays for easier computation
    I_array = np.array(I, dtype=np.float64)
    proposed_array = np.array(proposed_vectors, dtype=np.float64)

    # Normalize the vectors, since we're interested in the angle between them.
    I_norms = np.linalg.norm(I_array, axis=1, keepdims=True)
    proposed_norms = np.linalg.norm(proposed_array, axis=1, keepdims=True)

    # 'where' skips zero-length vectors and 'out' leaves zeros in their place,
    # so no division by zero takes place.
    I_array = np.divide(I_array, I_norms, out=np.zeros_like(I_array), where=I_norms!=0)
    proposed_array = np.divide(proposed_array, proposed_norms, out=np.zeros_like(proposed_array), where=proposed_norms!=0)

    # cdist returns the cosine *distance* (1 - cos), so convert it back to the
    # cosine of the angle. Subtracting 1 from the distance instead gives
    # -cos, which turns every angle into its supplement (180 - angle).
    cos_similarities = 1.0 - cdist(I_array, proposed_array, metric='cosine')

    # The vectors closest to perpendicular have the cosine closest to zero;
    # keep the two best candidates for every inquiry vector.
    closest_to_perpendicular_indices = np.argsort(np.abs(cos_similarities), axis=1)[:,:2]

    # Prepare a list to store the inquiry vectors with their closest candidates
    closest_to_perpendicular_vectors = []

    for i, indices in enumerate(closest_to_perpendicular_indices):
        v_i = I[i]
        results_for_v = []

        for index in indices:
            w_i = proposed_vectors[index]

            # Rounding errors may push the cosine marginally outside [-1, 1],
            # where arccos returns NaN, so clip before converting to degrees.
            cos_similarity = np.clip(cos_similarities[i, index], -1.0, 1.0)
            angle = np.round(np.arccos(cos_similarity) * 180.0 / np.pi,2)

            results_for_v.append((w_i, angle))

        # Append the pair of closest vectors along with their angles relative to v_i
        closest_to_perpendicular_vectors.append((v_i, results_for_v))
    return closest_to_perpendicular_vectors
def parse_symmetry_operation(op):
    '''
    Parse a symmetry operation string into a transformation function.

    Parameters
    ----------
    op : str
        The symmetry operation, a comma separated list of arithmetic
        expressions in x, y and z.

    Returns
    -------
    function
        A function that takes a coordinate triplet and returns the
        transformed coordinates as a numpy.ndarray.
    '''
    # Replace fractional coordinate symbols with array references
    expression = op.replace('x', 'coord[0]').replace('y', 'coord[1]').replace('z', 'coord[2]')

    # Compile once here instead of re-parsing the string for every position
    code = compile(expression, '<symmetry_operation>', 'eval')

    # NOTE(review): eval executes arbitrary expressions. Builtins are disabled
    # so only arithmetic on 'coord' is available, but the operation strings
    # should still come from a trusted source only.
    return lambda coord: np.array(eval(code, {'__builtins__': {}}, {'coord': coord}))

def apply_symmetry_operations(fractional_positions,symmetry_operation,translation=True):
    '''
    Apply a symmetry operation to the fractional coordinates of a molecule.

    Parameters
    ----------
    fractional_positions : numpy.ndarray
        The fractional coordinates of the atoms in the reference molecule.
    symmetry_operation : str
        The symmetry operation for the symmetric molecule
    translation : bool
        When False, the translation coefficients listed below are replaced
        by 0.0 before the operation is applied.

    Returns
    -------
    symmetric_positions : numpy.ndarray
        The fractional coordinates of the atoms in the symmetric molecule
    '''
    if not translation:
        # NOTE(review): only these coefficients are removed; other spellings
        # (e.g. "1/2") would be kept - confirm against the operation format.
        for translation_coefficient in ["0.25","0.5","0.75","1.0","1/3","2/3","5/6"]:
            symmetry_operation = symmetry_operation.replace(translation_coefficient,"0.0")

    # Parse the symmetry operation
    transformation = parse_symmetry_operation(symmetry_operation)

    # Apply the transformation to each fractional position
    symmetric_positions = np.array([transformation(pos) for pos in fractional_positions])

    return symmetric_positions
def get_atoms_from_formula(formula):
    '''
    Counts the atoms of every species that appears in a molecular formula

    Parameters
    ----------
    formula : str
        The molecular formula.

    Returns
    -------
    species_counts : dict
        A dictionary with the count for each species.
    n_atoms : int
        The total number of atoms in the molecule.
    n_heavy_atoms : int
        The total number of atoms other than hydrogen ('H').
    '''
    species_counts = {}
    n_heavy_atoms = 0

    # Each match is a species symbol followed by an optional count
    for match in re.finditer(r'([A-Z][a-z]?)(\d*)', formula):
        species, digits = match.groups()

        # A missing count stands for a single atom
        multiplicity = int(digits) if digits else 1

        # Repeated species accumulate in the same entry
        species_counts[species] = species_counts.get(species, 0) + multiplicity

        if species != 'H':
            n_heavy_atoms += multiplicity

    n_atoms = sum(species_counts.values())

    return species_counts, n_atoms, n_heavy_atoms
def get_lattice_vectors(cell_lengths,cell_angles,cell_volume,inverse=False):
    '''
    Calculates and returns one of the coordinate transformation matrices of
    the unit cell.

    Parameters
    ----------
    cell_lengths : numpy.ndarray
        The cell lengths (a, b, c) of the unit cell.
    cell_angles : numpy.ndarray
        The cell angles (alpha, beta, gamma) of the unit cell in degrees.
        Any sequence of three numbers is accepted.
    cell_volume : float
        The volume of the unit cell.
    inverse : bool
        Selects the matrix that is returned (see Returns).

    Returns
    -------
    numpy.ndarray
        A (3,3) matrix. With inverse=False its rows are the lattice
        vectors (a, 0, 0), (b cos(gamma), b sin(gamma), 0) and the c
        vector. With inverse=True it is the inverse of that matrix.
    '''
    # Set the individual cell lengths and angles in radians. The conversion
    # to an array allows plain lists/tuples to be passed for the angles.
    a, b, c = cell_lengths
    alpha, beta, gamma = np.asarray(cell_angles, dtype=float) * np.pi / 180.0

    # Calculate trigonometric numbers for the angles
    cos_alpha = np.cos(alpha)
    cos_beta = np.cos(beta)
    cos_gamma = np.cos(gamma)
    sin_gamma = np.sin(gamma)

    if inverse:
        return np.array([[1.0 / a, -cos_gamma / a / sin_gamma, b * c * (cos_alpha * cos_gamma - cos_beta) / cell_volume / sin_gamma],
                         [    0.0,     1.0 / b / sin_gamma,    a * c * (cos_beta * cos_gamma - cos_alpha) / cell_volume / sin_gamma],
                         [    0.0,                     0.0,    a * b * sin_gamma / cell_volume]]).T

    return np.array([[   a, b * cos_gamma, c * cos_beta],
                     [ 0.0, b * sin_gamma, c * (cos_alpha - cos_beta * cos_gamma) / sin_gamma],
                     [ 0.0,           0.0, cell_volume / a / b / sin_gamma]]).T
class NumpyArrayEncoder(json.JSONEncoder):
    '''
    Custom encoder for numpy data types

    Arrays are serialized as (nested) lists and numpy scalars as the
    corresponding Python scalars; every other object is handed to the
    default JSON encoder.
    '''
    def default(self, obj):
        # numpy is not imported at the top of this module, so it is
        # imported here to keep the name available to the encoder.
        import numpy as np

        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # numpy scalars (np.int64, np.float32, np.bool_, ...) are not JSON
        # serializable either; item() returns the plain Python value.
        if isinstance(obj, np.generic):
            return obj.item()
        # Let the base class raise the TypeError for unsupported objects
        return json.JSONEncoder.default(self, obj)
59 | ''' 60 | json_str = json.dumps(data, cls=NumpyArrayEncoder, indent=4) 61 | formatted_json = format_lists(json_str) 62 | return formatted_json -------------------------------------------------------------------------------- /source_data/atomic_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "H": { 3 | "atomic_number": 1, 4 | "atomic_mass": 1.008, 5 | "covalent_radius": 0.31, 6 | "van_der_waals_radius": 1.20, 7 | "electronegativity": 2.20, 8 | "other_properties": {} 9 | }, 10 | "C": { 11 | "atomic_number": 6, 12 | "atomic_mass": 12.011, 13 | "covalent_radius": 0.76, 14 | "van_der_waals_radius": 1.70, 15 | "electronegativity": 2.55, 16 | "other_properties": {} 17 | }, 18 | "N": { 19 | "atomic_number": 7, 20 | "atomic_mass": 14.007, 21 | "covalent_radius": 0.71, 22 | "van_der_waals_radius": 1.55, 23 | "electronegativity": 3.04, 24 | "other_properties": {} 25 | }, 26 | "O": { 27 | "atomic_number": 8, 28 | "atomic_mass": 15.999, 29 | "covalent_radius": 0.66, 30 | "van_der_waals_radius": 1.52, 31 | "electronegativity": 3.44, 32 | "other_properties": {} 33 | }, 34 | "F": { 35 | "atomic_number": 9, 36 | "atomic_mass": 18.998, 37 | "covalent_radius": 0.57, 38 | "van_der_waals_radius": 1.47, 39 | "electronegativity": 3.98, 40 | "other_properties": {} 41 | }, 42 | "Cl": { 43 | "atomic_number": 17, 44 | "atomic_mass": 35.453, 45 | "covalent_radius": 0.99, 46 | "van_der_waals_radius": 1.75, 47 | "electronegativity": 3.16, 48 | "other_properties": {} 49 | }, 50 | "Br": { 51 | "atomic_number": 35, 52 | "atomic_mass": 79.904, 53 | "covalent_radius": 1.14, 54 | "van_der_waals_radius": 1.85, 55 | "electronegativity": 2.96, 56 | "other_properties": {} 57 | }, 58 | "S": { 59 | "atomic_number": 16, 60 | "atomic_mass": 32.06, 61 | "covalent_radius": 1.02, 62 | "van_der_waals_radius": 1.80, 63 | "electronegativity": 2.58, 64 | "other_properties": {} 65 | } 66 | } 67 | 
-------------------------------------------------------------------------------- /source_data/close_contacts_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "BrBr": {"vdW": 3.97, "HBond": 3.97, "Tolerance": 0.0}, 3 | "BrC": {"vdW": 3.88, "HBond": 3.88, "Tolerance": 0.0}, 4 | "BrCl": {"vdW": 3.87, "HBond": 3.87, "Tolerance": 0.0}, 5 | "BrF": {"vdW": 3.59, "HBond": 3.59, "Tolerance": 0.0}, 6 | "BrH": {"vdW": 3.29, "HBond": 2.43, "Tolerance": 0.0}, 7 | "BrN": {"vdW": 3.72, "HBond": 3.72, "Tolerance": 0.0}, 8 | "BrO": {"vdW": 3.65, "HBond": 3.65, "Tolerance": 0.0}, 9 | "BrS": {"vdW": 3.93, "HBond": 3.93, "Tolerance": 0.0}, 10 | "CC": {"vdW": 3.75, "HBond": 3.75, "Tolerance": 0.50}, 11 | "CCl": {"vdW": 3.78, "HBond": 3.78, "Tolerance": 0.50}, 12 | "CF": {"vdW": 3.48, "HBond": 3.48, "Tolerance": 0.0}, 13 | "CH": {"vdW": 3.21, "HBond": 3.21, "Tolerance": 0.50}, 14 | "CN": {"vdW": 3.61, "HBond": 3.61, "Tolerance": 0.50}, 15 | "CO": {"vdW": 3.54, "HBond": 3.54, "Tolerance": 0.50}, 16 | "CS": {"vdW": 3.84, "HBond": 3.84, "Tolerance": 0.50}, 17 | "ClCl": {"vdW": 3.78, "HBond": 3.78, "Tolerance": 0.50}, 18 | "ClF": {"vdW": 3.50, "HBond": 3.50, "Tolerance": 0.0}, 19 | "ClH": {"vdW": 3.19, "HBond": 2.24, "Tolerance": 0.50}, 20 | "ClN": {"vdW": 3.63, "HBond": 3.14, "Tolerance": 0.50}, 21 | "ClO": {"vdW": 3.58, "HBond": 3.14, "Tolerance": 0.50}, 22 | "ClS": {"vdW": 3.84, "HBond": 3.74, "Tolerance": 0.0}, 23 | "FF": {"vdW": 3.17, "HBond": 3.17, "Tolerance": 0.0}, 24 | "FH": {"vdW": 2.89, "HBond": 2.89, "Tolerance": 0.0}, 25 | "FN": {"vdW": 3.34, "HBond": 2.70, "Tolerance": 0.0}, 26 | "FO": {"vdW": 3.29, "HBond": 2.61, "Tolerance": 0.0}, 27 | "FS": {"vdW": 3.57, "HBond": 3.57, "Tolerance": 0.0}, 28 | "HH": {"vdW": 2.73, "HBond": 2.73, "Tolerance": 0.50}, 29 | "HN": {"vdW": 3.01, "HBond": 2.08, "Tolerance": 0.50}, 30 | "HO": {"vdW": 2.87, "HBond": 1.96, "Tolerance": 0.50}, 31 | "HS": {"vdW": 3.27, "HBond": 2.56, "Tolerance": 
0.50}, 32 | "NN": {"vdW": 3.50, "HBond": 3.02, "Tolerance": 0.50}, 33 | "NO": {"vdW": 3.37, "HBond": 2.87, "Tolerance": 0.50}, 34 | "NS": {"vdW": 3.70, "HBond": 3.41, "Tolerance": 0.50}, 35 | "OO": {"vdW": 3.37, "HBond": 2.75, "Tolerance": 0.50}, 36 | "OS": {"vdW": 3.65, "HBond": 3.30, "Tolerance": 0.50}, 37 | "SS": {"vdW": 3.87, "HBond": 3.87, "Tolerance": 0.50} 38 | } -------------------------------------------------------------------------------- /source_data/fragment_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "1-2-dichlorobenzene": { 3 | "smarts": "c1(Cl)c(Cl)cccc1", 4 | "species": ["C","Cl","C","Cl","C","C","C","C"], 5 | "coordinates": [ 6 | [ 1.3750, 0.0000, 0.0000], 7 | [ 3.1150, 0.0000, 0.0000], 8 | [ 0.6875, 1.1908, 0.0000], 9 | [ 1.5575, 2.6977, 0.0000], 10 | [-0.6875, 1.1908, 0.0000], 11 | [-1.3750, 0.0000, 0.0000], 12 | [-0.6875,-1.1908, 0.0000], 13 | [ 0.6875,-1.1908, 0.0000]], 14 | "mass": [12.0107, 35.430, 12.0107, 35.430, 12.0107, 12.0107, 12.0107, 12.0107], 15 | "atoms_to_align": "all"}, 16 | "1-3-4-thiadozole": { 17 | "smarts": "[SX2r5]1[CX3r5]=[NX2r5][NX2r5]=[CX3r5]1", 18 | "species": ["S","C","N","N","C"], 19 | "coordinates": [ 20 | [ 1.2445, 0.0000, 0.0000], 21 | [ 0.0000, 1.2018, 0.0000], 22 | [-1.2059, 0.6649, 0.0000], 23 | [-1.2059,-0.6649, 0.0000], 24 | [ 0.0000,-1.2018, 0.0000]], 25 | "mass": [32.0650, 12.0107, 14.0067, 14.0067, 12.0107], 26 | "atoms_to_align": "all"}, 27 | "1-methylimidazole_aliphatic": { 28 | "smarts": "C1=CN=CN([#6])1", 29 | "species": ["C","C","N","C","N","C"], 30 | "coordinates": [ 31 | [-0.0064,-0.7598, 0.0000], 32 | [ 0.0000, 0.6406, 0.0000], 33 | [ 1.2851, 1.1007, 0.0000], 34 | [ 2.0137, 0.0000, 0.0000], 35 | [ 1.2553,-1.1458, 0.0000], 36 | [ 1.7228,-2.5268, 0.0000]], 37 | "mass": [12.0107, 12.0107, 14.0067, 12.0107, 14.0067, 12.0107], 38 | "atoms_to_align": "all"}, 39 | "1-methylimidazole_aromatic": { 40 | "smarts": "c1cN=CN([#6])1", 41 | "species": 
["C","C","N","C","N","C"], 42 | "coordinates": [ 43 | [-0.0064,-0.7598, 0.0000], 44 | [ 0.0000, 0.6406, 0.0000], 45 | [ 1.2851, 1.1007, 0.0000], 46 | [ 2.0137, 0.0000, 0.0000], 47 | [ 1.2553,-1.1458, 0.0000], 48 | [ 1.7228,-2.5268, 0.0000]], 49 | "mass": [12.0107, 12.0107, 14.0067, 12.0107, 14.0067, 12.0107], 50 | "atoms_to_align": "all"}, 51 | "2-amino-5-methyl-3-thiophenecarbonitrile": { 52 | "smarts": "CC1=CC(=C(S1)N)C#N", 53 | "species": ["C","C","C","C","C","S","N","C","N"], 54 | "coordinates": [ 55 | [ 0.3605, 2.6830, 0.0000], 56 | [ 0.0000, 1.2373, 0.0000], 57 | [-1.2610, 0.7100, 0.0000], 58 | [-1.2610,-0.7100, 0.0000], 59 | [ 0.0000,-1.2373, 0.0000], 60 | [ 1.1948, 0.0000, 0.0000], 61 | [ 0.3387,-2.5957, 0.0000], 62 | [-2.4179,-1.5505, 0.0000], 63 | [-3.3402,-2.2206, 0.0000]], 64 | "mass": [12.0107,12.0107,12.0107,12.0107,12.0107,32.065,14.0067,12.0107,14.0067], 65 | "atoms_to_align": "all"}, 66 | "2-amino-5-methylpyridine": { 67 | "smarts": "Nc1[nr6]cc([#6])cc1", 68 | "species": ["N","C","N","C","C","C","C","C"], 69 | "coordinates": [ 70 | [ 2.7650, 0.0000, 0.0000], 71 | [ 1.3750, 0.0000, 0.0000], 72 | [ 0.6875, 1.1908, 0.0000], 73 | [-0.6875, 1.1908, 0.0000], 74 | [-1.3750, 0.0000, 0.0000], 75 | [-2.8650, 0.0000, 0.0000], 76 | [-0.6875,-1.1908, 0.0000], 77 | [ 0.6875,-1.1908, 0.0000]], 78 | "mass": [14.0067, 12.0107, 14.0067, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107], 79 | "atoms_to_align": "all"}, 80 | "3,4-dihydro-6-methyl-2H-1-benzothiopyran-1,1-dioxide": { 81 | "smarts": "c1(cc(C)cc2)c2S(=O)(=O)CCC1", 82 | "species": ["C","C","C","C","C","C","C","S","O","O","C","C","C"], 83 | "coordinates": [ 84 | [-0.4945, 1.0391, 0.3642], 85 | [-1.4765, 1.2151, 1.3522], 86 | [-2.0765, 0.1971, 2.0502], 87 | [-3.0915, 0.5341, 3.1202], 88 | [-1.6695,-1.1089, 1.7352], 89 | [-0.7375,-1.3329, 0.7352], 90 | [-0.1485,-0.2669, 0.0562], 91 | [ 1.1135,-0.6809,-1.0898], 92 | [ 2.3305,-0.8849,-0.3648], 93 | [ 0.6555,-1.7579,-1.9438], 94 | [ 1.2625, 0.8221,-2.0138], 95 | [ 
1.3835, 2.0051,-1.0808], 96 | [ 0.0975, 2.2351,-0.3338]], 97 | "mass": [12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 32.0650, 15.9994, 15.9994, 12.0107, 12.0107, 12.0107], 98 | "atoms_to_align": [0,1,2,3,4,5,6,7,10,11,12]}, 99 | "3-fluoro-2,6-dichlorotoluene": { 100 | "smarts": "[#6]c1c(Cl)c(F)ccc(Cl)1", 101 | "species": ["C","C","C","Cl","C","F","C","C","C","Cl"], 102 | "coordinates": [ 103 | [ 2.8650, 0.0000, 0.0000], 104 | [ 1.3750, 0.0000, 0.0000], 105 | [ 0.6875, 1.1908, 0.0000], 106 | [ 1.5575, 2.6977, 0.0000], 107 | [-0.6875, 1.1908, 0.0000], 108 | [-1.3269, 2.3599, 0.0000], 109 | [-1.3750, 0.0000, 0.0000], 110 | [-0.6875,-1.1908, 0.0000], 111 | [ 0.6875,-1.1908, 0.0000], 112 | [ 1.5575,-2.6977, 0.0000]], 113 | "mass": [12.0107, 12.0107, 12.0107, 35.4530, 12.0107, 18.9980, 12.0107, 12.0107, 12.0107, 35.4530], 114 | "atoms_to_align": "all"}, 115 | "3-methylpyridine_aliphatic": { 116 | "smarts": "[CH3]c1c[nr6]c(C)cc1", 117 | "species": ["C","C","C","N","C","C","C","C"], 118 | "coordinates": [ 119 | [ 2.8650, 0.0000, 0.0000], 120 | [ 1.3750, 0.0000, 0.0000], 121 | [ 0.6875, 1.1908, 0.0000], 122 | [-0.6875, 1.1908, 0.0000], 123 | [-1.3750, 0.0000, 0.0000], 124 | [-1.8650, 0.0000, 0.0000], 125 | [-0.6875,-1.1908, 0.0000], 126 | [ 0.6875,-1.1908, 0.0000]], 127 | "mass": [12.0107, 12.0107, 12.0107, 14.0067, 12.0107, 12.0107, 12.0107, 12.0107], 128 | "atoms_to_align": "all"}, 129 | "3-methylpyridine_aromatic": { 130 | "smarts": "[CH3]c1c[nr6]c(c)cc1", 131 | "species": ["C","C","C","N","C","C","C","C"], 132 | "coordinates": [ 133 | [ 2.8650, 0.0000, 0.0000], 134 | [ 1.3750, 0.0000, 0.0000], 135 | [ 0.6875, 1.1908, 0.0000], 136 | [-0.6875, 1.1908, 0.0000], 137 | [-1.3750, 0.0000, 0.0000], 138 | [-2.7500, 0.0000, 0.0000], 139 | [-0.6875,-1.1908, 0.0000], 140 | [ 0.6875,-1.1908, 0.0000]], 141 | "mass": [12.0107, 12.0107, 12.0107, 14.0067, 12.0107, 12.0107, 12.0107, 12.0107], 142 | "atoms_to_align": "all"}, 143 | "4-aminopiperidine_chair": { 144 | 
"smarts": "N[CH]1[CH2][CH2][NH][CH2][CH2]1", 145 | "species": ["N","C","C","C","N","C","C"], 146 | "coordinates": [ 147 | [-1.4300, 0.0000, 0.0000], 148 | [ 0.0000, 0.0000, 0.0000], 149 | [ 0.6256, 1.2862,-0.5655], 150 | [ 2.1636, 1.2862,-0.5655], 151 | [ 2.7892, 0.0000,-1.1310], 152 | [ 2.1636,-1.2862,-0.5655], 153 | [ 0.6256,-1.2862,-0.5655]], 154 | "mass": [14.0067, 12.0107, 12.0107, 12.0107, 14.0067, 12.0107, 12.0107], 155 | "atoms_to_align": [0,1,4]}, 156 | "4-aminopiperidine_boat": { 157 | "smarts": "N[CH]1[CH2][CH2][NH][CH2][CH2]1", 158 | "species": ["N","C","C","C","N","C","C"], 159 | "coordinates": [ 160 | [-1.4300, 0.0000, 0.0000], 161 | [ 0.0000, 0.0000, 0.0000], 162 | [ 0.6256, 1.2862,-0.5655], 163 | [ 2.1636, 1.2862,-0.5655], 164 | [ 2.7892, 0.0000, 0.0000], 165 | [ 2.1636,-1.2862,-0.5655], 166 | [ 0.6256,-1.2862,-0.5655]], 167 | "mass": [14.0067, 12.0107, 12.0107, 12.0107, 14.0067, 12.0107, 12.0107], 168 | "atoms_to_align": [0,1,4]}, 169 | "4-aminopyridamine": { 170 | "smarts": "[NH2]c1[nr6]c[nr6]cc1", 171 | "species": ["N","C","N","C","N","C","C"], 172 | "coordinates": [ 173 | [-1.0691, 2.7106, 0.0000], 174 | [-1.1604, 1.3461, 0.0000], 175 | [-2.2179, 0.5199, 0.0000], 176 | [-2.1823,-0.8127, 0.0000], 177 | [-1.0594,-1.5475, 0.0000], 178 | [-0.0064,-0.7598, 0.0000], 179 | [ 0.0000, 0.6406, 0.0000]], 180 | "mass": [14.0067, 12.0107, 14.0067, 12.0107, 14.0067, 12.0107, 12.0107], 181 | "atoms_to_align": "all"}, 182 | "4-hydroxyquinoline": { 183 | "smarts": "c1(cccc2)c2C(=O)C=CN1", 184 | "species": ["C","C","C","C","C","C","C","O","C","C","N"], 185 | "coordinates": [ 186 | [ 0.6875,-1.1908, 0.0000], 187 | [ 1.3750,-2.3816, 0.0000], 188 | [ 0.6875,-3.5724, 0.0000], 189 | [-0.6875,-3.5724, 0.0000], 190 | [-1.3750,-2.3816, 0.0000], 191 | [-0.6875,-1.1908, 0.0000], 192 | [-1.3750, 0.0000, 0.0000], 193 | [-2.7550, 0.0000, 0.0000], 194 | [-0.6875, 1.1908, 0.0000], 195 | [ 0.6875, 1.1908, 0.0000], 196 | [ 1.3750, 0.0000, 0.0000]], 197 | "mass": [12.0107, 
12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 15.9994, 12.0107, 12.0107, 14.0067], 198 | "atoms_to_align": "all"}, 199 | "5-amino-1-3-4-thiadozole": { 200 | "smarts": "[SX2r5]1[CX3r5]=[NX2r5][NX2r5]=[CX3r5]1(N)", 201 | "species": ["S","C","N","N","C","N"], 202 | "coordinates": [ 203 | [ 1.2445, 0.0000, 0.0000], 204 | [ 0.0000, 1.2018, 0.0000], 205 | [-1.2059, 0.6649, 0.0000], 206 | [-1.2059,-0.6649, 0.0000], 207 | [ 0.0000,-1.2018, 0.0000], 208 | [ 0.2869,-2.5516, 0.0000]], 209 | "mass": [32.0650, 12.0107, 14.0067, 14.0067, 12.0107, 14.0067], 210 | "atoms_to_align": "all"}, 211 | "9-methyladenine": { 212 | "smarts": "[#7]c1[nr6]c[nr6]c(N([#6])[CH]=N2)c21", 213 | "species": ["N","C","N","C","N","C","N","C","C","N","C"], 214 | "coordinates": [ 215 | [-1.0691, 2.7106, 0.0000], 216 | [-1.1604, 1.3461, 0.0000], 217 | [-2.2179, 0.5199, 0.0000], 218 | [-2.1823,-0.8127, 0.0000], 219 | [-1.0594,-1.5475, 0.0000], 220 | [-0.0064,-0.7598, 0.0000], 221 | [ 1.2553,-1.1458, 0.0000], 222 | [ 1.7228,-2.5268, 0.0000], 223 | [ 2.0137, 0.0000, 0.0000], 224 | [ 1.2851, 1.1007, 0.0000], 225 | [ 0.0000, 0.6406, 0.0000]], 226 | "mass": [14.0067, 12.0107, 14.0067, 12.0107, 14.0067, 12.0107, 14.0067, 12.0107, 12.0107, 14.0067, 12.0107], 227 | "atoms_to_align": "all"}, 228 | "acetamide": { 229 | "smarts": "[#6]C(=O)N", 230 | "species": ["C","C","O","N"], 231 | "coordinates": [ 232 | [-1.4900, 0.0000, 0.0000], 233 | [ 0.0000, 0.0000, 0.0000], 234 | [ 0.6335,-1.2018, 0.0000], 235 | [ 0.6069, 1.0543, 0.0000]], 236 | "mass": [12.0107, 12.0107, 15.9994, 14.0067], 237 | "atoms_to_align": "all"}, 238 | "acetate_aliphatic": { 239 | "smarts": "C[CX3](=O)[OX2!H]", 240 | "species": ["C","C","O","O"], 241 | "coordinates": [ 242 | [-1.5220, 0.0000, 0.0000], 243 | [ 0.0000, 0.0000, 0.0000], 244 | [ 0.6662,-1.0220, 0.0000], 245 | [ 0.4781, 1.2454, 0.0000]], 246 | "mass": [12.0107, 12.0107, 15.9994, 15.9994], 247 | "atoms_to_align": "all"}, 248 | "acetate_aromatic": { 249 | "smarts": 
"c[CX3](=O)[OX2!H]", 250 | "species": ["C","C","O","O"], 251 | "coordinates": [ 252 | [-1.4800, 0.0000, 0.0000], 253 | [ 0.0000, 0.0000, 0.0000], 254 | [ 0.6662,-1.0220, 0.0000], 255 | [ 0.4781, 1.2454, 0.0000]], 256 | "mass": [12.0107, 12.0107, 15.9994, 15.9994], 257 | "atoms_to_align": "all"}, 258 | "acridin":{ 259 | "smarts": "c1ccc2cc3ccccc3nc2c1", 260 | "species": ["C","C","C","C","C","C","C","C","C","C","C","N","C","C"], 261 | "coordinates": [ 262 | [ 0.6875, 3.5724, 0.0000], 263 | [-0.6875, 3.5724, 0.0000], 264 | [-1.3750, 2.3816, 0.0000], 265 | [-0.6875, 1.1908, 0.0000], 266 | [-1.3750, 0.0000, 0.0000], 267 | [-0.6875,-1.1908, 0.0000], 268 | [-1.3750,-2.3816, 0.0000], 269 | [-0.6875,-3.5724, 0.0000], 270 | [ 0.6875,-3.5724, 0.0000], 271 | [ 1.3750,-2.3816, 0.0000], 272 | [ 0.6875,-1.1908, 0.0000], 273 | [ 1.3750, 0.0000, 0.0000], 274 | [ 0.6875, 1.1908, 0.0000], 275 | [ 1.3750, 2.3816, 0.0000]], 276 | "mass": [12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,14.0067,12.0107,12.0107], 277 | "atoms_to_align": "all"}, 278 | "aniline_NX2": { 279 | "smarts": "[NX2]c1ccccc1", 280 | "species": ["N","C","C","C","C","C","C"], 281 | "coordinates": [ 282 | [ 2.7850, 0.0000, 0.0000], 283 | [ 1.3750, 0.0000, 0.0000], 284 | [ 0.6875, 1.1908, 0.0000], 285 | [-0.6875, 1.1908, 0.0000], 286 | [-1.3750, 0.0000, 0.0000], 287 | [-0.6875,-1.1908, 0.0000], 288 | [ 0.6875,-1.1908, 0.0000]], 289 | "mass": [14.0067, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107], 290 | "atoms_to_align": "all"}, 291 | "aniline_NX3": { 292 | "smarts": "[NX3]c1ccccc1", 293 | "species": ["N","C","C","C","C","C","C"], 294 | "coordinates": [ 295 | [ 2.7850, 0.0000, 0.0000], 296 | [ 1.3750, 0.0000, 0.0000], 297 | [ 0.6875, 1.1908, 0.0000], 298 | [-0.6875, 1.1908, 0.0000], 299 | [-1.3750, 0.0000, 0.0000], 300 | [-0.6875,-1.1908, 0.0000], 301 | [ 0.6875,-1.1908, 0.0000]], 302 | "mass": [14.0067, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107], 303 | 
"atoms_to_align": "all"}, 304 | "anthracene":{ 305 | "smarts": "c1ccc2cc3ccccc3cc2c1", 306 | "species": ["C","C","C","C","C","C","C","C","C","C","C","C","C","C"], 307 | "coordinates": [ 308 | [ 0.6875, 3.5724, 0.0000], 309 | [-0.6875, 3.5724, 0.0000], 310 | [-1.3750, 2.3816, 0.0000], 311 | [-0.6875, 1.1908, 0.0000], 312 | [-1.3750, 0.0000, 0.0000], 313 | [-0.6875,-1.1908, 0.0000], 314 | [-1.3750,-2.3816, 0.0000], 315 | [-0.6875,-3.5724, 0.0000], 316 | [ 0.6875,-3.5724, 0.0000], 317 | [ 1.3750,-2.3816, 0.0000], 318 | [ 0.6875,-1.1908, 0.0000], 319 | [ 1.3750, 0.0000, 0.0000], 320 | [ 0.6875, 1.1908, 0.0000], 321 | [ 1.3750, 2.3816, 0.0000]], 322 | "mass": [12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107,12.0107], 323 | "atoms_to_align": "all"}, 324 | "benzene": { 325 | "smarts": "c1ccccc1", 326 | "species": ["C","C","C","C","C","C"], 327 | "coordinates": [ 328 | [ 1.3750, 0.0000, 0.0000], 329 | [ 0.6875, 1.1908, 0.0000], 330 | [-0.6875, 1.1908, 0.0000], 331 | [-1.3750, 0.0000, 0.0000], 332 | [-0.6875,-1.1908, 0.0000], 333 | [ 0.6875,-1.1908, 0.0000]], 334 | "mass": [12.0107, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107], 335 | "atoms_to_align": "all"}, 336 | "butane": { 337 | "smarts": "[#6]CC[#6]", 338 | "species": ["C","C","C","C"], 339 | "coordinates": [ 340 | [ 1.8875, 0.6192, 0.0000], 341 | [ 0.3575, 0.6192, 0.0000], 342 | [-0.3575,-0.6192, 0.0000], 343 | [-1.8875,-0.6192, 0.0000]], 344 | "mass": [12.0107,12.0107,12.0107,12.0107], 345 | "atoms_to_align": "all"}, 346 | "carboxylic_acid": { 347 | "smarts": "[#6][CX3](=O)[OH]", 348 | "species": ["C","C","O","O"], 349 | "coordinates": [ 350 | [-1.5220, 0.0000, 0.0000], 351 | [ 0.0000, 0.0000, 0.0000], 352 | [ 0.6998,-0.9994, 0.0000], 353 | [ 0.4903, 1.3113, 0.0000]], 354 | "mass": [12.0107, 12.0107, 15.9994, 15.9994], 355 | "atoms_to_align": "all"}, 356 | "cis-tetrahydrofuran-3,4-diol": { 357 | "smarts": "O1CC([OH])C([OH])C1", 358 | "species": 
["O","C","C","O","C","O","C"], 359 | "coordinates": [ 360 | [-0.7840,-0.0022,-0.9770], 361 | [-0.1190,-1.1740,-0.5270], 362 | [ 0.5940,-0.8120, 0.7840], 363 | [-0.2960,-0.9570, 1.8640], 364 | [ 0.8420, 0.6910, 0.5760], 365 | [ 1.0930, 1.4200, 1.7580], 366 | [-0.4090, 1.1330,-0.1860]], 367 | "mass": [15.9994, 12.0107, 12.0107, 15.9994, 12.0107, 15.9994, 12.0107], 368 | "atoms_to_align": "all"}, 369 | "dimethyl-sulfone": { 370 | "smarts": "[#6]S(=O)(=O)[#6]", 371 | "species": ["C","S","O","O","C"], 372 | "coordinates": [ 373 | [-1.3445, 0.0000,-1.0887], 374 | [ 0.0000, 0.0000, 0.0000], 375 | [ 1.3445, 0.0000,-1.0887], 376 | [ 0.0000, 1.2533, 0.7091], 377 | [ 0.0000,-1.2533, 0.7091]], 378 | "mass": [12.0107, 32.0650, 15.9994, 15.9994, 12.0107], 379 | "atoms_to_align": [0,1,4]}, 380 | "ester_aliphatic-aliphatic": { 381 | "smarts": "C[!R;O]C(=O)C", 382 | "species": ["C","O","C","O","C"], 383 | "coordinates": [ 384 | [ 1.8642, 1.2513, 0.0000], 385 | [ 0.4343, 1.2613, 0.0000], 386 | [ 0.0000, 0.0000, 0.0000], 387 | [ 0.6998,-0.9994, 0.0000], 388 | [-1.5220, 0.0000, 0.0000]], 389 | "mass": [12.0107, 15.9994, 12.0107, 15.9994, 12.0107], 390 | "atoms_to_align": "all"}, 391 | "ester_aliphatic-aromatic": { 392 | "smarts": "C[!R;O]C(=O)c", 393 | "species": ["C","O","C","O","C"], 394 | "coordinates": [ 395 | [ 1.8642, 1.2513, 0.0000], 396 | [ 0.4343, 1.2613, 0.0000], 397 | [ 0.0000, 0.0000, 0.0000], 398 | [ 0.6998,-0.9994, 0.0000], 399 | [-1.5220, 0.0000, 0.0000]], 400 | "mass": [12.0107, 15.9994, 12.0107, 15.9994, 12.0107], 401 | "atoms_to_align": "all"}, 402 | "ester_aromatic-aliphatic": { 403 | "smarts": "c[!R;O]C(=O)C", 404 | "species": ["C","O","C","O","C"], 405 | "coordinates": [ 406 | [ 1.8642, 1.2513, 0.0000], 407 | [ 0.4343, 1.2613, 0.0000], 408 | [ 0.0000, 0.0000, 0.0000], 409 | [ 0.6998,-0.9994, 0.0000], 410 | [-1.5220, 0.0000, 0.0000]], 411 | "mass": [12.0107, 15.9994, 12.0107, 15.9994, 12.0107], 412 | "atoms_to_align": "all"}, 413 | "ester_aromatic-aromatic": { 414 
| "smarts": "c[!R;O]C(=O)c", 415 | "species": ["C","O","C","O","C"], 416 | "coordinates": [ 417 | [ 1.8642, 1.2513, 0.0000], 418 | [ 0.4343, 1.2613, 0.0000], 419 | [ 0.0000, 0.0000, 0.0000], 420 | [ 0.6998,-0.9994, 0.0000], 421 | [-1.5220, 0.0000, 0.0000]], 422 | "mass": [12.0107, 15.9994, 12.0107, 15.9994, 12.0107], 423 | "atoms_to_align": "all"}, 424 | "ethanol": { 425 | "smarts": "[#6][CH2][OH]", 426 | "species": ["C","C"], 427 | "coordinates": [ 428 | [-1.5280, 0.0000, 0.0000], 429 | [ 0.0000, 0.0000, 0.0000], 430 | [ 0.4880, 1.3335, 0.0000]], 431 | "mass": [12.0107, 12.0107, 15.9994], 432 | "atoms_to_align": "all"}, 433 | "ethoxide": { 434 | "smarts": "[OX2][CH2][CH3]", 435 | "species": ["O","C","C"], 436 | "coordinates": [ 437 | [-1.3300, 0.0000, 0.0000], 438 | [ 0.0000, 0.0000, 0.0000], 439 | [ 0.8097, 1.2958, 0.0000]], 440 | "mass": [15.9994, 12.0107, 12.0107], 441 | "atoms_to_align": "all"}, 442 | "ethyl": { 443 | "smarts": "[#6][CH2][CH3]", 444 | "species": ["C","C","C"], 445 | "coordinates": [ 446 | [-1.5280, 0.0000, 0.0000], 447 | [ 0.0000, 0.0000, 0.0000], 448 | [ 0.8544, 1.2668, 0.0000]], 449 | "mass": [12.0107, 12.0107, 12.0107], 450 | "atoms_to_align": "all"}, 451 | "ethylamine": { 452 | "smarts": "[NX3][CH2][CH3]", 453 | "species": ["N","C","C"], 454 | "coordinates": [ 455 | [-1.4380, 0.0000, 0.0000], 456 | [ 0.0000, 0.0000, 0.0000], 457 | [ 0.5879, 1.4104, 0.0000]], 458 | "mass": [14.0067, 12.0107, 12.0107], 459 | "atoms_to_align": "all"}, 460 | "methanesulfonamide": { 461 | "smarts": "[#6]S(=O)(=O)[NH2]", 462 | "species": ["C","S","O","O","N"], 463 | "coordinates": [ 464 | [-1.7300, 0.0000, 0.0000], 465 | [ 0.0000, 0.0000, 0.0000], 466 | [ 0.3778,-1.2731,-0.5567], 467 | [ 0.3778, 1.2731,-0.5567], 468 | [ 0.4408, 0.0000, 1.6211]], 469 | "mass": [12.0107, 32.0650, 15.9994, 15.9994, 14.0067], 470 | "atoms_to_align": [0,1,4]}, 471 | "methyl_ethyl_ether_L_aliphatic": { 472 | "smarts": "[#6]OC[CH3]", 473 | "species": ["C","O","C","C"], 474 | 
"coordinates": [ 475 | [-1.5200, 0.0000, 0.0000], 476 | [ 0.0000, 0.0000, 0.0000], 477 | [ 0.5186, 1.3165, 0.0000], 478 | [ 1.9329, 1.2721, 0.0000]], 479 | "mass": [12.0107, 15.9994, 12.0107, 12.0107], 480 | "atoms_to_align": "all"}, 481 | "methyl_ethyl_ether_L_aromatic": { 482 | "smarts": "cOC[CH3]", 483 | "species": ["C","O","C","C"], 484 | "coordinates": [ 485 | [-1.5200, 0.0000, 0.0000], 486 | [ 0.0000, 0.0000, 0.0000], 487 | [ 0.5186, 1.3165, 0.0000], 488 | [ 1.9329, 1.2721, 0.0000]], 489 | "mass": [12.0107, 15.9994, 12.0107, 12.0107], 490 | "atoms_to_align": "all"}, 491 | "methyl_ethyl_ether_R_aliphatic": { 492 | "smarts": "[CH3]OC[#6]", 493 | "species": ["C","O","C","C"], 494 | "coordinates": [ 495 | [-1.5200, 0.0000, 0.0000], 496 | [ 0.0000, 0.0000, 0.0000], 497 | [ 0.5186, 1.3165, 0.0000], 498 | [ 1.9329, 1.2721, 0.0000]], 499 | "mass": [12.0107, 15.9994, 12.0107, 12.0107], 500 | "atoms_to_align": "all"}, 501 | "methyl_ethyl_ether_R_aromatic": { 502 | "smarts": "[CH3]OCc", 503 | "species": ["C","O","C","C"], 504 | "coordinates": [ 505 | [-1.5200, 0.0000, 0.0000], 506 | [ 0.0000, 0.0000, 0.0000], 507 | [ 0.5186, 1.3165, 0.0000], 508 | [ 1.9329, 1.2721, 0.0000]], 509 | "mass": [12.0107, 15.9994, 12.0107, 12.0107], 510 | "atoms_to_align": "all"}, 511 | "nitromethane": { 512 | "smarts": "[#6]N(=O)=O", 513 | "species": ["C","N","O","O"], 514 | "coordinates": [ 515 | [-1.4460, 0.0000, 0.0000], 516 | [ 0.0000, 0.0000, 0.0000], 517 | [ 0.6150, 1.0650, 0.0000], 518 | [ 0.6150,-1.0650, 0.0000]], 519 | "mass": [12.0107,14.0067,15.9994,15.9994], 520 | "atoms_to_align": "all"}, 521 | "pyrazole": { 522 | "smarts": "[#6r5]1=[#6r5][Nr5][Nr5]=[#6r5]1", 523 | "species": ["C","C","N","N","C"], 524 | "coordinates": [ 525 | [ 0.8185, 0.0000, 0.0000], 526 | [ 0.0000, 1.0861, 0.0000], 527 | [-1.2883, 0.5916, 0.0000], 528 | [-1.2848,-0.7446, 0.0000], 529 | [-0.0240,-1.1181, 0.0000]], 530 | "mass": [12.0107, 12.0107, 14.0067, 14.0067, 12.0107], 531 | "atoms_to_align": "all"}, 532 
| "pyridine": { 533 | "smarts": "[nr6]1ccccc1", 534 | "species": ["N","C","C","C","C","C"], 535 | "coordinates": [ 536 | [ 1.3750, 0.0000, 0.0000], 537 | [ 0.6875, 1.1908, 0.0000], 538 | [-0.6875, 1.1908, 0.0000], 539 | [-1.3750, 0.0000, 0.0000], 540 | [-0.6875,-1.1908, 0.0000], 541 | [ 0.6875,-1.1908, 0.0000]], 542 | "mass": [14.0067, 12.0107, 12.0107, 12.0107, 12.0107, 12.0107], 543 | "atoms_to_align": "all"}, 544 | "thioacetamide": { 545 | "smarts": "[#6]C(=S)[NH2]", 546 | "species": ["C","C","S","N"], 547 | "coordinates": [ 548 | [-1.4750, 0.0000, 0.0000], 549 | [ 0.0000, 0.0000, 0.0000], 550 | [ 0.7707, 1.2577, 0.0000], 551 | [ 0.5522,-1.2264, 0.0000]], 552 | "mass": [12.0107, 12.0107, 32.0650, 14.0067], 553 | "atoms_to_align": "all"} 554 | } -------------------------------------------------------------------------------- /source_data/space_group_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "P1": { 3 | "name_HM": "P 1", 4 | "name_Hall": "P 1", 5 | "number": 1, 6 | "crystal_system": "triclinic", 7 | "z_crystal": 1, 8 | "symmetry_operations": ["x, y, z"] 9 | }, 10 | "P-1": { 11 | "name_HM": "P-1", 12 | "name_Hall": "-P 1", 13 | "number": 2, 14 | "crystal_system": "triclinic", 15 | "z_crystal": 2, 16 | "symmetry_operations": ["x, y, z", 17 | "1.0-x, 1.0-y, 1.0-z"] 18 | }, 19 | "P21": { 20 | "name_HM": "P 1 21 1", 21 | "name_Hall": "P 2yb", 22 | "number": 4, 23 | "crystal_system": "monoclinic", 24 | "z_crystal": 2, 25 | "symmetry_operations": ["x, y, z", 26 | "1.0-x, 0.5+y, 1.0-z"] 27 | }, 28 | "C2": { 29 | "name_HM": "C 1 2 1", 30 | "name_Hall": "C 2y", 31 | "number": 5, 32 | "crystal_system": "monoclinic", 33 | "z_crystal": 4, 34 | "symmetry_operations": ["x, y, z", 35 | "1.0-x, y, 1.0-z", 36 | "0.5+x, 0.5+y, z", 37 | "0.5-x, 0.5+y, 1.0-z"] 38 | }, 39 | "Pc": { 40 | "name_HM": "P 1 c 1", 41 | "name_Hall": "P -2yc", 42 | "number": 7, 43 | "crystal_system": "monoclinic", 44 | "z_crystal": 2, 45 | 
"symmetry_operations": ["x, y, z", 46 | "x, 1.0-y, 0.5+z"] 47 | }, 48 | "Cc": { 49 | "name_HM": "C 1 c 1", 50 | "name_Hall": "C -2yc", 51 | "number": 9, 52 | "crystal_system": "monoclinic", 53 | "z_crystal": 4, 54 | "symmetry_operations": ["x, y, z", 55 | "x, 1.0-y, 0.5+z", 56 | "0.5+x, 0.5+y, z", 57 | "0.5+x, 0.5-y, 0.5+z"] 58 | }, 59 | "P21/m": { 60 | "name_HM": "P 1 21/m 1", 61 | "name_Hall": "-P 2yb", 62 | "number": 11, 63 | "crystal_system": "monoclinic", 64 | "z_crystal": 4, 65 | "symmetry_operations": ["x, y, z", 66 | "1.0-x, 0.5+y, 1.0-z", 67 | "1.0-x, 1.0-y, 1.0-z", 68 | "x, 0.5-y, z"] 69 | }, 70 | "C2/m": { 71 | "name_HM": "C 1 2/m 1", 72 | "name_Hall": "-C 2y", 73 | "number": 12, 74 | "crystal_system": "monoclinic", 75 | "z_crystal": 8, 76 | "symmetry_operations": ["x, y, z", 77 | "1.0-x, y, 1.0-z", 78 | "1.0-x, 1.0-y, 1.0-z", 79 | "x, 1.0-y, z", 80 | "0.5+x, 0.5+y, z", 81 | "0.5-x, 0.5+y, 1.0-z", 82 | "0.5-x, 0.5-y, 1.0-z", 83 | "0.5+x, 0.5-y, z"] 84 | }, 85 | "P2/c": { 86 | "name_HM": "P 1 2/c 1", 87 | "name_Hall": "-P 2yc", 88 | "number": 13, 89 | "crystal_system": "monoclinic", 90 | "z_crystal": 4, 91 | "symmetry_operations": ["x, y, z", 92 | "1.0-x, y, 0.5-z", 93 | "1.0-x, 1.0-y, 1.0-z", 94 | "x, 1.0-y, 0.5+z"] 95 | }, 96 | "P21/c": { 97 | "name_HM": "P 1 21/c 1", 98 | "name_Hall": "-P 2ybc", 99 | "number": 14, 100 | "crystal_system": "monoclinic", 101 | "z_crystal": 4, 102 | "symmetry_operations": ["x, y, z", 103 | "1.0-x, 0.5+y, 0.5-z", 104 | "1.0-x, 1.0-y, 1.0-z", 105 | "x, 0.5-y, 0.5+z"] 106 | }, 107 | "P21/n": { 108 | "name_HM": "P 1 21/n 1", 109 | "name_Hall": "-P 2yn", 110 | "number": 14, 111 | "crystal_system": "monoclinic", 112 | "z_crystal": 4, 113 | "symmetry_operations": ["x, y, z", 114 | "0.5-x, 0.5+y, 0.5-z", 115 | "1.0-x, 1.0-y, 1.0-z", 116 | "0.5+x, 0.5-y, 0.5+z"] 117 | }, 118 | "C2/c": { 119 | "name_HM": "C 1 2/c 1", 120 | "name_Hall": "-c 2yc", 121 | "number": 15, 122 | "crystal_system": "monoclinic", 123 | "z_crystal": 8, 124 | 
"symmetry_operations": ["x, y, z", 125 | "1.0-x, y, 0.5-z", 126 | "1.0-x, 1.0-y, 1.0-z", 127 | "x, 1.0-y, 0.5+z", 128 | "0.5+x, 0.5+y, z", 129 | "0.5-x, 0.5+y, 0.5-z", 130 | "0.5-x, 0.5-y, 1.0-z", 131 | "0.5+x, 0.5-y, 0.5+z"] 132 | }, 133 | "P21212": { 134 | "name_HM": "P 21 21 2", 135 | "name_Hall": "P 2 2ab", 136 | "number": 18, 137 | "crystal_system": "orthorhombic", 138 | "z_crystal": 4, 139 | "symmetry_operations": ["x, y, z", 140 | "0.5+x, 0.5-y, 1.0-z", 141 | "0.5-x, 0.5+y, 1.0-z", 142 | "1.0-x, 1.0-y, z"] 143 | }, 144 | "P212121": { 145 | "name_HM": "P 21 21 21", 146 | "name_Hall": "P 2ac 2ab", 147 | "number": 19, 148 | "crystal_system": "orthorhombic", 149 | "z_crystal": 4, 150 | "symmetry_operations": ["x, y, z", 151 | "0.5+x, 0.5-y, 1.0-z", 152 | "1.0-x, 0.5+y, 0.5-z", 153 | "0.5-x, 1.0-y, 0.5+z"] 154 | }, 155 | "Pca21": { 156 | "name_HM": "P c a 21", 157 | "name_Hall": "P 2c -2ac", 158 | "number": 29, 159 | "crystal_system": "orthorhombic", 160 | "z_crystal": 4, 161 | "symmetry_operations": ["x, y, z", 162 | "0.5-x, y, 0.5+z", 163 | "0.5+x, 1.0-y, z", 164 | "1.0-x, 1.0-y, 0.5+z"] 165 | }, 166 | "Pna21": { 167 | "name_HM": "P n a 21", 168 | "name_Hall": "P 2c -2n", 169 | "number": 33, 170 | "crystal_system": "orthorhombic", 171 | "z_crystal": 4, 172 | "symmetry_operations": ["x, y, z", 173 | "0.5-x, 0.5+y, 0.5+z", 174 | "0.5+x, 0.5-y, z", 175 | "1.0-x, 1.0-y, 0.5+z"] 176 | }, 177 | "Pbcn": { 178 | "name_HM": "P b c n", 179 | "name_Hall": "-P 2n 2ab", 180 | "number": 60, 181 | "crystal_system": "orthorhombic", 182 | "z_crystal": 8, 183 | "symmetry_operations": ["x, y, z", 184 | "0.5-x, 0.5+y, z", 185 | "x, 1.0-y, 0.5+z", 186 | "0.5+x, 0.5+y, 0.5-z", 187 | "1.0-x, 1.0-y, 1.0-z", 188 | "0.5+x, 0.5-y, 1.0-z", 189 | "1.0-x, y, 0.5-z", 190 | "0.5-x, 0.5-y, 0.5+z"] 191 | }, 192 | "Pbca": { 193 | "name_HM": "P b c a", 194 | "name_Hall": "-P 2ac 2ab", 195 | "number": 61, 196 | "crystal_system": "orthorhombic", 197 | "z_crystal": 8, 198 | "symmetry_operations": 
["x, y, z", 199 | "0.5-x, 0.5+y, z", 200 | "x, 0.5-y, 0.5+z", 201 | "0.5+x, y, 0.5-z", 202 | "1.0-x, 1.0-y, 1.0-z", 203 | "0.5+x, 0.5-y, 1.0-z", 204 | "1.0-x, 0.5+y, 0.5-z", 205 | "0.5-x, 1.0-y, 0.5+z"] 206 | }, 207 | "Pnma": { 208 | "name_HM": "P n m a", 209 | "name_Hall": "-P 2ac 2n", 210 | "number": 62, 211 | "crystal_system": "orthorhombic", 212 | "z_crystal": 8, 213 | "symmetry_operations": ["x, y, z", 214 | "0.5-x, 0.5+y, 0.5+z", 215 | "x, 0.5-y, z", 216 | "0.5+x, y, 0.5-z", 217 | "1.0-x, 1.0-y, 1.0-z", 218 | "0.5+x, 0.5-y, 0.5-z", 219 | "1.0-x, 0.5+y, 1.0-z", 220 | "0.5-x, 1.0-y, 0.5+z"] 221 | }, 222 | "I41/a": { 223 | "name_HM": "I 41/a", 224 | "name_Hall": "I 4bw -1bw", 225 | "number": 88, 226 | "crystal_system": "tetragonal", 227 | "z_crystal": 16, 228 | "symmetry_operations": ["x, y, z", 229 | "1.0-x, 1.0-y, z", 230 | "1.0-y, 0.5+x, 0.25+z", 231 | "y, 0.5-x, 0.25+z", 232 | "1.0-x, 0.5-y, 0.25-z", 233 | "x, 0.5+y, 0.25-z", 234 | "y, 1.0-x, 1.0-z", 235 | "1.0-y, x, 1.0-z", 236 | "0.5+x, y, z", 237 | "0.5-x, 0.5-y, 0.5+z", 238 | "0.5-y, x, 0.75+z", 239 | "0.5+y, 1.0-x, 0.75+z", 240 | "0.5-x, 1.0-y, 0.75-z", 241 | "0.5+x, y, 0.75-z", 242 | "0.5+y, 0.5-x, 0.5-z", 243 | "0.5-y, 0.5+x, 0.5-z"] 244 | }, 245 | "R-3": { 246 | "name_HM": "R -3", 247 | "name_Hall": "-R 3", 248 | "number": 148, 249 | "crystal_system": "trigonal", 250 | "z_crystal": 18, 251 | "symmetry_operations": ["x, y, z", 252 | "1.0-y, x-y, z", 253 | "-x+y, 1.0-x, z", 254 | "1.0-x, 1.0-y, 1.0-z", 255 | "y, -x+y, 1.0-z", 256 | "x-y, x, 1.0-z", 257 | "2/3+x, 1/3+y, 1/3+z", 258 | "2/3-y, 1/3+x-y, 1/3+z", 259 | "2/3-x+y, 1/3-x, 1/3+z", 260 | "2/3-x, 1/3-y, 1/3-z", 261 | "2/3+y, 1/3-x+y, 1/3-z", 262 | "2/3+x-y, 1/3+x, 1/3-z", 263 | "1/3+x, 2/3+y, 2/3+z", 264 | "1/3-y, 2/3+x-y, 2/3+z", 265 | "1/3-x+y, 2/3+x, 2/3+z", 266 | "1/3-x, 2/3-y, 2/3+z", 267 | "1/3+y, 2/3-x+y, 2/3+z", 268 | "1/3+x-y, 2/3+x, 2/3+z"] 269 | } 270 | } 271 | 
--------------------------------------------------------------------------------