├── .gitignore ├── LICENCE ├── MANIFEST.in ├── README.md ├── examples ├── data │ └── desc.npy ├── img │ └── dpeva-workflow.png └── sampling │ ├── 2-direct.ipynb │ └── direct.ipynb ├── pyproject.toml ├── requirements.txt ├── src └── dpeva │ ├── __init__.py │ ├── sampling │ ├── __init__.py │ ├── clustering.py │ ├── direct.py │ ├── pca.py │ └── stratified_sampling.py │ └── uncertain │ └── __init__.py └── utils ├── dpdata_update.py └── dpdescriptor ├── calc_desc.py ├── desc_all.sh ├── gen_desc.py ├── gen_desc_npy.py ├── gen_desc_stru.py ├── pca_desc_stru.py └── tsne_cuml_stru.py /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | src/dpeva.egg-info 3 | src/dpeva/*/*pycache* 4 | src/*/*pycache* 5 | src/*pycache* 6 | examples/uncertain/models -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 

   e) Provide Installation Information, but only if you would otherwise
   be required to provide such information under section 6 of the
   GNU GPL, and only to the extent that such information is
   necessary to install and execute a modified version of the
   Combined Work produced by recombining or relinking the
   Application with a modified version of the Linked Version. (If
   you use option 4d0, the Installation Information must accompany
   the Minimal Corresponding Source and Corresponding Application
   Code. If you use option 4d1, you must provide the Installation
   Information in the manner specified by section 6 of the GNU GPL
   for conveying Corresponding Source.)

  5. Combined Libraries.

  You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:

   a) Accompany the combined library with a copy of the same work based
   on the Library, uncombined with any other library facilities,
   conveyed under the terms of this License.

   b) Give prominent notice with the combined library that part of it
   is a work based on the Library, and explaining where to find the
   accompanying uncombined form of the same work.

  6. Revised Versions of the GNU Lesser General Public License.

  The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.

  Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.

  If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENCE
include README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DP-EVA
Deep Potential EVolution Accelerator

## Target:
Using a **single-model** concurrent learning method to accelerate the evolution of deep potentials (and other machine learning interatomic potentials).

Methods used in this project:
- Data sampling based on encoder (descriptor) space (see the sketch below this list):
  - DIRECT (from the [maml](https://github.com/materialsvirtuallab/maml) package)
  - 2-DIRECT and atomic-DIRECT (usage shown in the notebooks)
- Uncertainty estimation on atomic force evaluation, in two variants:
  - Query-by-committee uncertainty
  - Random-Network-Distillation-like uncertainty
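
A minimal sketch of DIRECT sub-sampling from a precomputed descriptor array. The file path, cluster count, and `k` are illustrative, and `examples/data/desc.npy` is assumed here to hold one descriptor vector per structure (shape `(n_structures, n_features)`):

```python
import numpy as np
from dpeva.sampling.direct import DIRECTSampler
from dpeva.sampling.clustering import BirchClustering
from dpeva.sampling.stratified_sampling import SelectKFromClusters

# Precomputed per-structure descriptors, shape (n_structures, n_features).
desc = np.load("examples/data/desc.npy")

sampler = DIRECTSampler(
    structure_encoder=None,                        # descriptors are already computed
    clustering=BirchClustering(n=100),             # target number of clusters (illustrative)
    select_k_from_clusters=SelectKFromClusters(k=1),
)
result = sampler.fit_transform(desc)
selected = result["selected_indices"]
print(f"DIRECT selected {len(selected)} of {len(desc)} structures")
```

See the notebooks under `examples/sampling` for the 2-DIRECT and atomic-DIRECT variants.
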
## Installation:

Note: the `src` library currently implements only the DIRECT method.

Install the package via pip:
```bash
pip install git+https://github.com/quantummisaka/dpeva.git
```

Or clone the repository and install the package:
```bash
pip install .
```

## Usage
See the `examples` directory for usage examples.

More notebooks and scripts will be added soon.

## Notice

This project is still under development. Please feel free to open an issue or pull request if you have any suggestions or questions.

## License
This project is licensed under the LGPL-v3 License - see the [LICENCE](LICENCE) file for details.
--------------------------------------------------------------------------------
/examples/data/desc.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuantumMisaka/dpeva/04ffae02cfcd5e24d77325061ea2e1eee0bb8591/examples/data/desc.npy
--------------------------------------------------------------------------------
/examples/img/dpeva-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QuantumMisaka/dpeva/04ffae02cfcd5e24d77325061ea2e1eee0bb8591/examples/img/dpeva-workflow.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "dpeva"
version = "0.2.0-alpha"
description = "A package for Deep Potential EVolution Accelerator (DP-EVA)"
readme = "README.md"
authors = [
    { name = "James Misaka", email = "ff6757442@gmail.com" }
]
license = { file = "LICENCE" }
classifiers = [
    "Natural Language :: English",
    "Operating System :: POSIX :: Linux",
    "Programming Language :: Python :: 3 :: Only",
    "Intended Audience :: Science/Research",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Physics",
    "Topic :: Scientific/Engineering :: Chemistry",
    "Environment :: Console",
]
dependencies = [
    "numpy",
    "scikit-learn",
    "torch",
    "dpdata",
    "ase",
    "matplotlib",
    "seaborn",
]
requires-python = ">=3.8"

[project.urls]
Homepage = "https://github.com/QuantumMisaka/dpeva"

[tool.setuptools.packages.find]
where = ["src"]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy
scikit-learn
dpdata
ase
5 | matplotlib 6 | seaborn -------------------------------------------------------------------------------- /src/dpeva/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | DPEVA: Deep Potential EVolution Accelerator 4 | """ 5 | 6 | from __future__ import annotations 7 | from importlib.metadata import PackageNotFoundError, metadata # version 8 | 9 | package_metadata = metadata("dpeva") 10 | 11 | try: 12 | __version__ = package_metadata.get("version") 13 | except PackageNotFoundError: 14 | __version__ = "0.2.0-alpha" 15 | 16 | __author__ = "James Misaka" 17 | 18 | # from .sampling import clustering, direct, pca, stratified_sampling 19 | # from .uncertain import rnd, rnd_models 20 | 21 | print(f"Initializing DP-EVA version {__version__}") -------------------------------------------------------------------------------- /src/dpeva/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | """Package implementing direct sampling methods, copyed by maml package in 2025-01-09.""" 2 | 3 | if __name__ == '__main__': 4 | ... 5 | 6 | -------------------------------------------------------------------------------- /src/dpeva/sampling/clustering.py: -------------------------------------------------------------------------------- 1 | """Clustering methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | from sklearn.base import BaseEstimator, TransformerMixin 8 | from sklearn.cluster import Birch 9 | 10 | logging.basicConfig( 11 | format='%(asctime)s - %(levelname)s - %(message)s', 12 | ) 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | class BirchClustering(BaseEstimator, TransformerMixin): 18 | """Birch Clustering as one step of the DIRECT pipeline.""" 19 | 20 | def __init__(self, n=None, threshold_init=0.5, **kwargs): 21 | """ 22 | Args: 23 | n: Clustering the PCs into n clusters. When n is None, the number of clusters 24 | is dependent on threshold_init and other kwargs, and the final 25 | (global) clustering step is skipped. Default to None. 26 | threshold_init: The initial radius of the subcluster obtained by merging 27 | a new sample and the closest subcluster should be lesser than 28 | the threshold. Otherwise, a new subcluster is started. See details in: 29 | https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html. 30 | Users may tune this value for desired performance of birch, while 0.5 31 | is generally a good starting point, and some automatic tuning is done 32 | with our built-in codes to achieve n clusters if given. 33 | **kwargs: Pass to BIRCH. 34 | """ 35 | self.n = n 36 | self.threshold_init = threshold_init 37 | self.kwargs = kwargs 38 | 39 | def fit(self, X, y=None): 40 | """ 41 | Place holder for fit API. 42 | 43 | Args: 44 | X: Any inputs 45 | y: Any outputs 46 | 47 | Returns: self 48 | """ 49 | return self 50 | 51 | def transform(self, PCAfeatures): 52 | """ 53 | Perform Birch Clustering to an array of input PCA features. 54 | 55 | Args: 56 | PCAfeatures: An array of PCA features. 57 | 58 | Returns: 59 | A dict of Birch Clustering results, including labels of each 60 | PCA feature, centroid positions of each cluster in PCA feature s 61 | pace, and the array of input PCA features. 
62 | """ 63 | model = Birch(n_clusters=self.n, threshold=self.threshold_init, **self.kwargs).fit(PCAfeatures) 64 | if self.n is not None: 65 | while ( 66 | len(set(model.subcluster_labels_)) < self.n 67 | ): # decrease threshold until desired n clusters is achieved 68 | logger.info( 69 | f"BirchClustering with threshold_init={self.threshold_init} and n={self.n} " 70 | f"gives {len(set(model.subcluster_labels_))} clusters.", 71 | ) 72 | self.threshold_init = self.threshold_init / self.n * len(set(model.subcluster_labels_)) 73 | model = Birch(n_clusters=self.n, threshold=self.threshold_init, **self.kwargs).fit(PCAfeatures) 74 | 75 | labels = model.predict(PCAfeatures) 76 | self.model = model 77 | logger.info( 78 | f"BirchClustering with threshold_init={self.threshold_init} and n={self.n} " 79 | f"gives {len(set(model.subcluster_labels_))} clusters.", 80 | ) 81 | label_centers = dict(zip(model.subcluster_labels_, model.subcluster_centers_, strict=False)) 82 | return { 83 | "labels": labels, 84 | "label_centers": label_centers, 85 | "PCAfeatures": PCAfeatures, 86 | } 87 | -------------------------------------------------------------------------------- /src/dpeva/sampling/direct.py: -------------------------------------------------------------------------------- 1 | """DIRECT sampling.""" 2 | 3 | from __future__ import annotations 4 | 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | from .clustering import BirchClustering 9 | from .pca import PrincipalComponentAnalysis 10 | from .stratified_sampling import SelectKFromClusters 11 | 12 | 13 | class DIRECTSampler(Pipeline): 14 | """ 15 | DImensionality REduction-Clustering-sTratified (DIRECT) 16 | sampling Pipeline. For more details, please refer to our 17 | manuscript: https://arxiv.org/abs/2307.13710. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | structure_encoder=None, 23 | scaler="StandardScaler", 24 | pca="PrincipalComponentAnalysis", 25 | weighting_PCs=True, 26 | clustering="Birch", 27 | select_k_from_clusters="select_k_from_clusters", 28 | ): 29 | """ 30 | Args: 31 | structure_encoder: Structure featurizer. It can be any encoder 32 | that takes in a list of N structures and returns a 2-D array 33 | of N*D features, where D is the fixed dimensionality of the 34 | feature vector. By default, the M3GNet formation energy model 35 | is used. Set this to False to skip the encoding step when needed. 36 | scaler: StandardScaler to perform normalization before PCA. 37 | pca: PCA for dimensionality reduction. 38 | weighting_PCs: Whether to weight PC with their explained variance. 39 | clustering: Clustering method to clustering based on PCs. 40 | select_k_from_clusters: Straitified sampling of k structures from 41 | each cluster. 
42 | """ 43 | self.structure_encoder = structure_encoder 44 | self.scaler = StandardScaler() if scaler == "StandardScaler" else scaler 45 | self.pca = ( 46 | PrincipalComponentAnalysis(weighting_PCs=weighting_PCs) if pca == "PrincipalComponentAnalysis" else pca 47 | ) 48 | self.weighting_PCs = weighting_PCs 49 | self.clustering = BirchClustering() if clustering == "Birch" else clustering 50 | self.select_k_from_clusters = ( 51 | SelectKFromClusters() if select_k_from_clusters == "select_k_from_clusters" else select_k_from_clusters 52 | ) 53 | steps = [ 54 | (i.__class__.__name__, i) 55 | for i in [ 56 | self.structure_encoder, 57 | self.scaler, 58 | self.pca, 59 | self.clustering, 60 | self.select_k_from_clusters, 61 | ] 62 | if i 63 | ] 64 | super().__init__(steps) 65 | -------------------------------------------------------------------------------- /src/dpeva/sampling/pca.py: -------------------------------------------------------------------------------- 1 | """This file contains the PCA implementation.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | 7 | from sklearn.base import BaseEstimator, TransformerMixin 8 | from sklearn.decomposition import PCA 9 | 10 | logging.basicConfig( 11 | format='%(asctime)s - %(levelname)s - %(message)s', 12 | ) 13 | logger = logging.getLogger(__name__) 14 | logger.setLevel(logging.INFO) 15 | 16 | class PrincipalComponentAnalysis(BaseEstimator, TransformerMixin): 17 | """Wrap around PCA in scikit-learn to support weighting PCs.""" 18 | 19 | def __init__(self, weighting_PCs=True): 20 | """ 21 | Args: 22 | weighting_PCs: Whether to weight PCs with explained variances. 23 | """ 24 | self.pca = PCA() 25 | self.weighting_PCs = weighting_PCs 26 | 27 | def fit(self, normalized_features): 28 | """ 29 | Fit PCA with normalized features as input. 30 | 31 | Args: 32 | normalized_features: An array of normalized features with fixed 33 | dimensionality. The features have to be normalized first 34 | beforehand. 35 | 36 | Returns: self 37 | """ 38 | self.pca.fit(normalized_features) 39 | return self 40 | 41 | def transform(self, normalized_features): 42 | """ 43 | Transform normalized features into weighted or unweighted PCA features 44 | and select the first m PCs according to Kaiser's rule. 45 | 46 | Args: 47 | normalized_features: An array of normalized features with fixed 48 | dimensionality. The features have to be normalized first 49 | beforehand. 50 | 51 | Returns: 52 | An array of weighted or unweighted PCA feature with dimension of 53 | N x m, where N and m are the numbers of input normalized features 54 | and selected PCs according to Kaiser's rule, respectively. 
55 | """ 56 | m = len([e for e in self.pca.explained_variance_ if e > 1]) 57 | explained_variance = self.pca.explained_variance_ratio_ 58 | logger.info(f"Selected first {m} PCs, explaining {100 * sum(explained_variance[:m]):.2f}% variance") 59 | if not self.weighting_PCs: 60 | return self.pca.transform(normalized_features)[:, :m] 61 | return self.pca.transform(normalized_features)[:, :m] * explained_variance[:m] 62 | -------------------------------------------------------------------------------- /src/dpeva/sampling/stratified_sampling.py: -------------------------------------------------------------------------------- 1 | """Implementation of stratefied sampling approaches.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import warnings 7 | 8 | import numpy as np 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | 11 | logging.basicConfig( 12 | format='%(asctime)s - %(levelname)s - %(message)s', 13 | ) 14 | logger = logging.getLogger(__name__) 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | class SelectKFromClusters(BaseEstimator, TransformerMixin): 19 | """Wrapper around selection of k data from each cluster.""" 20 | 21 | def __init__( 22 | self, 23 | k: int = 1, 24 | allow_duplicate=False, 25 | selection_criteria="center", 26 | n_sites=None, 27 | ): 28 | """ 29 | Args: 30 | k: Select k structures from each cluster. 31 | allow_duplicate: Whether structures are allowed to be selected over once. 32 | selection_criteria: The criteria to do stratified sampling from each cluster. Supported criterion 33 | include "random", "smallest", and "center" (default). By default, structures are ranked with respect to 34 | their distances to the centroid of the cluster they locate, then up to k structures with fixed ranking 35 | intervals are selected from each cluster, and when k=1, the structure with the smallest Euclidean 36 | distance to the centroid of each cluster is sampled. For "random", k structures are randomly sampled 37 | with replacement. For "smallest", it is ensured to select the k structures with the least number of 38 | atoms in each cluster. 39 | n_sites: The number of sites in all the structures to sample from. Only needed when 40 | selection_criteria="smallest". 41 | """ 42 | self.k = k 43 | self.allow_duplicate = allow_duplicate 44 | allowed_selection_criterion = ["random", "smallest", "center"] 45 | if selection_criteria not in allowed_selection_criterion: 46 | raise ValueError(f"Invalid selection_criteria, it must be one of {allowed_selection_criterion}.") 47 | if selection_criteria == "smallest" and not n_sites: 48 | raise ValueError('n_sites must be provided when selection_criteria="smallest."') 49 | self.selection_criteria = selection_criteria 50 | self.n_sites = n_sites 51 | 52 | def fit(self, X, y=None): 53 | """ 54 | Fit the model. 55 | 56 | Args: 57 | X: Input features 58 | y: Target. 59 | """ 60 | return self 61 | 62 | def transform(self, clustering_data: dict): 63 | """ 64 | Perform stratified sampling of data from each cluster 65 | based on clustering results. 66 | 67 | Args: 68 | clustering_data: Results from clustering in a dict. The dict 69 | should at least contain "PCAfeatures" and their respective 70 | "labels" of belonged clusters. The positions of centroid 71 | for each cluster should also be provided with "label_centers", 72 | with which data in each cluster can be ranked according to 73 | their Euclidean distance to centroid and then selected by 74 | interval for optimal coverage. 
75 | 76 | Returns: 77 | A dict with "PCAfeatures" used in clustering and "selected_indices" 78 | as the indices of DIRECT sampled structures. 79 | """ 80 | if any(key not in clustering_data for key in ["labels", "PCAfeatures"]): 81 | raise Exception( 82 | "The data returned by clustering step should at least provide label and feature information." 83 | ) 84 | if self.selection_criteria == "center" and "label_centers" not in clustering_data: 85 | warnings.warn( 86 | "Centroid location is not provided, so random selection from each cluster will be performed, " 87 | "which likely will still outperform manual sampling in terms of feature coverage. " 88 | ) 89 | if self.selection_criteria == "smallest": 90 | try: 91 | assert len(self.n_sites) == len(clustering_data["PCAfeatures"]) 92 | except Exception: 93 | raise ValueError("n_sites must have same length as features processed in clustering.") 94 | 95 | selected_indices = [] 96 | for label in set(clustering_data["labels"]): 97 | indices_same_label = np.where(label == clustering_data["labels"])[0] 98 | features_same_label = clustering_data["PCAfeatures"][indices_same_label] 99 | n_same_label = len(features_same_label) 100 | if "label_centers" in clustering_data and self.selection_criteria == "center": 101 | center_same_label = clustering_data["label_centers"][label] 102 | distance_to_center = np.linalg.norm(features_same_label - center_same_label, axis=1).reshape( 103 | len(indices_same_label) 104 | ) 105 | select_k_indices = np.array([int(i) for i in np.linspace(0, n_same_label - 1, self.k)]) 106 | selected_indices.extend( 107 | indices_same_label[np.argpartition(distance_to_center, select_k_indices)[select_k_indices]] 108 | ) 109 | elif self.selection_criteria == "smallest": 110 | if self.k >= n_same_label: 111 | selected_indices.extend(indices_same_label) 112 | else: 113 | select_k_indices = np.arange(self.k) 114 | selected_indices.extend( 115 | indices_same_label[ 116 | np.argpartition( 117 | np.array(self.n_sites)[indices_same_label], 118 | select_k_indices, 119 | )[select_k_indices] 120 | ] 121 | ) 122 | else: 123 | selected_indices.extend(indices_same_label[np.random.randint(n_same_label, size=self.k)]) 124 | n_duplicate = len(selected_indices) - len(set(selected_indices)) 125 | if not self.allow_duplicate and n_duplicate > 0: 126 | selected_indices = list(set(selected_indices)) 127 | elif self.allow_duplicate and n_duplicate > 0: 128 | warnings.warn(f"There are {n_duplicate} duplicated selections.") 129 | logger.info(f"Finally selected {len(selected_indices)} configurations.") 130 | return { 131 | "PCAfeatures": clustering_data["PCAfeatures"], 132 | "selected_indices": selected_indices, 133 | } 134 | -------------------------------------------------------------------------------- /src/dpeva/uncertain/__init__.py: -------------------------------------------------------------------------------- 1 | """Package implementing random network distillation""" 2 | 3 | 4 | if __name__ == '__main__': 5 | ... 
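
(Note: the `uncertain` subpackage above is currently an empty stub. For orientation only, below is a minimal PyTorch sketch of the Random-Network-Distillation-style uncertainty mentioned in the README; it is not the package's actual API, and all class, method, and size choices are hypothetical.)

```python
import torch
from torch import nn

class RNDUncertainty(nn.Module):
    """Sketch of an RND-style novelty score on descriptor vectors (hypothetical API)."""

    def __init__(self, n_desc: int, hidden: int = 128, n_out: int = 32):
        super().__init__()
        def mlp() -> nn.Sequential:
            return nn.Sequential(nn.Linear(n_desc, hidden), nn.SiLU(), nn.Linear(hidden, n_out))
        self.target = mlp()      # fixed, randomly initialised network
        self.predictor = mlp()   # trained to imitate the target on already-seen data
        for p in self.target.parameters():
            p.requires_grad_(False)

    def score(self, desc: torch.Tensor) -> torch.Tensor:
        # Large prediction error => descriptor far from the training distribution.
        return ((self.predictor(desc) - self.target(desc)) ** 2).mean(dim=-1)

    def train_step(self, desc: torch.Tensor, opt: torch.optim.Optimizer) -> float:
        opt.zero_grad()
        loss = self.score(desc).mean()
        loss.backward()
        opt.step()
        return float(loss)
```

The optimizer would typically be built over `model.predictor.parameters()` only, so that the frozen target network stays fixed.
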
-------------------------------------------------------------------------------- /utils/dpdata_update.py: -------------------------------------------------------------------------------- 1 | import dpdata 2 | from copy import deepcopy 3 | 4 | # load data 5 | data_train_string = "target-data-0" 6 | data_valid_string = "pool-data-0" 7 | data_select_string = "select-data-0" 8 | train_update_string = "target-data-1" 9 | valid_update_string = "pool-data-1" 10 | dpdata_string = "O*" 11 | 12 | dpdata_train = dpdata.MultiSystems.from_dir(data_train_string, dpdata_string, fmt="deepmd/npy") 13 | dpdata_valid = dpdata.MultiSystems.from_dir(data_valid_string, dpdata_string, fmt="deepmd/npy") 14 | dpdata_select = dpdata.MultiSystems.from_dir(data_select_string, dpdata_string, fmt="deepmd/npy") 15 | 16 | # info 17 | print("Target Data:", dpdata_train) 18 | print("Pool Data:", dpdata_valid) 19 | 20 | dpdata_train_update = deepcopy(dpdata_train) 21 | dpdata_valid_update = dpdata.MultiSystems() 22 | 23 | # add the selected data to the training data 24 | # and remove the selected data from the validation data 25 | select_data_list = [] 26 | for lbsys in dpdata_select: 27 | dpdata_train_update.append(lbsys) 28 | for sys in lbsys: 29 | select_data_list.append(sys.data["energies"]) 30 | for lbsys in dpdata_valid: 31 | for sys in lbsys: 32 | if sys.data['energies'] not in select_data_list: 33 | dpdata_valid_update.append(sys) 34 | # info 35 | print("Selected Data:", dpdata_select) 36 | print("Updated Target Data:", dpdata_train_update) 37 | print("Updated Pool Data:", dpdata_valid_update) 38 | 39 | # save 40 | dpdata_train_update.to_deepmd_npy(train_update_string) 41 | dpdata_valid_update.to_deepmd_npy(valid_update_string) 42 | dpdata_train_update.to_deepmd_npy_mixed(f"{train_update_string}-mixed") 43 | dpdata_valid_update.to_deepmd_npy_mixed(f"{valid_update_string}-mixed") 44 | -------------------------------------------------------------------------------- /utils/dpdescriptor/calc_desc.py: -------------------------------------------------------------------------------- 1 | import dpdata 2 | from deepmd.infer.deep_pot import DeepPot 3 | #from deepmd.calculator import DP 4 | import numpy as np 5 | import os 6 | import time 7 | import gc 8 | import sys 9 | import logging 10 | 11 | modelpath = "./FeCHO-dpa231-v2-7-3heads-100w.pt" 12 | onedir = sys.argv[1] 13 | if len(sys.argv) < 3: 14 | savedir = "descriptors" 15 | else: 16 | savedir= sys.argv[2] 17 | 18 | omp = 16 19 | os.environ['OMP_NUM_THREADS'] = f'{omp}' 20 | 21 | def descriptor_from_model(sys: dpdata.LabeledSystem, model:DeepPot): 22 | coords = sys.data["coords"] 23 | cells = sys.data["cells"] 24 | model_type_map = model.get_type_map() 25 | type_trans = np.array([model_type_map.index(i) for i in sys.data['atom_names']]) 26 | atypes = list(type_trans[sys.data['atom_types']]) 27 | predict = model.eval_descriptor(coords, cells, atypes) 28 | return predict 29 | 30 | 31 | logging.basicConfig( 32 | level=logging.INFO, 33 | format='%(asctime)s - %(levelname)s - %(message)s', 34 | datefmt='%Y-%m-%d %H:%M:%S' 35 | ) 36 | 37 | 38 | # generate descriptor for alldata 39 | # desc_dict = {} 40 | logging.info("Start Generating Descriptors") 41 | 42 | if not os.path.exists(savedir): 43 | os.mkdir(savedir) 44 | 45 | onedata = dpdata.LabeledSystem(onedir, fmt="deepmd/npy") 46 | key = onedata.short_name 47 | save_key = f"{savedir}/{key}" 48 | logging.info(f"Generating descriptors for {key}") 49 | if os.path.exists(save_key): 50 | if os.path.exists(f"{save_key}/desc.npy"): 51 
        logging.info(f"Descriptors for {key} already exist, skip")
        sys.exit(0)
else:
    # note: if save_key exists but desc.npy is missing, nothing is recomputed
    model = DeepPot(modelpath, head="Target_FTS")
    desc_list = []
    for onesys in onedata:
        desc_onesys = descriptor_from_model(onesys, model)
        desc_list.append(desc_onesys)
    desc = np.concatenate(desc_list, axis=0)
    os.mkdir(save_key)
    np.save(f"{savedir}/{key}/desc.npy", desc)
    logging.info(f"Descriptors for {key} Done")
--------------------------------------------------------------------------------
/utils/dpdescriptor/desc_all.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH -J desc_all
#SBATCH -p amd
##SBATCH -t 48:00:00
#SBATCH --ntasks=1
#SBATCH --cores-per-socket=1
#SBATCH --cpus-per-task=16
#SBATCH -o desc_genall.out
#SBATCH -e desc_genall.err

# use calc_desc.py to calculate descriptors for deepmd/npy in all directories
# usage: bash desc_all.sh

if [[ -z $SLURM_CPUS_PER_TASK ]]
then
    SLURM_CPUS_PER_TASK=4
fi
data_dirs=$(ls -d ./data-clean-v2-7-20873-npy/*)

export OMP_NUM_THREADS=`expr $SLURM_CPUS_PER_TASK \* 4`
desc_dir="./descriptors"
for dir in $data_dirs
do
    echo "deal with $dir"
    dirname=$(basename "$dir")
    if [[ -e "${desc_dir}/${dirname}/desc.npy" ]]
    then
        echo "desc for ${dir} already exists"
        continue
    else
        python calc_desc.py $dir $desc_dir
    fi
done
--------------------------------------------------------------------------------
/utils/dpdescriptor/gen_desc.py:
--------------------------------------------------------------------------------
# generate descriptors for deepmd/npy/mixed (and deepmd/npy) data
import dpdata
from deepmd.infer.deep_pot import DeepPot
#from deepmd.calculator import DP
import numpy as np
import os
import logging
import time
import glob
from torch.cuda import empty_cache

datadir = "./sampled-data"
format = "deepmd/npy/mixed" # default
modelpath = "./model.ckpt.pt"
savedir = "descriptors"
head = None # multi head for LAM

omp = 16
batch_size = 4000
os.environ['OMP_NUM_THREADS'] = f'{omp}'

# notice: DeepPot.eval_descriptor has a parameter "mixed_type"
def descriptor_from_model(sys: dpdata.System, model:DeepPot, nopbc=False) -> np.ndarray:
    coords = sys.data["coords"]
    cells = sys.data["cells"]
    if nopbc:
        cells = None
    model_type_map = model.get_type_map()
    type_trans = np.array([model_type_map.index(i) for i in sys.data['atom_names']])
    atypes = list(type_trans[sys.data['atom_types']])
    predict = model.eval_descriptor(coords, cells, atypes)
    return predict

def get_desc_by_batch(sys: dpdata.System, model:DeepPot, batch_size: int, nopbc=False) -> list:
    desc_list = []
    for i in range(0, len(sys), batch_size):
        batch = sys[i:i + batch_size]
        desc_batch = descriptor_from_model(batch, model, nopbc=nopbc)
        desc_list.append(desc_batch)
    return desc_list

# init
# mixed type need to be read-in iteratively in desc-gen
# perhaps npy data can be dealt with in the same manner
# alldata = dpdata.MultiSystems()

model = DeepPot(modelpath, head=head)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info("Start Generating Descriptors")

if not os.path.exists(savedir):
    os.makedirs(savedir)

with open("running", "w") as fo:
    starting_time = time.perf_counter()
    for item in sorted(glob.glob(f"{datadir}/*")):
        key = os.path.split(item)[-1]
        save_key = f"{savedir}/{key}"
        logging.info(f"Generating descriptors for {key} system")
        if os.path.exists(save_key):
            if os.path.exists(f"{save_key}/desc.npy"):
                logging.info(f"Descriptors for {key} already exist, skip")
                continue
        if format == "deepmd/npy/mixed":
            onedata = dpdata.MultiSystems.from_file(item, fmt=format)
        else:
            onedata = dpdata.System(item, fmt=format)
        # use for-loop to avoid OOM in old ver
        desc_list = []
        if format == "deepmd/npy/mixed":
            for onesys in onedata:
                nopbc = onesys.data.get('nopbc', False)
                # pass the current system (onesys), not the whole MultiSystems
                one_desc_list = get_desc_by_batch(onesys, model, batch_size, nopbc=nopbc)
                desc_list.extend(one_desc_list)
        else:
            nopbc = onedata.data.get('nopbc', False)
            desc_list = get_desc_by_batch(onedata, model, batch_size, nopbc=nopbc)

        desc = np.concatenate(desc_list, axis=0)
        logging.info(f"Descriptors for {key} generated")
        os.mkdir(save_key)
        np.save(f"{savedir}/{key}/desc.npy", desc)
        logging.info(f"Descriptors for {key} saved")
        del onedata, desc, desc_list
        empty_cache()
    ending_time = time.perf_counter()
    fo.write(f"DONE in {ending_time - starting_time} sec !")

logging.info("All Done !!!")
os.system("mv running done")
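
(The per-system `desc.npy` files written by the scripts above hold per-atom descriptors of shape `(nframes, natoms, ndesc)`. A small sketch, with illustrative paths, of pooling them into per-structure feature vectors, the form consumed by the DIRECT sampler and by the PCA / t-SNE scripts below:)

```python
import glob
import numpy as np

per_structure = []
for path in sorted(glob.glob("descriptors/*/desc.npy")):
    desc = np.load(path)                      # (nframes, natoms, ndesc) for one system
    per_structure.append(desc.mean(axis=1))   # average over atoms -> (nframes, ndesc)

features = np.concatenate(per_structure, axis=0)
np.save("desc_stru.npy", features)            # ready for DIRECTSampler / PCA / t-SNE
```
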
--------------------------------------------------------------------------------
/utils/dpdescriptor/gen_desc_npy.py:
--------------------------------------------------------------------------------
import dpdata
from deepmd.infer.deep_pot import DeepPot
#from deepmd.calculator import DP
import numpy as np
import os
import logging
import time
from torch.cuda import empty_cache

datadir = "./sampled-data-direct-10p-npy"
format = "deepmd/npy" # default
modelpath = "./model.ckpt.pt"
savedir = "descriptors"
data_string = "O*" # for dpdata.MultiSystems.from_dir
head = None # multi head for LAM

omp = 16
batch_size = 4000
os.environ['OMP_NUM_THREADS'] = f'{omp}'

def descriptor_from_model(sys: dpdata.System, model:DeepPot) -> np.ndarray:
    coords = sys.data["coords"]
    cells = sys.data["cells"]
    model_type_map = model.get_type_map()
    type_trans = np.array([model_type_map.index(i) for i in sys.data['atom_names']])
    atypes = list(type_trans[sys.data['atom_types']])
    predict = model.eval_descriptor(coords, cells, atypes)
    return predict

#alldata = dpdata.MultiSystems.from_file(datadir, fmt=format)
alldata = dpdata.MultiSystems.from_dir(datadir, data_string, fmt=format)
model = DeepPot(modelpath, head=head)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info("Start Generating Descriptors")

if not os.path.exists(savedir):
    os.mkdir(savedir)

with open("running", "w") as fo:
    starting_time = time.perf_counter()
    for onedata in alldata:
        onedata: dpdata.System
        key = onedata.short_name
        save_key = f"{savedir}/{key}"
        logging.info(f"Generating descriptors for {key} system")
        if os.path.exists(save_key):
            if os.path.exists(f"{save_key}/desc.npy"):
                logging.info(f"Descriptors
for {key} already exist, skip") 54 | continue 55 | # use for-loop to avoid OOM in old ver 56 | desc_list = [] 57 | for i in range(0, len(onedata), batch_size): 58 | batch = onedata[i:i + batch_size] 59 | desc_batch = descriptor_from_model(batch, model) 60 | desc_list.append(desc_batch) 61 | desc = np.concatenate(desc_list, axis=0) 62 | logging.info(f"Descriptors for {key} generated") 63 | os.mkdir(save_key) 64 | np.save(f"{savedir}/{key}/desc.npy", desc) 65 | logging.info(f"Descriptors for {key} saved") 66 | del onedata, desc, desc_list 67 | empty_cache() 68 | ending_time = time.perf_counter() 69 | fo.write(f"DONE in {ending_time - starting_time} sec !") 70 | 71 | logging.info("All Done !!!") 72 | os.system("mv running done") -------------------------------------------------------------------------------- /utils/dpdescriptor/gen_desc_stru.py: -------------------------------------------------------------------------------- 1 | # generated desc for deepmd/npy/mixed (and deepmd/npy) 2 | # directly get desc for stru by pooling, saving storage 3 | import dpdata 4 | from deepmd.infer.deep_pot import DeepPot 5 | #from deepmd.calculator import DP 6 | import numpy as np 7 | import os 8 | import logging 9 | import time 10 | import glob 11 | from torch.cuda import empty_cache 12 | 13 | datadir = "./sampled-data" 14 | format = "deepmd/npy/mixed" # default 15 | modelpath = "./model.ckpt.pt" 16 | savedir = "descriptors" 17 | head = None # multi head for LAM 18 | 19 | omp = 16 20 | batch_size = 4000 21 | os.environ['OMP_NUM_THREADS'] = f'{omp}' 22 | 23 | def descriptor_from_model(sys: dpdata.System, model:DeepPot) -> np.ndarray: 24 | coords = sys.data["coords"] 25 | cells = sys.data["cells"] 26 | model_type_map = model.get_type_map() 27 | type_trans = np.array([model_type_map.index(i) for i in sys.data['atom_names']]) 28 | atypes = list(type_trans[sys.data['atom_types']]) 29 | predict = model.eval_descriptor(coords, cells, atypes) 30 | return predict 31 | 32 | # init 33 | # mixed type need to be read-in iteratively in desc-gen 34 | # perhaps npy data can be dealed with in same manner 35 | # alldata = dpdata.MultiSystems() 36 | 37 | model = DeepPot(modelpath, head=head) 38 | logging.basicConfig( 39 | level=logging.INFO, 40 | format='%(asctime)s - %(levelname)s - %(message)s', 41 | datefmt='%Y-%m-%d %H:%M:%S' 42 | ) 43 | 44 | logging.info("Start Generating Descriptors") 45 | 46 | if not os.path.exists(savedir): 47 | os.mkdir(savedir) 48 | 49 | with open("running", "w") as fo: 50 | starting_time = time.perf_counter() 51 | for item in sorted(glob.glob(f"{datadir}/*")): 52 | key = os.path.split(item)[-1] 53 | save_key = f"{savedir}/{key}" 54 | logging.info(f"Generating descriptors for {key} system") 55 | if os.path.exists(save_key): 56 | if os.path.exists(f"{save_key}/desc.npy"): 57 | logging.info(f"Descriptors for {key} already exist, skip") 58 | continue 59 | onedata = dpdata.MultiSystems.from_file(item, fmt=format) 60 | # use for-loop to avoid OOM in old ver 61 | desc_list = [] 62 | for onesys in onedata: 63 | onesys: dpdata.System 64 | for i in range(0, len(onesys), batch_size): 65 | batch = onesys[i:i + batch_size] 66 | desc_batch = descriptor_from_model(batch, model) 67 | desc_list.append(desc_batch) 68 | desc = np.concatenate(desc_list, axis=0) 69 | desc_stru = np.mean(desc, axis=1) 70 | logging.info(f"Descriptors for {key} generated") 71 | os.mkdir(save_key) 72 | np.save(f"{savedir}/{key}/desc.npy", desc_stru) 73 | logging.info(f"Descriptors for {key} saved") 74 | del onedata, desc, desc_stru, 
desc_list 75 | empty_cache() 76 | ending_time = time.perf_counter() 77 | fo.write(f"DONE in {ending_time - starting_time} sec !") 78 | 79 | logging.info("All Done !!!") 80 | os.system("mv running done") -------------------------------------------------------------------------------- /utils/dpdescriptor/pca_desc_stru.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import dpdata 4 | import glob 5 | import seaborn as sns 6 | import pandas as pd 7 | import os 8 | from sklearn.decomposition import PCA 9 | 10 | 11 | # load data 12 | desc_string = 'descriptors-30656/*/desc.npy' 13 | dpdata_name = "FeCHO-clean-30656" 14 | dpdata_path = "./clean-30656" 15 | dpdata_string = "C*" 16 | 17 | # PCA setting 18 | ndim = 20 19 | save_name = f"PCA-desc-stru-{ndim}dim" 20 | 21 | # define used function 22 | def extract_elements_array(data: dpdata.LabeledSystem) -> list: 23 | '''extract elements array from dpdata for draw PCA''' 24 | types = data.data['atom_types'] 25 | names = data.data['atom_names'] 26 | ele_array = [names[ind] for ind in types] 27 | return ele_array 28 | 29 | # 30 | if os.path.exists(f'{save_name}.pickle'): 31 | print(f"{save_name}.pickle already exists, skip PCA.") 32 | print(f"Data loaded from {save_name}.pickle") 33 | df_desc = pd.read_pickle(f'{save_name}.pickle') 34 | else: 35 | # read descriptors/*/desc.npy data 36 | print("Reading descriptor results...") 37 | desc_keys = [] 38 | all_desc_stru = [] 39 | for f in glob.glob(desc_string): 40 | # extract dirname of desc.npy from descriptors/* 41 | directory, _ = os.path.split(f) 42 | _, keyname = os.path.split(directory) 43 | desc_keys.append(keyname) 44 | one_desc = np.load(f) # nframe, natoms, ndesc 45 | # do average in natoms dimension 46 | one_desc_stru = np.mean(one_desc, axis=1) 47 | all_desc_stru.append(one_desc_stru) 48 | all_desc_stru = np.concatenate(all_desc_stru, axis=0) 49 | 50 | # read dpdata for element type information 51 | print("Reading corresponding dpdata...") 52 | alldata = dpdata.MultiSystems.from_dir(dpdata_path, dpdata_string, fmt="deepmd/npy") 53 | 54 | alldata_dict = {} 55 | for lbsys in alldata: 56 | alldata_dict[lbsys.short_name] = lbsys 57 | 58 | # get list of system name 59 | sys_list = [] 60 | for keyname in desc_keys: 61 | target_sys = alldata_dict[keyname] 62 | for ind in range(target_sys.get_nframes()): 63 | sys_list.append(f"{keyname}-{ind}") 64 | 65 | # get element ratio 66 | element_ratio_dict = {} 67 | element_names = alldata[0].get_atom_names() 68 | for element in element_names: 69 | ratio_for_ele = [] 70 | for keyname in desc_keys: 71 | target_sys = alldata_dict[keyname] 72 | ratio = target_sys.get_atom_numbs()[target_sys.get_atom_names().index(element)] / np.sum(target_sys.get_atom_numbs()) 73 | ratio_for_ele.extend([ratio] * target_sys.get_nframes()) 74 | element_ratio_dict[element] = ratio_for_ele 75 | 76 | # do PCA, most time consuming step 77 | pdf = pd.DataFrame(all_desc_stru) 78 | 79 | print("Doing PCA...") 80 | pca = PCA( 81 | n_components=ndim, 82 | ) 83 | 84 | embedding = pca.fit_transform(pdf) 85 | embedding_np = embedding[:, :2] 86 | print("PCA done.") 87 | 88 | # get formation energy of each stru 89 | # 生成能字典 90 | elements_ref_ene = { 91 | "C": -155.07351, 92 | "Fe": -3220.20451, 93 | "H": -15.849995, 94 | "O": -432.63044825, 95 | } 96 | 97 | # 根据dataname得到生成能 98 | def get_ref_ene(dataname, elements_ref_ene=elements_ref_ene): 99 | ref_ene = 0 100 | ene_list = list(elements_ref_ene.keys()) 
101 | for ele in ene_list: 102 | dataname = dataname.replace(ele, f" {ele},") 103 | ene_string_dict = dataname.strip().split(" ") # O,2 as example 104 | natom = 0 105 | for ind, ele_string in enumerate(ene_string_dict): 106 | ele_list = ele_string.split(',') 107 | ele = ele_list[0] 108 | num = ele_list[1] 109 | natom += eval(num) 110 | ref_ene += eval(num) * elements_ref_ene[ele] 111 | ref_ene /= natom # 归一化到ev-per-atom 112 | return ref_ene 113 | 114 | form_ene_list = [] 115 | for keyname in desc_keys: 116 | target_sys = alldata_dict[keyname] 117 | for ene in target_sys.data['energies']: 118 | form_ene_list.append(ene/np.sum(target_sys.get_atom_numbs()) - get_ref_ene(target_sys.short_name)) 119 | 120 | # to pandas 121 | df_desc = pd.DataFrame(embedding_np, columns=['Dim1','Dim2']) 122 | df_desc['sys_name'] = sys_list 123 | df_desc['E_Form'] = form_ene_list 124 | for ele,ratio_for_ele in element_ratio_dict.items(): 125 | df_desc[f'ratio_{ele}'] = ratio_for_ele 126 | 127 | df_desc.to_pickle(f'{save_name}.pickle') 128 | print(f"Data saved as {save_name}.pickle") 129 | 130 | 131 | # draw graph 132 | print("Drawing graph...") 133 | plt.figure(figsize=(10, 8)) 134 | sns.scatterplot(x='Dim1', y='Dim2', hue='ratio_Fe', data=df_desc, palette='viridis', s=100, alpha=0.7) 135 | plt.title(f'PCA of {dpdata_name} dataset stru by ndim {ndim}') 136 | plt.xlabel('Principal Component 1') 137 | plt.ylabel('Principal Component 2') 138 | plt.legend(title='Fe ratio') 139 | plt.grid(True) 140 | 141 | 142 | plt.savefig(f'{save_name}.png',dpi=200) 143 | print(f"Graph saved as {save_name}.png") 144 | 145 | print("All done.") 146 | -------------------------------------------------------------------------------- /utils/dpdescriptor/tsne_cuml_stru.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import dpdata 4 | import glob 5 | import seaborn as sns 6 | import pandas as pd 7 | import os 8 | import cudf 9 | import cuml 10 | 11 | 12 | # load data 13 | desc_string = 'desc-v2-2-radsp/*/desc.npy' 14 | dpdata_name = "FeCHO-clean-v2-2-28830" 15 | dpdata_path = "./data-clean-v2-2-28830" 16 | dpdata_string = "O*" 17 | 18 | # t-SNE setting 19 | perplex_num = 50 20 | n_iter = 1600 21 | save_name = f"tsne-desc-stru-{perplex_num}per-{n_iter}niter" 22 | 23 | # define used function 24 | def extract_elements_array(data: dpdata.LabeledSystem) -> list: 25 | '''extract elements array from dpdata for draw t-SNE''' 26 | types = data.data['atom_types'] 27 | names = data.data['atom_names'] 28 | ele_array = [names[ind] for ind in types] 29 | return ele_array 30 | 31 | # 32 | if os.path.exists(f'{save_name}.pickle'): 33 | print(f"{save_name}.pickle already exists, skip t-SNE.") 34 | print(f"Data loaded from {save_name}.pickle") 35 | df_desc = pd.read_pickle(f'{save_name}.pickle') 36 | else: 37 | # read descriptors/*/desc.npy data 38 | print("Reading descriptor results...") 39 | desc_keys = [] 40 | all_desc_stru = [] 41 | for f in glob.glob(desc_string): 42 | # extract dirname of desc.npy from descriptors/* 43 | directory, _ = os.path.split(f) 44 | _, keyname = os.path.split(directory) 45 | desc_keys.append(keyname) 46 | one_desc = np.load(f) # nframe, natoms, ndesc 47 | # do average in natoms dimension 48 | one_desc_stru = np.mean(one_desc, axis=1) 49 | all_desc_stru.append(one_desc_stru) 50 | all_desc_stru = np.concatenate(all_desc_stru, axis=0) 51 | 52 | # read dpdata for element type information 53 | print("Reading corresponding dpdata...") 
    alldata = dpdata.MultiSystems.from_dir(dpdata_path, dpdata_string, fmt="deepmd/npy")

    alldata_dict = {}
    for lbsys in alldata:
        alldata_dict[lbsys.short_name] = lbsys

    # get list of system name
    sys_list = []
    for keyname in desc_keys:
        target_sys = alldata_dict[keyname]
        for ind in range(target_sys.get_nframes()):
            sys_list.append(f"{keyname}-{ind}")

    # get element ratio
    element_ratio_dict = {}
    element_names = alldata[0].get_atom_names()
    for element in element_names:
        ratio_for_ele = []
        for keyname in desc_keys:
            target_sys = alldata_dict[keyname]
            ratio = target_sys.get_atom_numbs()[target_sys.get_atom_names().index(element)] / np.sum(target_sys.get_atom_numbs())
            ratio_for_ele.extend([ratio] * target_sys.get_nframes())
        element_ratio_dict[element] = ratio_for_ele

    # do t-SNE, most time consuming step
    gdf = cudf.DataFrame(all_desc_stru)

    print("Doing t-SNE...")
    tsne = cuml.TSNE(
        n_components=2,
        random_state=42,
        perplexity=perplex_num,
        n_iter=n_iter,
        init='pca',
    )

    embedding = tsne.fit_transform(gdf)
    embedding_np = embedding.to_pandas().values
    print("t-SNE done.")

    # get formation energy of each stru
    # dictionary of elemental reference energies (eV) used for formation energies
    elements_ref_ene = {
        "C": -155.07351,
        "Fe": -3220.20451,
        "H": -15.849995,
        "O": -432.63044825,
    }

    # get the per-atom reference energy from the system name (e.g. "Fe2O3")
    def get_ref_ene(dataname, elements_ref_ene=elements_ref_ene):
        ref_ene = 0
        ene_list = list(elements_ref_ene.keys())
        for ele in ene_list:
            dataname = dataname.replace(ele, f" {ele},")
        ene_string_dict = dataname.strip().split(" ") # O,2 as example
        natom = 0
        for ind, ele_string in enumerate(ene_string_dict):
            ele_list = ele_string.split(',')
            ele = ele_list[0]
            num = ele_list[1]
            natom += eval(num)
            ref_ene += eval(num) * elements_ref_ene[ele]
        ref_ene /= natom # normalize to eV per atom
        return ref_ene

    form_ene_list = []
    for keyname in desc_keys:
        target_sys = alldata_dict[keyname]
        for ene in target_sys.data['energies']:
            form_ene_list.append(ene/np.sum(target_sys.get_atom_numbs()) - get_ref_ene(target_sys.short_name))

    # to pandas
    df_desc = pd.DataFrame(embedding_np, columns=['Dim1','Dim2'])
    df_desc['sys_name'] = sys_list
    df_desc['E_Form'] = form_ene_list
    for ele,ratio_for_ele in element_ratio_dict.items():
        df_desc[f'ratio_{ele}'] = ratio_for_ele

    df_desc.to_pickle(f'{save_name}.pickle')
    print(f"Data saved as {save_name}.pickle")


# draw graph
print("Drawing graph...")
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Dim1', y='Dim2', hue='ratio_Fe', data=df_desc, palette='viridis', s=100, alpha=0.7)
plt.title(f't-SNE of {dpdata_name} dataset structures (perplexity {perplex_num}, n_iter {n_iter})')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.legend(title='Fe ratio')
plt.grid(True)


plt.savefig(f'{save_name}.png',dpi=200)
print(f"Graph saved as {save_name}.png")

print("All done.")
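
(If cuML / a GPU is unavailable, scikit-learn's t-SNE, which is already a project dependency, can be swapped in for small datasets. This is an illustrative sketch rather than part of the repository, with a placeholder input path, and it will be much slower than cuML:)

```python
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

all_desc_stru = np.load("desc_stru.npy")   # illustrative path; see the pooling sketch above
tsne = TSNE(n_components=2, random_state=42, perplexity=50, init="pca")
embedding_np = tsne.fit_transform(all_desc_stru)
df_desc = pd.DataFrame(embedding_np, columns=["Dim1", "Dim2"])
print(df_desc.head())
```
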