├── .gitignore ├── LICENSE ├── README.md ├── algorithms ├── epigenomics │ └── epigenomic_algorithms.py ├── genomics │ └── genomics_algorithms.py └── proteomics │ └── proteomic_algorithms.py ├── config ├── config.json └── environment.yml ├── data ├── __init__.py ├── epigenomics │ ├── data_loader.py │ ├── data_processor.py │ ├── data_visualizer.py │ ├── epigenomics_data.csv │ └── epigenomics_metadata.csv ├── genomics │ ├── data_loader.py │ ├── data_processor.py │ ├── data_visualizer.py │ ├── genomics_data.csv │ └── genomics_metadata.csv └── proteomics │ ├── data_loader.py │ ├── data_processor.py │ ├── data_visualizer.py │ ├── proteomics_data.csv │ └── proteomics_metadata.csv ├── docs └── medaxis.jpeg ├── models ├── epigenomics │ └── model.py ├── genomics │ └── model.py └── proteomics │ └── model.py ├── pipelines ├── epigenomics │ └── epigenomics_pipeline.py ├── genomics │ └── genomics_pipeline.py └── proteomics │ └── proteomics_pipeline.py ├── requirements.txt └── utils ├── __init__.py ├── data_utils.py ├── logging.py └── math_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Google App Engine generated folder 2 | appengine-generated/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 KOSASIH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![WHO Certification](https://img.shields.io/badge/WHO%20Certification-2022-blue.svg)](https://www.who.int/) 2 | [![NIH Certification](https://img.shields.io/badge/NIH%20Certification-2022-blue.svg)](https://www.nih.gov/) 3 | [![CDC Certification](https://img.shields.io/badge/CDC%20Certification-2022-blue.svg)](https://www.cdc.gov/) 4 | [![ECDC Certification](https://img.shields.io/badge/ECDC%20Certification-2022-blue.svg)](https://www.ecdc.europa.eu/) 5 | [![PAHO Certification](https://img.shields.io/badge/PAHO%20Certification-2022-blue.svg)](https://www.paho.org/) 6 | [![UNICEF Certification](https://img.shields.io/badge/UNICEF%20Certification-2022-blue.svg)](https://www.unicef.org/) 7 | [![World Bank Certification](https://img.shields.io/badge/World%20Bank%20Certification-2022-blue.svg)](https://www.worldbank.org/) 8 | [![Bill and Melinda Gates Foundation Certification](https://img.shields.io/badge/Bill%20and%20Melinda%20Gates%20Foundation%20Certification-2022-blue.svg)](https://www.gatesfoundation.org/) 9 | [![OSI Certification](https://img.shields.io/badge/OSI%20Certification-2022-blue.svg)](https://opensource.org/) 10 | 11 |

MedAxis by KOSASIH is licensed under Creative Commons Attribution 4.0 International

12 | 13 | # medaxis-core 14 | The central repository for MedAxis, housing the core algorithms, models, and data pipelines for advanced genomics, proteomics, and epigenomics research. 15 | 16 | # MedAxis Core 17 | ================ 18 | 19 | The central repository for MedAxis, housing the core algorithms, models, and data pipelines for advanced genomics, proteomics, and epigenomics research. 20 | 21 | ## Overview 22 | -------- 23 | 24 | MedAxis Core is an open-source repository that provides a comprehensive suite of algorithms, models, and data pipelines for advanced genomics, proteomics, and epigenomics research. The repository is designed to facilitate the development of novel analytical tools and methods for the analysis of large-scale biological data. 25 | 26 | ## Features 27 | -------- 28 | 29 | * **Genomics Pipelines**: A comprehensive suite of pipelines for genomics data analysis, including data preprocessing, alignment, variant calling, and downstream analysis. 30 | * **Proteomics Pipelines**: A comprehensive suite of pipelines for proteomics data analysis, including data preprocessing, peptide identification, and protein quantification. 31 | * **Epigenomics Pipelines**: A comprehensive suite of pipelines for epigenomics data analysis, including data preprocessing, peak calling, and downstream analysis. 32 | * **Machine Learning Models**: A collection of machine learning models for the analysis of genomics, proteomics, and epigenomics data, including models for classification, regression, and clustering. 33 | * **Data Visualization Tools**: A suite of data visualization tools for the visualization of genomics, proteomics, and epigenomics data, including tools for heatmap generation, scatter plot generation, and genome browser visualization. 34 | 35 | ## Getting Started 36 | --------------- 37 | 38 | To get started with MedAxis Core, follow these steps: 39 | 40 | 1. Clone the repository: `git clone https://github.com/KOSASIH/medaxis-core.git` 41 | 2. Enter the repository: `cd medaxis-core` 42 | 3. Install the dependencies: `pip install -r requirements.txt` (see the Example Usage section below for a minimal end-to-end sketch) 43 | 44 | ## Contributing 45 | ------------ 46 | 47 | We welcome contributions to MedAxis Core! If you're interested in contributing, please follow these steps: 48 | 49 | 1. Fork the repository on GitHub: `https://github.com/KOSASIH/medaxis-core` 50 | 2. Create a new branch: `git checkout -b my-feature` 51 | 3. Make and stage your changes: `git add .` 52 | 4. Commit your changes: `git commit -m "My feature"` 53 | 5. Push your changes: `git push origin my-feature` 54 | 6. Open a pull request on GitHub 55 | 56 | ## License 57 | ------- 58 | 59 | MedAxis Core is released under the MIT License.
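## Example Usage
-------------

The snippet below is a minimal, illustrative sketch of how the data loaders and algorithm classes in this repository can be combined. It is not a definitive workflow: the import paths assume the repository root is on `PYTHONPATH` (the modules themselves import from a `medaxis_core` package, so adjust paths to match how you install the project), and the feature and target columns follow the small sample CSV under `data/genomics/`.

```python
import pandas as pd

# Assumed import paths based on the repository layout; adjust if the project
# is installed as the `medaxis_core` package instead.
from data.genomics.data_loader import GenomicsDataLoader
from algorithms.genomics.genomics_algorithms import GenomicsAlgorithms

# Load the sample genomics table (sample_id, chromosome, position, genotype).
loader = GenomicsDataLoader("data/genomics/genomics_data.csv",
                            metadata_file_path="data/genomics/genomics_metadata.csv")
data = loader.load_data()

# Use the numeric columns as features and the genotype labels as the target.
features = data[["chromosome", "position"]]
target = data["genotype"]

# Split, scale, train, and evaluate with the building-block methods.
algo = GenomicsAlgorithms(features, target)
X_train, X_test, y_train, y_test = algo.split_data()
X_train_scaled, X_test_scaled = algo.scale_data(X_train, X_test)
model = algo.train_model(X_train_scaled, y_train, model_type="random_forest")
accuracy, report, matrix = algo.evaluate_model(model, X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.3f}")
```

The same pattern applies to the proteomics and epigenomics classes, which expose identical method names.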
60 | 61 | ## Acknowledgments 62 | -------------- 63 | 64 | We would like to acknowledge the following individuals and organizations for their contributions to MedAxis Core: 65 | 66 | * All contributors 67 | * All organizations contributors 68 | 69 | -------------------------------------------------------------------------------- /algorithms/epigenomics/epigenomic_algorithms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class EpigenomicAlgorithms: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = 
gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels ) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | 
grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /algorithms/genomics/genomics_algorithms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging 
import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class GenomicsAlgorithms: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = 
SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = 
self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method ="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /algorithms/proteomics/proteomic_algorithms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class ProteomicAlgorithms: 27 | def __init__(self, data, target, test_size=0.2, 
random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = 
davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 
178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method ="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "database": { 3 | "host": "localhost", 4 | "port": 5432, 5 | "username": "medaxis", 6 | "password": "medaxis_password", 7 | "database": "medaxis_db" 8 | }, 9 | "storage": { 10 | "bucket": "medaxis-bucket", 11 | "region": "us-west-2" 12 | }, 13 | "compute": { 14 | "cluster": "medaxis-cluster", 15 | "queue": "medaxis-queue" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /config/environment.yml: -------------------------------------------------------------------------------- 1 | name: medaxis-core 2 | dependencies: 3 | - python=3.9 4 | - numpy 5 | - pandas 6 | - scikit-learn 7 | - tensorflow 8 | - pytorch 9 | - scipy 10 | - matplotlib 11 | - seaborn 12 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/epigenomics/data_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | from sklearn.model_selection import train_test_split 5 | from medaxis_core.utils.logging import logger 6 | 7 | class EpigenomicsDataLoader: 8 | def __init__(self, file_path, metadata_file_path=None, test_size=0.2, random_state=42): 9 | self.file_path = file_path 10 | self.metadata_file_path = metadata_file_path 11 | self.test_size = test_size 12 | self.random_state = random_state 13 | 14 | def load_data(self): 15 | logger.info(f"Loading data from {self.file_path}") 16 | data = pd.read_csv(self.file_path) 17 | return data 18 | 19 | def load_metadata(self): 20 | if self.metadata_file_path: 21 | logger.info(f"Loading metadata from {self.metadata_file_path}") 22 | metadata = pd.read_csv(self.metadata_file_path) 23 | return metadata 24 | else: 25 | logger.warning("No metadata file provided") 26 | return None 27 | 28 | def split_data(self, data, metadata=None): 29 | logger.info("Splitting data into training and testing sets") 30 | X = data.drop(['methylation_level'], axis=1) 31 | y = data['methylation_level'] 32 | X_train, X_test, 
y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state) 33 | if metadata: 34 | metadata_train, metadata_test = train_test_split(metadata, test_size=self.test_size, random_state=self.random_state) 35 | return X_train, X_test, y_train, y_test, metadata_train, metadata_test 36 | else: 37 | return X_train, X_test, y_train, y_test 38 | 39 | def shuffle_data(self, data): 40 | logger.info("Shuffling data") 41 | return shuffle(data, random_state=self.random_state) 42 | 43 | def get_data_stats(self, data): 44 | logger.info("Calculating data statistics") 45 | stats = data.describe() 46 | return stats 47 | 48 | def get_data_correlation_matrix(self, data): 49 | logger.info("Calculating data correlation matrix") 50 | corr_matrix = data.corr() 51 | return corr_matrix 52 | 53 | def filter_data(self, threshold=0.5): 54 | logger.info("Filtering data") 55 | filtered_data = data[data['methylation_level'] > threshold] 56 | return filtered_data 57 | 58 | def normalize_data(self, data): 59 | logger.info("Normalizing data") 60 | normalized_data = data / data.max() 61 | return normalized_data 62 | 63 | def encode_data(self, data): 64 | logger.info("Encoding data") 65 | encoded_data = pd.get_dummies(data) 66 | return encoded_data 67 | 68 | def extract_features(self, data): 69 | logger.info("Extracting features") 70 | features = data[['chromosome', 'position']] 71 | return features 72 | 73 | def extract_labels(self, data): 74 | logger.info("Extracting labels") 75 | labels = data['methylation_level'] 76 | return labels 77 | -------------------------------------------------------------------------------- /data/epigenomics/data_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 4 | from sklearn.impute import SimpleImputer 5 | from medaxis_core.utils.logging import logger 6 | 7 | class EpigenomicsDataProcessor: 8 | def __init__(self, data, scaler=StandardScaler(), imputer=SimpleImputer()): 9 | self.data = data 10 | self.scaler = scaler 11 | self.imputer = imputer 12 | 13 | def preprocess_data(self): 14 | logger.info("Preprocessing data") 15 | self.data = self.handle_missing_values() 16 | self.data = self.scale_data() 17 | self.data = self.normalize_data() 18 | self.data = self.encode_data() 19 | return self.data 20 | 21 | def handle_missing_values(self): 22 | logger.info("Handling missing values") 23 | imputed_data = self.imputer.fit_transform(self.data) 24 | return pd.DataFrame(imputed_data, columns=self.data.columns) 25 | 26 | def scale_data(self): 27 | logger.info("Scaling data") 28 | scaled_data = self.scaler.fit_transform(self.data) 29 | return pd.DataFrame(scaled_data, columns=self.data.columns) 30 | 31 | def normalize_data(self): 32 | logger.info("Normalizing data") 33 | normalized_data = self.data / self.data.max() 34 | return normalized_data 35 | 36 | def encode_data(self): 37 | logger.info("Encoding data") 38 | encoded_data = pd.get_dummies(self.data) 39 | return encoded_data 40 | 41 | def set_scaler(self, scaler): 42 | self.scaler = scaler 43 | 44 | def set_imputer(self, imputer): 45 | self.imputer = imputer 46 | 47 | def transform_data(self, transformation='log2'): 48 | logger.info("Transforming data") 49 | if transformation == 'log2': 50 | transformed_data = np.log2(self.data) 51 | elif transformation == 'log10': 52 | transformed_data = np.log10(self.data) 53 | else: 54 | 
logger.warning("Invalid transformation method") 55 | return self.data 56 | return pd.DataFrame(transformed_data, columns=self.data.columns) 57 | 58 | def reduce_dimensions(self, method='pca', n_components=2): 59 | logger.info("Reducing dimensions") 60 | if method == 'pca': 61 | from sklearn.decomposition import PCA 62 | pca = PCA(n_components=n_components) 63 | reduced_data = pca.fit_transform(self.data) 64 | elif method == 'tsne': 65 | from sklearn.manifold import TSNE 66 | tsne = TSNE(n_components=n_components) 67 | reduced_data = tsne.fit_transform(self.data) 68 | else: 69 | logger.warning("Invalid dimensionality reduction method") 70 | return self.data 71 | return pd.DataFrame(reduced_data, columns=[f"Component {i+1}" for i in range(n_components)]) 72 | 73 | def extract_features(self, data): 74 | logger.info("Extracting features") 75 | features = data[['chromosome', 'position']] 76 | return features 77 | 78 | def extract_labels(self, data): 79 | logger.info("Extracting labels") 80 | labels = data['methylation_level'] 81 | return labels 82 | 83 | def create_feature_matrix(self, data): 84 | logger.info("Creating feature matrix") 85 | feature_matrix = pd.get_dummies(data[['chromosome', 'position']]) 86 | return feature_matrix 87 | 88 | def create_label_vector(self, data): 89 | logger.info("Creating label vector") 90 | label_vector = data['methylation_level'] 91 | return label_vector 92 | -------------------------------------------------------------------------------- /data/epigenomics/data_visualizer.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from medaxis_core.utils.logging import logger 6 | 7 | class EpigenomicsDataVisualizer: 8 | def __init__(self, data): 9 | self.data = data 10 | 11 | def plot_histogram(self, column, bins=50): 12 | logger.info(f"Plotting histogram for column {column}") 13 | plt.hist(self.data[column], bins=bins) 14 | plt.title(f"Histogram of {column}") 15 | plt.xlabel(column) 16 | plt.ylabel("Frequency") 17 | plt.show() 18 | 19 | def plot_scatterplot(self, x_column, y_column): 20 | logger.info(f"Plotting scatterplot for columns {x_column} and {y_column}") 21 | sns.scatterplot(x=self.data[x_column], y=self.data[y_column]) 22 | plt.title(f"Scatterplot of {x_column} vs {y_column}") 23 | plt.xlabel(x_column) 24 | plt.ylabel(y_column) 25 | plt.show() 26 | 27 | def plot_barplot(self, column): 28 | logger.info(f"Plotting barplot for column {column}") 29 | sns.barplot(x=self.data[column].value_counts().index, y=self.data[column].value_counts()) 30 | plt.title(f"Barplot of {column}") 31 | plt.xlabel(column) 32 | plt.ylabel("Frequency") 33 | plt.show() 34 | 35 | def plot_heatmap(self, columns): 36 | logger.info(f"Plotting heatmap for columns {columns}") 37 | corr_matrix = self.data[columns].corr() 38 | sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", square=True) 39 | plt.title(f"Heatmap of {columns}") 40 | plt.show() 41 | 42 | def plot_boxplot(self, column): 43 | logger.info(f"Plotting boxplot for column {column}") 44 | sns.boxplot(self.data[column]) 45 | plt.title(f"Boxplot of {column}") 46 | plt.xlabel(column) 47 | plt.ylabel("Value") 48 | plt.show() 49 | 50 | def plot_violinplot(self, column): 51 | logger.info(f"Plotting violinplot for column {column}") 52 | sns.violinplot(self.data[column]) 53 | plt.title(f"Violinplot of {column}") 54 | plt.xlabel(column) 55 | plt.ylabel("Value") 56 | plt.show() 57 | 58 | def 
plot_pairplot(self, columns): 59 | logger.info(f"Plotting pairplot for columns {columns}") 60 | sns.pairplot(self.data[columns]) 61 | plt.title(f"Pairplot of {columns}") 62 | plt.show() 63 | 64 | def plot_clustermap(self, columns): 65 | logger.info(f"Plotting clustermap for columns {columns}") 66 | sns.clustermap(self.data[columns].corr(), annot=True, cmap="coolwarm", square=True) 67 | plt.title(f"Clustermap of {columns}") 68 | plt.show() 69 | 70 | def plot_methylation_levels(self, chromosome, position): 71 | logger.info(f"Plotting methylation levels for chromosome {chromosome} and position {position}") 72 | methylation_levels = self.data[(self.data['chromosome'] == chromosome) & (self.data['position'] == position)]['methylation_level'] 73 | sns.boxplot(methylation_levels) 74 | plt.title(f"Methylation levels of chromosome {chromosome} and position {position}") 75 | plt.xlabel("Methylation level") 76 | plt.ylabel("Value") 77 | plt.show() 78 | 79 | def plot_chromosome_methylation(self, chromosome): 80 | logger.info(f"Plotting chromosome methylation for chromosome {chromosome}") 81 | chromosome_methylation = self.data[self.data['chromosome'] == chromosome]['methylation_level'] 82 | sns.boxplot(chromosome_methylation) 83 | plt.title(f"Chromosome methylation of {chromosome}") 84 | plt.xlabel("Methylation level") 85 | plt.ylabel("Value") 86 | plt.show() 87 | 88 | def plot_position_methylation(self, position): 89 | logger.info(f"Plotting position methylation for position {position}") 90 | position_methylation = self.data[self.data['position'] == position]['methylation_level'] 91 | sns.boxplot(position_methylation) 92 | plt.title(f"Position methylation of {position}") 93 | plt.xlabel("Methylation level") 94 | plt.ylabel("Value") 95 | plt.show() 96 | -------------------------------------------------------------------------------- /data/epigenomics/epigenomics_data.csv: -------------------------------------------------------------------------------- 1 | sample_id,chromosome,position,methylation_level 2 | sample1,1,1000,0.5 3 | sample1,1,2000,0.7 4 | sample2,2,3000,0.3 5 | ... 6 | -------------------------------------------------------------------------------- /data/epigenomics/epigenomics_metadata.csv: -------------------------------------------------------------------------------- 1 | sample_id,age,sex,diagnosis 2 | sample1,30,male,healthy 3 | sample1,30,male,healthy 4 | sample2,40,female,diseased 5 | ... 
6 | -------------------------------------------------------------------------------- /data/genomics/data_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | from sklearn.model_selection import train_test_split 5 | from medaxis_core.utils.logging import logger 6 | 7 | class GenomicsDataLoader: 8 | def __init__(self, file_path, metadata_file_path=None, test_size=0.2, random_state=42): 9 | self.file_path = file_path 10 | self.metadata_file_path = metadata_file_path 11 | self.test_size = test_size 12 | self.random_state = random_state 13 | 14 | def load_data(self): 15 | logger.info(f"Loading data from {self.file_path}") 16 | data = pd.read_csv(self.file_path) 17 | return data 18 | 19 | def load_metadata(self): 20 | if self.metadata_file_path: 21 | logger.info(f"Loading metadata from {self.metadata_file_path}") 22 | metadata = pd.read_csv(self.metadata_file_path) 23 | return metadata 24 | else: 25 | logger.warning("No metadata file provided") 26 | return None 27 | 28 | def split_data(self, data, metadata=None): 29 | logger.info("Splitting data into training and testing sets") 30 | X = data.drop(['genotype'], axis=1) 31 | y = data['genotype'] 32 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state) 33 | if metadata: 34 | metadata_train, metadata_test = train_test_split(metadata, test_size=self.test_size, random_state=self.random_state) 35 | return X_train, X_test, y_train, y_test, metadata_train, metadata_test 36 | else: 37 | return X_train, X_test, y_train, y_test 38 | 39 | def shuffle_data(self, data): 40 | logger.info("Shuffling data") 41 | return shuffle(data, random_state=self.random_state) 42 | 43 | def get_data_stats(self, data): 44 | logger.info("Calculating data statistics") 45 | stats = data.describe() 46 | return stats 47 | 48 | def get_data_correlation_matrix(self, data): 49 | logger.info("Calculating data correlation matrix") 50 | corr_matrix = data.corr() 51 | return corr_matrix 52 | -------------------------------------------------------------------------------- /data/genomics/data_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 4 | from sklearn.impute import SimpleImputer 5 | from medaxis_core.utils.logging import logger 6 | 7 | class GenomicsDataProcessor: 8 | def __init__(self, data, scaler=StandardScaler(), imputer=SimpleImputer()): 9 | self.data = data 10 | self.scaler = scaler 11 | self.imputer = imputer 12 | 13 | def preprocess_data(self): 14 | logger.info("Preprocessing data") 15 | self.data = self.handle_missing_values() 16 | self.data = self.scale_data() 17 | return self.data 18 | 19 | def handle_missing_values(self): 20 | logger.info("Handling missing values") 21 | imputed_data = self.imputer.fit_transform(self.data) 22 | return pd.DataFrame(imputed_data, columns=self.data.columns) 23 | 24 | def scale_data(self): 25 | logger.info("Scaling data") 26 | scaled_data = self.scaler.fit_transform(self.data) 27 | return pd.DataFrame(scaled_data, columns=self.data.columns) 28 | 29 | def set_scaler(self, scaler): 30 | self.scaler = scaler 31 | 32 | def set_imputer(self, imputer): 33 | self.imputer = imputer 34 | -------------------------------------------------------------------------------- 
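The genomics loader and processor above can be chained as shown in this brief sketch. It is illustrative only: the module paths are assumptions based on the repository layout, and the processor is restricted to numeric columns because the default `SimpleImputer`/`StandardScaler` pair expects numeric input.

```python
from data.genomics.data_loader import GenomicsDataLoader        # assumed path
from data.genomics.data_processor import GenomicsDataProcessor  # assumed path

# Read the raw sample table and keep only numeric columns for preprocessing.
loader = GenomicsDataLoader("data/genomics/genomics_data.csv")
raw = loader.load_data()
numeric = raw.select_dtypes(include="number")

# Impute missing values and standardize the numeric features.
processor = GenomicsDataProcessor(numeric)
processed = processor.preprocess_data()

# Summary statistics and a correlation matrix of the processed features.
print(loader.get_data_stats(processed))
print(loader.get_data_correlation_matrix(processed))
```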
/data/genomics/data_visualizer.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from medaxis_core.utils.logging import logger 6 | 7 | class GenomicsDataVisualizer: 8 | def __init__(self, data): 9 | self.data = data 10 | 11 | def plot_histogram(self, column, bins=50): 12 | logger.info(f"Plotting histogram for column {column}") 13 | plt.hist(self.data[column], bins=bins) 14 | plt.title(f"Histogram of {column}") 15 | plt.xlabel(column) 16 | plt.ylabel("Frequency") 17 | plt.show() 18 | 19 | def plot_scatterplot(self, x_column, y_column): 20 | logger.info(f"Plotting scatterplot for columns {x_column} and {y_column}") 21 | sns.scatterplot(x=self.data[x_column], y=self.data[y_column]) 22 | plt.title(f"Scatterplot of {x_column} vs {y_column}") 23 | plt.xlabel(x_column) 24 | plt.ylabel(y_column) 25 | plt.show() 26 | 27 | def plot_barplot(self, column): 28 | logger.info(f"Plotting barplot for column {column}") 29 | sns.barplot(x=self.data[column].value_counts().index, y=self.data[column].value_counts()) 30 | plt.title(f"Barplot of {column}") 31 | plt.xlabel(column) 32 | plt.ylabel("Frequency") 33 | plt.show() 34 | 35 | def plot_heatmap(self, columns): 36 | logger.info(f"Plotting heatmap for columns {columns}") 37 | corr_matrix = self.data[columns].corr() 38 | sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", square=True) 39 | plt.title(f"Heatmap of {columns}") 40 | plt.show() 41 | 42 | def plot_boxplot(self, column): 43 | logger.info(f"Plotting boxplot for column {column}") 44 | sns.boxplot(self.data[column]) 45 | plt.title(f"Boxplot of {column}") 46 | plt.xlabel(column) 47 | plt.ylabel("Value") 48 | plt.show() 49 | 50 | def plot_violinplot(self, column): 51 | logger.info(f"Plotting violinplot for column {column}") 52 | sns.violinplot(self.data[column]) 53 | plt.title(f"Violinplot of {column}") 54 | plt.xlabel(column) 55 | plt.ylabel("Value") 56 | plt.show() 57 | 58 | def plot_pairplot(self, columns): 59 | logger.info(f"Plotting pairplot for columns {columns}") 60 | sns.pairplot(self.data[columns]) 61 | plt.title(f"Pairplot of {columns}") 62 | plt.show() 63 | 64 | def plot_clustermap(self, columns): 65 | logger.info(f"Plotting clustermap for columns {columns}") 66 | sns.clustermap(self.data[columns].corr(), annot=True, cmap="coolwarm", square=True) 67 | plt.title(f"Clustermap of {columns}") 68 | plt.show() 69 | -------------------------------------------------------------------------------- /data/genomics/genomics_data.csv: -------------------------------------------------------------------------------- 1 | sample_id,chromosome,position,genotype 2 | sample1,1,1000,AA 3 | sample1,1,2000,AB 4 | sample2,2,3000,BB 5 | ... 6 | -------------------------------------------------------------------------------- /data/genomics/genomics_metadata.csv: -------------------------------------------------------------------------------- 1 | sample_id,age,sex,diagnosis 2 | sample1,30,male,healthy 3 | sample1,30,male,healthy 4 | sample2,40,female,diseased 5 | ... 
6 | -------------------------------------------------------------------------------- /data/proteomics/data_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | from sklearn.model_selection import train_test_split 5 | from medaxis_core.utils.logging import logger 6 | 7 | class ProteomicsDataLoader: 8 | def __init__(self, file_path, metadata_file_path=None, test_size=0.2, random_state=42): 9 | self.file_path = file_path 10 | self.metadata_file_path = metadata_file_path 11 | self.test_size = test_size 12 | self.random_state = random_state 13 | 14 | def load_data(self): 15 | logger.info(f"Loading data from {self.file_path}") 16 | data = pd.read_csv(self.file_path) 17 | return data 18 | 19 | def load_metadata(self): 20 | if self.metadata_file_path: 21 | logger.info(f"Loading metadata from {self.metadata_file_path}") 22 | metadata = pd.read_csv(self.metadata_file_path) 23 | return metadata 24 | else: 25 | logger.warning("No metadata file provided") 26 | return None 27 | 28 | def split_data(self, data, metadata=None): 29 | logger.info("Splitting data into training and testing sets") 30 | X = data.drop(['protein_id'], axis=1) 31 | y = data['protein_id'] 32 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state) 33 | if metadata: 34 | metadata_train, metadata_test = train_test_split(metadata, test_size=self.test_size, random_state=self.random_state) 35 | return X_train, X_test, y_train, y_test, metadata_train, metadata_test 36 | else: 37 | return X_train, X_test, y_train, y_test 38 | 39 | def shuffle_data(self, data): 40 | logger.info("Shuffling data") 41 | return shuffle(data, random_state=self.random_state) 42 | 43 | def get_data_stats(self, data): 44 | logger.info("Calculating data statistics") 45 | stats = data.describe() 46 | return stats 47 | 48 | def get_data_correlation_matrix(self, data): 49 | logger.info("Calculating data correlation matrix") 50 | corr_matrix = data.corr() 51 | return corr_matrix 52 | 53 | def filter_data(self, data, threshold=0.5): 54 | logger.info("Filtering data") 55 | filtered_data = data[data['expression_level'] > threshold] 56 | return filtered_data 57 | 58 | def normalize_data(self, data): 59 | logger.info("Normalizing data") 60 | normalized_data = data / data.max() 61 | return normalized_data 62 | 63 | def encode_data(self, data): 64 | logger.info("Encoding data") 65 | encoded_data = pd.get_dummies(data) 66 | return encoded_data 67 | -------------------------------------------------------------------------------- /data/proteomics/data_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler 4 | from sklearn.impute import SimpleImputer 5 | from medaxis_core.utils.logging import logger 6 | 7 | class ProteomicsDataProcessor: 8 | def __init__(self, data, scaler=StandardScaler(), imputer=SimpleImputer()): 9 | self.data = data 10 | self.scaler = scaler 11 | self.imputer = imputer 12 | 13 | def preprocess_data(self): 14 | logger.info("Preprocessing data") 15 | self.data = self.handle_missing_values() 16 | self.data = self.scale_data() 17 | self.data = self.normalize_data() 18 | self.data = self.encode_data() 19 | return self.data 20 | 21 | def handle_missing_values(self): 22 | logger.info("Handling missing 
values") 23 | imputed_data = self.imputer.fit_transform(self.data) 24 | return pd.DataFrame(imputed_data, columns=self.data.columns) 25 | 26 | def scale_data(self): 27 | logger.info("Scaling data") 28 | scaled_data = self.scaler.fit_transform(self.data) 29 | return pd.DataFrame(scaled_data, columns=self.data.columns) 30 | 31 | def normalize_data(self): 32 | logger.info("Normalizing data") 33 | normalized_data = self.data / self.data.max() 34 | return normalized_data 35 | 36 | def encode_data(self): 37 | logger.info("Encoding data") 38 | encoded_data = pd.get_dummies(self.data) 39 | return encoded_data 40 | 41 | def set_scaler(self, scaler): 42 | self.scaler = scaler 43 | 44 | def set_imputer(self, imputer): 45 | self.imputer = imputer 46 | 47 | def filter_data(self, threshold=0.5): 48 | logger.info("Filtering data") 49 | filtered_data = self.data[self.data['expression_level'] > threshold] 50 | return filtered_data 51 | 52 | def transform_data(self, transformation='log2'): 53 | logger.info("Transforming data") 54 | if transformation == 'log2': 55 | transformed_data = np.log2(self.data) 56 | elif transformation == 'log10': 57 | transformed_data = np.log10(self.data) 58 | else: 59 | logger.warning("Invalid transformation method") 60 | return self.data 61 | return pd.DataFrame(transformed_data, columns=self.data.columns) 62 | 63 | def reduce_dimensions(self, method='pca', n_components=2): 64 | logger.info("Reducing dimensions") 65 | if method == 'pca': 66 | from sklearn.decomposition import PCA 67 | pca = PCA(n_components=n_components) 68 | reduced_data = pca.fit_transform(self.data) 69 | elif method == 'tsne': 70 | from sklearn.manifold import TSNE 71 | tsne = TSNE(n_components=n_components) 72 | reduced_data = tsne.fit_transform(self.data) 73 | else: 74 | logger.warning("Invalid dimensionality reduction method") 75 | return self.data 76 | return pd.DataFrame(reduced_data, columns=[f"Component {i+1}" for i in range(n_components)]) 77 | -------------------------------------------------------------------------------- /data/proteomics/data_visualizer.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | from medaxis_core.utils.logging import logger 6 | 7 | class ProteomicsDataVisualizer: 8 | def __init__(self, data): 9 | self.data = data 10 | 11 | def plot_histogram(self, column, bins=50): 12 | logger.info(f"Plotting histogram for column {column}") 13 | plt.hist(self.data[column], bins=bins) 14 | plt.title(f"Histogram of {column}") 15 | plt.xlabel(column) 16 | plt.ylabel("Frequency") 17 | plt.show() 18 | 19 | def plot_scatterplot(self, x_column, y_column): 20 | logger.info(f"Plotting scatterplot for columns {x_column} and {y_column}") 21 | sns.scatterplot(x=self.data[x_column], y=self.data[y_column]) 22 | plt.title(f"Scatterplot of {x_column} vs {y_column}") 23 | plt.xlabel(x_column) 24 | plt.ylabel(y_column) 25 | plt.show() 26 | 27 | def plot_barplot(self, column): 28 | logger.info(f"Plotting barplot for column {column}") 29 | sns.barplot(x=self.data[column].value_counts().index, y=self.data[column].value_counts()) 30 | plt.title(f"Barplot of {column}") 31 | plt.xlabel(column) 32 | plt.ylabel("Frequency") 33 | plt.show() 34 | 35 | def plot_heatmap(self, columns): 36 | logger.info(f"Plotting heatmap for columns {columns}") 37 | corr_matrix = self.data[columns].corr() 38 | sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", square=True) 39 | 
plt.title(f"Heatmap of {columns}") 40 | plt.show() 41 | 42 | def plot_boxplot(self, column): 43 | logger.info(f"Plotting boxplot for column {column}") 44 | sns.boxplot(self.data[column]) 45 | plt.title(f"Boxplot of {column}") 46 | plt.xlabel(column) 47 | plt.ylabel("Value") 48 | plt.show() 49 | 50 | def plot_violinplot(self, column): 51 | logger.info(f"Plotting violinplot for column {column}") 52 | sns.violinplot(self.data[column]) 53 | plt.title(f"Violinplot of {column}") 54 | plt.xlabel(column) 55 | plt.ylabel("Value") 56 | plt.show() 57 | 58 | def plot_pairplot(self, columns): 59 | logger.info(f"Plotting pairplot for columns {columns}") 60 | sns.pairplot(self.data[columns]) 61 | plt.title(f"Pairplot of {columns}") 62 | plt.show() 63 | 64 | def plot_clustermap(self, columns): 65 | logger.info(f"Plotting clustermap for columns {columns}") 66 | sns.clustermap(self.data[columns].corr(), annot=True, cmap="coolwarm", square=True) 67 | plt.title(f"Clustermap of {columns}") 68 | plt.show() 69 | 70 | def plot_expression_levels(self, protein_id): 71 | logger.info(f"Plotting expression levels for protein {protein_id}") 72 | expression_levels = self.data[self.data['protein_id'] == protein_id]['expression_level'] 73 | sns.boxplot(expression_levels) 74 | plt.title(f"Expression levels of {protein_id}") 75 | plt.xlabel("Expression level") 76 | plt.ylabel("Value") 77 | plt.show() 78 | 79 | def plot_protein_network(self, protein_ids): 80 | logger.info(f"Plotting protein network for proteins {protein_ids}") 81 | import networkx as nx 82 | G = nx.Graph() 83 | for protein_id in protein_ids: 84 | G.add_node(protein_id) 85 | for i in range(len(protein_ids)): 86 | for j in range(i+1, len(protein_ids)): 87 | G.add_edge(protein_ids[i], protein_ids[j]) 88 | nx.draw(G, with_labels=True) 89 | plt.title(f"Protein network of {protein_ids}") 90 | plt.show() 91 | -------------------------------------------------------------------------------- /data/proteomics/proteomics_data.csv: -------------------------------------------------------------------------------- 1 | sample_id,protein_id,expression_level 2 | sample1,protein1,10 3 | sample1,protein2,20 4 | sample2,protein1,30 5 | ... 6 | -------------------------------------------------------------------------------- /data/proteomics/proteomics_metadata.csv: -------------------------------------------------------------------------------- 1 | sample_id,age,sex,diagnosis 2 | sample1,30,male,healthy 3 | sample1,30,male,healthy 4 | sample2,40,female,diseased 5 | ... 
6 | -------------------------------------------------------------------------------- /docs/medaxis.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KOSASIH/medaxis-core/13852b081159d8363e51df3427a2ef727e7f5936/docs/medaxis.jpeg -------------------------------------------------------------------------------- /models/epigenomics/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class EpigenomicsModel: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | 
X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels ) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, 
scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train , method="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /models/genomics/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from 
medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class GenomicsModel: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == 
"mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", 
k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method ="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /models/proteomics/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class ProteomicsModel: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 
28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = 
davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 
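        # run_tuned_pipeline mirrors run_pipeline above but swaps train_model for
        # tune_model, so the classifier evaluated below is the grid-searched
        # best_estimator_ rather than one fitted with fixed default hyperparameters.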
178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method ="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /pipelines/epigenomics/epigenomics_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class EpigenomicsPipeline: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = 
PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == 
"random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: 
{silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /pipelines/genomics/genomics_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class GenomicsPipeline: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = 
gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state) 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy")
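        # 5-fold cross-validated grid search scored on accuracy. Caveat: scikit-learn's
        # default lbfgs solver only supports the "l2" penalty, so the "l1" entries in the
        # logistic-regression grid above will fail unless a solver such as "liblinear"
        # or "saga" is added to the grid.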
149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 165 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 167 | model = self.train_model(X_train_selected, y_train, model_type) 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_reduced, y_train, method="kbest", k=k) 182 | labels = self.cluster_data(X_train_selected, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_selected, labels) 184 | model = self.tune_model(X_train_selected, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /pipelines/proteomics/proteomics_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.svm import SVC 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.preprocessing import StandardScaler 9 | from medaxis_core.utils.logging 
import logger 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | from sklearn.metrics import silhouette_score 14 | from sklearn.metrics import calinski_harabasz_score 15 | from sklearn.metrics import davies_bouldin_score 16 | from sklearn.model_selection import GridSearchCV 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.feature_selection import SelectKBest 19 | from sklearn.feature_selection import f_classif 20 | from sklearn.feature_selection import mutual_info_classif 21 | from sklearn.decomposition import NMF 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.random_projection import GaussianRandomProjection 24 | from sklearn.random_projection import SparseRandomProjection 25 | 26 | class ProteomicsPipeline: 27 | def __init__(self, data, target, test_size=0.2, random_state=42): 28 | self.data = data 29 | self.target = target 30 | self.test_size = test_size 31 | self.random_state = random_state 32 | 33 | def split_data(self): 34 | logger.info("Splitting data into training and testing sets") 35 | X_train, X_test, y_train, y_test = train_test_split(self.data, self.target, test_size=self.test_size, random_state=self.random_state) 36 | return X_train, X_test, y_train, y_test 37 | 38 | def scale_data(self, X_train, X_test): 39 | logger.info("Scaling data using StandardScaler") 40 | scaler = StandardScaler() 41 | X_train_scaled = scaler.fit_transform(X_train) 42 | X_test_scaled = scaler.transform(X_test) 43 | return X_train_scaled, X_test_scaled 44 | 45 | def reduce_dimensions(self, X_train_scaled, X_test_scaled, method="pca", n_components=2): 46 | logger.info(f"Reducing dimensions using {method}") 47 | if method == "pca": 48 | pca = PCA(n_components=n_components) 49 | X_train_reduced = pca.fit_transform(X_train_scaled) 50 | X_test_reduced = pca.transform(X_test_scaled) 51 | elif method == "tsne": 52 | tsne = TSNE(n_components=n_components) 53 | X_train_reduced = tsne.fit_transform(X_train_scaled) 54 | X_test_reduced = tsne.transform(X_test_scaled) 55 | elif method == "nmf": 56 | nmf = NMF(n_components=n_components) 57 | X_train_reduced = nmf.fit_transform(X_train_scaled) 58 | X_test_reduced = nmf.transform(X_test_scaled) 59 | elif method == "truncated_svd": 60 | truncated_svd = TruncatedSVD(n_components=n_components) 61 | X_train_reduced = truncated_svd.fit_transform(X_train_scaled) 62 | X_test_reduced = truncated_svd.transform(X_test_scaled) 63 | elif method == "gaussian_random_projection": 64 | gaussian_random_projection = GaussianRandomProjection(n_components=n_components) 65 | X_train_reduced = gaussian_random_projection.fit_transform(X_train_scaled) 66 | X_test_reduced = gaussian_random_projection.transform(X_test_scaled) 67 | elif method == "sparse_random_projection": 68 | sparse_random_projection = SparseRandomProjection(n_components=n_components) 69 | X_train_reduced = sparse_random_projection.fit_transform(X_train_scaled) 70 | X_test_reduced = sparse_random_projection.transform(X_test_scaled) 71 | else: 72 | logger.warning("Invalid dimensionality reduction method") 73 | return X_train_scaled, X_test_scaled 74 | return X_train_reduced, X_test_reduced 75 | 76 | def select_features(self, X_train_scaled, y_train, method="kbest", k=10): 77 | logger.info(f"Selecting features using {method}") 78 | if method == "kbest": 79 | selector = SelectKBest(f_classif, k=k) 80 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 81 | elif method == "mutual_info": 82 | selector = 
SelectKBest(mutual_info_classif, k=k) 83 | X_train_selected = selector.fit_transform(X_train_scaled, y_train) 84 | else: 85 | logger.warning("Invalid feature selection method") 86 | return X_train_scaled 87 | return X_train_selected  # note: only the transformed training matrix is returned; the fitted selector is not kept, so it cannot be reapplied to the test set 88 | 89 | def cluster_data(self, X_train_reduced, method="kmeans", n_clusters=2): 90 | logger.info(f"Clustering data using {method}") 91 | if method == "kmeans": 92 | kmeans = KMeans(n_clusters=n_clusters) 93 | kmeans.fit(X_train_reduced) 94 | labels = kmeans.labels_ 95 | else: 96 | logger.warning("Invalid clustering method") 97 | return None 98 | return labels 99 | 100 | def evaluate_clustering(self, X_train_reduced, labels): 101 | logger.info("Evaluating clustering performance") 102 | silhouette = silhouette_score(X_train_reduced, labels) 103 | calinski_harabasz = calinski_harabasz_score(X_train_reduced, labels) 104 | davies_bouldin = davies_bouldin_score(X_train_reduced, labels) 105 | return silhouette, calinski_harabasz, davies_bouldin 106 | 107 | def train_model(self, X_train_scaled, y_train, model_type="random_forest"): 108 | logger.info(f"Training {model_type} model") 109 | if model_type == "random_forest": 110 | model = RandomForestClassifier(n_estimators=100, random_state=self.random_state) 111 | elif model_type == "svm": 112 | model = SVC(kernel="rbf", C=1, random_state=self.random_state) 113 | elif model_type == "logistic_regression": 114 | model = LogisticRegression(max_iter=1000, random_state=self.random_state) 115 | else: 116 | logger.warning("Invalid model type") 117 | return None 118 | model.fit(X_train_scaled, y_train) 119 | return model 120 | 121 | def tune_model(self, X_train_scaled, y_train, model_type="random_forest"): 122 | logger.info(f"Tuning {model_type} model") 123 | if model_type == "random_forest": 124 | param_grid = { 125 | "n_estimators": [10, 50, 100, 200], 126 | "max_depth": [None, 5, 10, 15], 127 | "min_samples_split": [2, 5, 10], 128 | "min_samples_leaf": [1, 5, 10] 129 | } 130 | model = RandomForestClassifier(random_state=self.random_state) 131 | elif model_type == "svm": 132 | param_grid = { 133 | "C": [0.1, 1, 10], 134 | "kernel": ["linear", "rbf", "poly"], 135 | "gamma": ["scale", "auto"] 136 | } 137 | model = SVC(random_state=self.random_state) 138 | elif model_type == "logistic_regression": 139 | param_grid = { 140 | "C": [0.1, 1, 10], 141 | "penalty": ["l1", "l2"], 142 | "max_iter": [100, 500, 1000] 143 | } 144 | model = LogisticRegression(random_state=self.random_state, solver="liblinear")  # liblinear supports both the l1 and l2 penalties in the grid; the default lbfgs solver would fail on l1 145 | else: 146 | logger.warning("Invalid model type") 147 | return None 148 | grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy") 149 | grid_search.fit(X_train_scaled, y_train) 150 | return grid_search.best_estimator_ 151 | 152 | def evaluate_model(self, model, X_test_scaled, y_test): 153 | logger.info("Evaluating model performance") 154 | y_pred = model.predict(X_test_scaled) 155 | accuracy = accuracy_score(y_test, y_pred) 156 | report = classification_report(y_test, y_pred) 157 | matrix = confusion_matrix(y_test, y_pred) 158 | return accuracy, report, matrix 159 | 160 | def run_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 161 | X_train, X_test, y_train, y_test = self.split_data() 162 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 163 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 164 | X_train_selected = self.select_features(X_train_scaled, y_train, method="kbest", k=k)  # select from the full scaled matrix so that k features actually exist 165 | labels = 
self.cluster_data(X_train_reduced, method="kmeans", n_clusters=n_clusters) 166 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_reduced, labels) 167 | model = self.train_model(X_train_scaled, y_train, model_type)  # train and evaluate in the same feature space 168 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 169 | logger.info(f"Model accuracy: {accuracy:.3f}") 170 | logger.info(f"Classification report:\n{report}") 171 | logger.info(f"Confusion matrix:\n{matrix}") 172 | logger.info(f"Silhouette score: {silhouette:.3f}") 173 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 174 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 175 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 176 | 177 | def run_tuned_pipeline(self, model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10): 178 | X_train, X_test, y_train, y_test = self.split_data() 179 | X_train_scaled, X_test_scaled = self.scale_data(X_train, X_test) 180 | X_train_reduced, X_test_reduced = self.reduce_dimensions(X_train_scaled, X_test_scaled, method, n_components) 181 | X_train_selected = self.select_features(X_train_scaled, y_train, method="kbest", k=k) 182 | labels = self.cluster_data(X_train_reduced, method="kmeans", n_clusters=n_clusters) 183 | silhouette, calinski_harabasz, davies_bouldin = self.evaluate_clustering(X_train_reduced, labels) 184 | model = self.tune_model(X_train_scaled, y_train, model_type) 185 | accuracy, report, matrix = self.evaluate_model(model, X_test_scaled, y_test) 186 | logger.info(f"Model accuracy: {accuracy:.3f}") 187 | logger.info(f"Classification report:\n{report}") 188 | logger.info(f"Confusion matrix:\n{matrix}") 189 | logger.info(f"Silhouette score: {silhouette:.3f}") 190 | logger.info(f"Calinski-Harabasz score: {calinski_harabasz:.3f}") 191 | logger.info(f"Davies-Bouldin score: {davies_bouldin:.3f}") 192 | return accuracy, report, matrix, silhouette, calinski_harabasz, davies_bouldin 193 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | numpy==1.20.0 3 | pandas==1.3.5 4 | scikit-learn==1.0.2 5 | scipy==1.7.3 6 | matplotlib==3.5.1 7 | seaborn==0.11.2 8 | plotly==5.5.0 9 | statsmodels==0.13.2 10 | joblib==1.1.0 11 | threadpoolctl==2.2.0 12 | 13 | # Data science dependencies 14 | jupyter==1.0.0 15 | jupyterlab==3.3.2 16 | notebook==6.4.6 17 | ipykernel==6.9.0 18 | ipython==8.5.0 19 | nbformat==5.4.0 20 | nbconvert==6.4.4 21 | 22 | 23 | # Machine learning dependencies 24 | tensorflow==2.8.0 25 | keras==2.8.0 26 | torch==1.11.0 27 | torchvision==0.12.0 28 | torchtext==0.12.0 29 | torchaudio==0.11.0 30 | 31 | # Visualization dependencies 32 | bokeh==2.4.2 33 | plotly==5.5.0 34 | matplotlib==3.5.1 35 | seaborn==0.11.2 36 | graphviz==0.19.1 37 | 38 | # Utilities dependencies 39 | python-dateutil==2.8.2 40 | pytz==2022.1 41 | six==1.16.0 42 | setuptools==58.1.0 43 | wheel==0.37.1 44 | pip==22.0.4 45 | virtualenv==20.13.0 46 | 47 | # Logging dependencies 48 | loguru==0.6.0 49 | # logging is part of the Python standard library and needs no pin 50 | 51 | # Testing dependencies 52 | pytest==6.2.5 53 | pytest-cov==4.0.0 54 | pytest-xdist==2.5.0 55 | pytest-timeout==2.1.0 56 | pytest-faulthandler==3.0.0 57 | 58 | # Documentation dependencies 59 | sphinx==4.4.0 60 | sphinx-autodoc-typehints==1.18.1 61 | sphinx-rtd-theme==1.0.0 62 | sphinxcontrib-apidoc==0.3.0 63 | 
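For orientation, the sketch below shows how the ProteomicsPipeline class above might be driven end to end. It is a hypothetical usage example, not code from the repository: the CSV path comes from the data/proteomics folder in the tree, but the label column name ("target") is an assumption, since the dataset's schema is not shown here.

    # Hypothetical usage sketch; the "target" column name is an assumption.
    import pandas as pd
    from pipelines.proteomics.proteomics_pipeline import ProteomicsPipeline

    df = pd.read_csv("data/proteomics/proteomics_data.csv")
    X = df.drop(columns=["target"])   # feature matrix
    y = df["target"]                  # class labels

    pipeline = ProteomicsPipeline(data=X, target=y, test_size=0.2, random_state=42)
    accuracy, report, matrix, silhouette, ch, db = pipeline.run_pipeline(
        model_type="random_forest", method="pca", n_components=2, n_clusters=2, k=10
    )
    print(f"Accuracy: {accuracy:.3f} | silhouette: {silhouette:.3f}")

run_tuned_pipeline accepts the same arguments and swaps the plain fit for a GridSearchCV search over the model's hyperparameter grid.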
-------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import StandardScaler 7 | from sklearn.decomposition import PCA 8 | from sklearn.manifold import TSNE 9 | from sklearn.cluster import KMeans 10 | from sklearn.metrics import silhouette_score 11 | from sklearn.metrics import calinski_harabasz_score 12 | from sklearn.metrics import davies_bouldin_score 13 | 14 | # Define a function to load data 15 | def load_data(file_path): 16 | try: 17 | data = pd.read_csv(file_path) 18 | return data 19 | except Exception as e: 20 | print(f"Error loading data: {e}") 21 | 22 | # Define a function to preprocess data 23 | def preprocess_data(data): 24 | try: 25 | # Drop missing values 26 | data.dropna(inplace=True) 27 | 28 | # Scale data 29 | scaler = StandardScaler() 30 | data[['feature1', 'feature2', 'feature3']] = scaler.fit_transform(data[['feature1', 'feature2', 'feature3']]) 31 | 32 | return data 33 | except Exception as e: 34 | print(f"Error preprocessing data: {e}") 35 | 36 | # Define a function to split data into training and testing sets 37 | def split_data(data, test_size=0.2, random_state=42): 38 | try: 39 | X_train, X_test, y_train, y_test = train_test_split(data.drop('target', axis=1), data['target'], test_size=test_size, random_state=random_state) 40 | return X_train, X_test, y_train, y_test 41 | except Exception as e: 42 | print(f"Error splitting data: {e}") 43 | 44 | # Define a function to reduce dimensionality using PCA 45 | def reduce_dimensionality_pca(data, n_components=2): 46 | try: 47 | pca = PCA(n_components=n_components) 48 | data_reduced = pca.fit_transform(data) 49 | return data_reduced 50 | except Exception as e: 51 | print(f"Error reducing dimensionality using PCA: {e}") 52 | 53 | # Define a function to reduce dimensionality using t-SNE 54 | def reduce_dimensionality_tsne(data, n_components=2): 55 | try: 56 | tsne = TSNE(n_components=n_components) 57 | data_reduced = tsne.fit_transform(data) 58 | return data_reduced 59 | except Exception as e: 60 | print(f"Error reducing dimensionality using t-SNE: {e}") 61 | 62 | # Define a function to cluster data using K-Means 63 | def cluster_data(data, n_clusters=2): 64 | try: 65 | kmeans = KMeans(n_clusters=n_clusters) 66 | kmeans.fit(data) 67 | labels = kmeans.labels_ 68 | return labels 69 | except Exception as e: 70 | print(f"Error clustering data: {e}") 71 | 72 | # Define a function to evaluate clustering performance 73 | def evaluate_clustering(data, labels): 74 | try: 75 | silhouette = silhouette_score(data, labels) 76 | calinski_harabasz = calinski_harabasz_score(data, labels) 77 | davies_bouldin = davies_bouldin_score(data, labels) 78 | return silhouette, calinski_harabasz, davies_bouldin 79 | except Exception as e: 80 | print(f"Error evaluating clustering performance: {e}") 81 | 82 | # Define a function to visualize data 83 | def visualize_data(data, labels): 84 | try: 85 | plt.figure(figsize=(10, 8)) 86 | sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=labels, palette='viridis') 87 | plt.title('Data Visualization') 88 | 
plt.show() 89 | except Exception as e: 90 | print(f"Error visualizing data: {e}") 91 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import os 4 | import sys 5 | 6 | # Create a logger 7 | logger = logging.getLogger(__name__) 8 | 9 | # Set the logging level 10 | logger.setLevel(logging.DEBUG) 11 | 12 | # Create a file handler 13 | file_handler = logging.FileHandler('log.log') 14 | file_handler.setLevel(logging.DEBUG) 15 | 16 | # Create a console handler 17 | console_handler = logging.StreamHandler(sys.stdout) 18 | console_handler.setLevel(logging.INFO) 19 | 20 | # Create a formatter 21 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 22 | 23 | # Add the formatter to the handlers 24 | file_handler.setFormatter(formatter) 25 | console_handler.setFormatter(formatter) 26 | 27 | # Add the handlers to the logger 28 | logger.addHandler(file_handler) 29 | logger.addHandler(console_handler) 30 | 31 | # Define a function to log messages 32 | def log_message(message, level=logging.INFO): 33 | logger.log(level, message) 34 | 35 | # Define a function to log errors 36 | def log_error(message): 37 | logger.error(message) 38 | 39 | # Define a function to log warnings 40 | def log_warning(message): 41 | logger.warning(message) 42 | 43 | # Define a function to log debug messages 44 | def log_debug(message): 45 | logger.debug(message) 46 | 47 | # Define a function to log info messages 48 | def log_info(message): 49 | logger.info(message) 50 | 51 | # Define a function to log critical messages 52 | def log_critical(message): 53 | logger.critical(message) 54 | -------------------------------------------------------------------------------- /utils/math_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | from scipy.stats import norm 4 | from scipy.optimize import minimize 5 | 6 | # Define a function to calculate the mean of a list of numbers 7 | def calculate_mean(numbers): 8 | try: 9 | mean = np.mean(numbers) 10 | return mean 11 | except Exception as e: 12 | print(f"Error calculating mean: {e}") 13 | 14 | # Define a function to calculate the standard deviation of a list of numbers 15 | def calculate_std_dev(numbers): 16 | try: 17 | std_dev = np.std(numbers) 18 | return std_dev 19 | except Exception as e: 20 | print(f"Error calculating standard deviation: {e}") 21 | 22 | # Define a function to calculate the variance of a list of numbers 23 | def calculate_variance(numbers): 24 | try: 25 | variance = np.var(numbers) 26 | return variance 27 | except Exception as e: 28 | print(f"Error calculating variance: {e}") 29 | 30 | # Define a function to calculate the median of a list of numbers 31 | def calculate_median(numbers): 32 | try: 33 | median = np.median(numbers) 34 | return median 35 | except Exception as e: 36 | print(f"Error calculating median: {e}") 37 | 38 | # Define a function to calculate the mode of a list of numbers 39 | def calculate_mode(numbers): 40 | try: 41 | mode = np.bincount(numbers).argmax() 42 | return mode 43 | except Exception as e: 44 | print(f"Error calculating mode: {e}") 45 | 46 | # Define a function to calculate the range of a list of numbers 47 | def calculate_range(numbers): 48 | try: 49 | range_ = np.ptp(numbers) 50 | return range_ 51 | except Exception as e: 52 | print(f"Error calculating 
range: {e}") 53 | 54 | # Define a function to calculate the interquartile range (IQR) of a list of numbers 55 | def calculate_iqr(numbers): 56 | try: 57 | q75, q25 = np.percentile(numbers, [75, 25]) 58 | iqr = q75 - q25 59 | return iqr 60 | except Exception as e: 61 | print(f"Error calculating IQR: {e}") 62 | 63 | # Define a function to calculate the skewness of a list of numbers 64 | def calculate_skewness(numbers): 65 | try: 66 | skewness = np.mean((numbers - np.mean(numbers)) ** 3) / np.std(numbers) ** 3 67 | return skewness 68 | except Exception as e: 69 | print(f"Error calculating skewness: {e}") 70 | 71 | # Define a function to calculate the kurtosis of a list of numbers 72 | def calculate_kurtosis(numbers): 73 | try: 74 | kurtosis = np.mean((numbers - np.mean(numbers)) ** 4) / np.std(numbers) ** 4 - 3 75 | return kurtosis 76 | except Exception as e: 77 | print(f"Error calculating kurtosis: {e}") 78 | 79 | # Define a function to calculate the correlation coefficient between two lists of numbers 80 | def calculate_correlation(numbers1, numbers2): 81 | try: 82 | correlation = np.corrcoef(numbers1, numbers2)[0, 1] 83 | return correlation 84 | except Exception as e: 85 | print(f"Error calculating correlation: {e}") 86 | 87 | # Define a function to calculate the regression line between two lists of numbers 88 | def calculate_regression_line(numbers1, numbers2): 89 | try: 90 | slope, intercept = np.polyfit(numbers1, numbers2, 1) 91 | return slope, intercept 92 | except Exception as e: 93 | print(f"Error calculating regression line: {e}") 94 | 95 | # Define a function to calculate the probability density function (PDF) of a normal distribution 96 | def calculate_pdf(x, mean, std_dev): 97 | try: 98 | pdf = norm.pdf(x, mean, std_dev) 99 | return pdf 100 | except Exception as e: 101 | print(f"Error calculating PDF: {e}") 102 | 103 | # Define a function to calculate the cumulative distribution function (CDF) of a normal distribution 104 | def calculate_cdf(x, mean, std_dev): 105 | try: 106 | cdf = norm.cdf(x, mean, std_dev) 107 | return cdf 108 | except Exception as e: 109 | print(f"Error calculating CDF: {e}") 110 | 111 | # Define a function to calculate the inverse CDF of a normal distribution 112 | def calculate_inverse_cdf(p, mean, std_dev): 113 | try: 114 | inverse_cdf = norm.ppf(p, mean, std_dev) 115 | return inverse_cdf 116 | except Exception as e: 117 | print(f"Error calculating inverse CDF: {e}") 118 | 119 | # Define a function to minimize a function using the minimize function from scipy 120 | def minimize_function(func, x0, method='BFGS'): 121 | try: 122 | res = minimize(func, x0, method=method) 123 | return res.x 124 | except Exception as e: 125 | print(f"Error minimizing function: {e}") 126 | --------------------------------------------------------------------------------