├── data
│   └── .gitkeep
├── logs
│   └── .gitkeep
├── masks
│   └── .gitkeep
├── results
│   └── .gitkeep
├── trained_models
│   └── .gitkeep
├── pyproject.toml
├── entrypoint.sh
├── .flake8
├── brainage
│   ├── performance_metric.py
│   ├── __init__.py
│   ├── zscore.py
│   ├── xgboost_adapted.py
│   ├── read_data.py
│   ├── define_models.py
│   ├── create_splits.py
│   └── calculate_features.py
├── tests
│   └── test_read_data_mask_resampled.py
├── codes
│   ├── run_in_venv.sh
│   ├── cross_site_ixi_camcan_enki.submit
│   ├── cross_site_4sites.submit
│   ├── calculate_features_parcelwise.py
│   ├── calculate_features_voxelwise.py
│   ├── cross_site_combine_features.py
│   ├── calculate_features.submit
│   ├── within_site_bias_correction.py
│   ├── cross_site_bias_correction_using_CN.py
│   ├── within_site_combine_predictions.py
│   ├── predict_age.py
│   ├── cross_site_bias_correction.py
│   ├── predict_age_sing.py
│   ├── cross_site_combine_predictions.py
│   ├── cross_site_read_results.py
│   ├── within_site_ixi.submit
│   ├── calculate_features2.submit
│   ├── cross_site_train.py
│   ├── within_site_read_results.py
│   ├── within_site_train.py
│   └── cat_standalone_batch-HiFi1mm.m
├── requirements.txt
├── setup.py
├── BA_predict.recipe
├── .gitignore
└── README.md
/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/masks/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/trained_models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.pytest.ini_options]
6 | addopts = "--cov=brainage"
7 | testpaths = [
8 | "tests",
9 | ]
--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd codes
3 | python3 predict_age_sing.py --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4 --output_prefix $5 --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm $6 --resample_size $7 --model_file $8
4 | 
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = __init__.py,*externals*,constants.py,fixes.py,resources.py,nilearn_cache,venv,docs/auto_examples,docs/_build/,.eggs/
3 | ignore = W503,W504,I100,I101,I201,N806,E201,E202,E221,E222,E241,F541,E999,E402
4 | # We add A for the array-spacing plugin, and ignore the E ones it covers above
5 | select = A,E,F,W,C
--------------------------------------------------------------------------------
/brainage/performance_metric.py:
--------------------------------------------------------------------------------
1 | from
sklearn.metrics import mean_absolute_error, mean_squared_error 2 | import numpy as np 3 | 4 | def performance_metric(y_true, y_pred): 5 | mae = round(mean_absolute_error(y_true, y_pred), 3) 6 | mse = round(mean_squared_error(y_true, y_pred), 3) 7 | corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3) 8 | return mae, mse, corr 9 | -------------------------------------------------------------------------------- /brainage/__init__.py: -------------------------------------------------------------------------------- 1 | from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features 2 | from .create_splits import stratified_splits 3 | from .xgboost_adapted import XGBoostAdapted 4 | from .zscore import ZScoreSubwise, ZScore 5 | from .create_splits import repeated_stratified_splits 6 | from .read_data import read_data_cross_site 7 | from .read_data import read_data 8 | from .define_models import define_models 9 | from sklearn.linear_model import LinearRegression 10 | from .performance_metric import performance_metric 11 | 12 | -------------------------------------------------------------------------------- /tests/test_read_data_mask_resampled.py: -------------------------------------------------------------------------------- 1 | from brainage import binarize_3d 2 | from nibabel import Nifti1Image 3 | import numpy as np 4 | 5 | def _make_image(): 6 | return Nifti1Image( 7 | np.random.default_rng(seed=5).integers(low=0, high=5, size=(5, 5, 2)), 8 | np.eye(4), 9 | ) 10 | 11 | def test_binarize_3d(): 12 | img = _make_image() 13 | bin_img = binarize_3d(img, threshold=2) 14 | 15 | assert np.min(bin_img.get_fdata()) == 0 16 | assert np.max(bin_img.get_fdata()) == 1 17 | np.testing.assert_array_equal(np.unique(bin_img.get_fdata()),np.array([0, 1])) 18 | -------------------------------------------------------------------------------- /codes/run_in_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source ~/.venvs/py3smore/bin/activate 3 | #OMP_NUM_THREADS=5 python3 $@ 4 | #deactivate 5 | 6 | if [ $# -lt 2 ]; then 7 | echo "This script is meant to run a command within a python environment" 8 | echo "It needs at least 2 parameters." 9 | echo "The first one must be the environment name." 
10 | echo "The rest will be the command" 11 | exit 1 12 | fi 13 | 14 | env_name=$1 15 | echo "Activating ${env_name}" 16 | source ~/.venvs/${env_name}/bin/activate 17 | shift 1 18 | echo "Running ${@} in virtual environment" 19 | 20 | export MKL_NUM_THREADS=1 21 | export OPENBLAS_NUM_THREADS=1 22 | export NUMEXPR_NUM_THREADS=1 23 | export OMP_NUM_THREADS=1 24 | 25 | $@ 26 | 27 | 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.0.5 2 | backcall==0.2.0 3 | certifi==2021.10.8 4 | charset-normalizer==2.0.12 5 | commonmark==0.9.1 6 | convertdate==2.4.0 7 | cycler==0.11.0 8 | decorator==5.1.1 9 | executing==0.8.3 10 | fonttools==4.31.2 11 | numpy==1.22.3 12 | idna==3.3 13 | ipython==8.2.0 14 | jedi==0.18.1 15 | joblib==1.1.0 16 | julearn==0.2.5 17 | kiwisolver==1.4.2 18 | lunardate==0.2.0 19 | lxml==4.8.0 20 | matplotlib==3.5.1 21 | matplotlib-inline==0.1.3 22 | natsort==8.1.0 23 | nibabel==3.2.2 24 | nilearn==0.9.1 25 | packaging==21.3 26 | pandas==1.4.2 27 | parso==0.8.3 28 | pexpect==4.8.0 29 | pickleshare==0.7.5 30 | Pillow==9.1.0 31 | prompt-toolkit==3.0.29 32 | ptyprocess==0.7.0 33 | pure-eval==0.2.2 34 | Pygments==2.11.2 35 | pyluach==1.4.1 36 | PyMeeus==0.5.11 37 | pyparsing==3.0.7 38 | python-dateutil==2.8.2 39 | pytz==2022.1 40 | requests==2.27.1 41 | rich==12.2.0 42 | scikit-learn==1.0.2 43 | scipy==1.8.0 44 | seaborn==0.11.2 45 | six==1.16.0 46 | stack-data==0.2.0 47 | threadpoolctl==3.1.0 48 | traitlets==5.1.1 49 | urllib3==1.26.9 50 | wcwidth==0.2.5 51 | workalendar==16.3.0 52 | xgboost==1.6.0 53 | 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # requirements = [] 4 | # with open("requirements.txt", "r") as f: 5 | # for line in f: 6 | # requirements.append(line) 7 | 8 | setup( 9 | name="brainage", 10 | version="0.1.0", 11 | description="Brainage prediction project", 12 | url="https://github.com/juaml/brainage_estimation", 13 | author="Applied Machine Learning FZJ", 14 | packages=find_packages(), 15 | # install_requires=requirements, 16 | classifiers=[ 17 | "Development Status :: 1 - Planning", 18 | "Intended Audience :: Science/Research", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: POSIX :: Linux", 21 | "Programming Language :: Python :: 2", 22 | "Programming Language :: Python :: 2.7", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.4", 25 | "Programming Language :: Python :: 3.5", 26 | ], 27 | python_requires=">=3.6", 28 | include_package_data=True, 29 | package_data={"": ["data/*"]}, 30 | ) 31 | 32 | 33 | -------------------------------------------------------------------------------- /brainage/zscore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | from sklearn.utils import check_array 4 | from scipy.stats import zscore 5 | 6 | 7 | class ZScore(BaseEstimator, TransformerMixin): 8 | 9 | def __init__(self, axis=0): 10 | self.axis = axis 11 | 12 | def fit(self, X, y=None): 13 | X = check_array(X) 14 | self.mean_ = np.mean(X, axis=self.axis) 15 | self.std_ = np.std(X, axis=self.axis) 16 | return self 17 | 18 | def transform(self, X): 19 | X = check_array(X) 20 
| mean = ( 21 | self.mean_.reshape(-1, 1) 22 | if self.axis 23 | else self.mean_ 24 | ) 25 | 26 | std = ( 27 | self.std_.reshape(-1, 1) 28 | if self.axis 29 | else self.std_ 30 | ) 31 | # print(f"{X.shape = }") 32 | # print(f"{mean.shape = }") 33 | # print(f"{std.shape = }") 34 | 35 | return (X - mean) / std 36 | 37 | 38 | class ZScoreSubwise(BaseEstimator, TransformerMixin): 39 | 40 | def __init__(self, axis=0): 41 | self.axis = axis 42 | 43 | def fit(self, X, y=None): 44 | return self 45 | 46 | def transform(self, X): 47 | X = check_array(X) 48 | return zscore(X, axis=self.axis) 49 | 50 | -------------------------------------------------------------------------------- /BA_predict.recipe: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: continuumio/miniconda3:latest 3 | 4 | %files 5 | # copy brainage module and related files 6 | brainage/ /opt/src/brainage/ 7 | setup.py /opt/src/ 8 | pyproject.toml /opt/src/ 9 | requirements.txt /opt/src/ 10 | entrypoint.sh /opt/scripts/ 11 | 12 | %post 13 | 14 | export PATH=/opt/conda/bin:$PATH 15 | 16 | chmod 777 /tmp/ 17 | chmod 777 /opt/scripts/entrypoint.sh 18 | 19 | apt-get update --allow-releaseinfo-change 20 | apt-get install -y --fix-missing libgomp1 wget dpkg 21 | 22 | 23 | NOW=`date` 24 | 25 | # Initialize conda 26 | conda --version 27 | 28 | conda create --name BA_env -c conda-forge python=3.9.1 numpy==1.22.3 matplotlib==3.5.1 nibabel==3.2.2 nilearn==0.9.1 pandas==1.4.2 scipy==1.8.0 seaborn==0.11.2 xgboost==1.6.1 scikit-learn==1.0.2 glmnet 29 | . /opt/conda/etc/profile.d/conda.sh 30 | conda activate BA_env 31 | pip install "julearn==0.2.5" 32 | pip install git+https://github.com/JamesRitchie/scikit-rvm.git@master 33 | cd /opt/src && pip install -e . 34 | 35 | 36 | %runscript 37 | echo "Container was created $NOW" 38 | echo "Arguments received: $*" 39 | 40 | # Activate environment 41 | conda init 42 | . /opt/conda/etc/profile.d/conda.sh 43 | conda activate BA_env 44 | 45 | # Running entrypoint.sh 46 | /opt/scripts/entrypoint.sh "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" 47 | echo "Computation finished!" 
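# Illustrative usage (assumed image name, placeholder paths): entrypoint.sh forwards the
# eight positional arguments to predict_age_sing.py as
#   --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4
#   --output_prefix $5 --smooth_fwhm $6 --resample_size $7 --model_file $8
# so a hypothetical run of the built container could look like:
#   singularity run BA_predict.sif ../data/ADNI /data ../data/ADNI/ADNI_paths_cat12.8.csv \
#       ../results/ADNI ADNI 4 4 ../trained_models/4sites.S4_R4_pca.gauss.models
# (the .sif name and all paths are placeholders; bind-mount or adjust them for your setup)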
48 | -------------------------------------------------------------------------------- /brainage/xgboost_adapted.py: -------------------------------------------------------------------------------- 1 | from xgboost import XGBRegressor 2 | from sklearn.base import BaseEstimator 3 | from sklearn.model_selection import train_test_split 4 | import numpy as np 5 | 6 | class XGBoostAdapted(BaseEstimator): 7 | 8 | def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0): 9 | self.early_stopping_rounds = early_stopping_rounds 10 | self.eval_metric = eval_metric 11 | self.eval_set_percent = eval_set_percent 12 | self.random_seed = random_seed 13 | self.n_jobs = n_jobs 14 | self.max_depth = max_depth 15 | self.n_estimators = n_estimators 16 | self.nthread = nthread 17 | self.reg_alpha = reg_alpha 18 | 19 | 20 | def fit(self, X, y): 21 | self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha) 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed) 24 | 25 | eval_set = [(X_test, y_test)] 26 | 27 | self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set) 28 | 29 | return self 30 | 31 | def score(self, X, y, sample_weight=None): 32 | return self._xgbregressor.score(X.values, y.values, sample_weight) 33 | 34 | def predict(self, X): 35 | return self._xgbregressor.predict(X.values) 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /codes/cross_site_ixi_camcan_enki.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 10 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
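# Note on the executable: run_in_venv.sh (codes/run_in_venv.sh) activates the Python
# virtual environment named by its first argument and runs the remaining arguments as a
# command with the thread counts pinned to 1; the "arguments" line below therefore starts
# with the venv name (test_package_env) followed by the actual python3 call.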
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | #Logs 16 | log = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).log 17 | output = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).out 18 | error = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).err 19 | 20 | 21 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 22 | # --pca_status: 0 or 1 23 | 24 | 25 | data_name = ixi_camcan_enki 26 | subject_filepaths_csv = ixi_camcan_enki.subject_list_cat12.8.csv 27 | 28 | arguments = test_package_env python3 cross_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 29 | 30 | 31 | ########## S4_R8 32 | feature_name = $(data_name).S4_R8 33 | result_prefix = $(data_name).S4_R8 34 | pca = 0 35 | 36 | #model = ridge 37 | #queue 38 | 39 | #model = rf 40 | #queue 41 | 42 | model = rvr_poly 43 | queue 44 | 45 | 46 | ########## 173 47 | feature_name = $(data_name).173 48 | result_prefix = $(data_name).173 49 | pca = 0 50 | 51 | #model = gauss 52 | #queue 53 | 54 | #model = rf 55 | #queue 56 | 57 | model = lasso 58 | queue 59 | 60 | model = ridge 61 | queue 62 | 63 | #model = rvr_lin 64 | #queue 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /codes/cross_site_4sites.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 10 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | #Logs 16 | log = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).log 17 | output = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).out 18 | error = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).err 19 | 20 | 21 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 22 | # --pca_status: 0 or 1 23 | 24 | 25 | data_name = ixi_camcan_enki_1000brains 26 | subject_filepaths_csv = ixi_camcan_enki_1000brains.subject_list_cat12.8.csv 27 | 28 | arguments = test_package_env python3 cross_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 29 | 30 | 31 | ########## S4_R4_pca 32 | feature_name = $(data_name).S4_R4 33 | result_prefix = $(data_name).S4_R4_pca 34 | pca = 1 35 | 36 | #model = ridge 37 | #queue 38 | 39 | #model = rf 40 | #queue 41 | 42 | #model = rvr_lin 43 | #queue 44 | 45 | #model = kernel_ridge 46 | #queue 47 | 48 | #model = gauss 49 | #queue 50 | 51 | #model = lasso 52 | #queue 53 | 54 | #model = elasticnet 55 | #queue 56 | 57 | #model = rvr_poly 58 | #queue 59 | 60 | #model = xgb 61 | #queue 62 | 63 | 64 | ########## 173 65 | feature_name = $(data_name).173 66 | result_prefix = $(data_name).173 67 | pca = 0 68 | 69 | model = rvr_lin 70 | queue 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /codes/calculate_features_parcelwise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import argparse 4 | from pathlib import Path 5 | from brainage import calculate_parcelwise_features 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 11 | # parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 12 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 13 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 14 | parser.add_argument("--mask_file", type=str, help="path to mask nii file") 15 | parser.add_argument("--num_parcels", type=str, help="Number of parcels") 16 | 17 | # python3 calculate_features_parcelwise.py --features_path ../data/ixi/ --subject_filepaths ../data/ixi/ixi_paths_cat12.8.csv --output_prefix ixi --mask_file ../masks/BSF_173.nii --num_parcels 173 18 | 19 | # example inputs 20 | # features_path = Path('../data/ixi/') 21 | # subject_filepaths = '../data/ixi_paths_cat12.8.csv' 22 | # output_prefix = 'ixi' 23 | # mask_file = '../masks/BSF_173.nii' 24 | # num_parcels = 173 25 | 26 | args = parser.parse_args() 27 | features_path = Path(args.features_path) 28 | subject_filepaths = args.subject_filepaths 29 | output_prefix = args.output_prefix 30 | mask_file = args.mask_file 31 | num_parcels = args.num_parcels 32 | 33 | print('Subjects filepaths: ', subject_filepaths) 34 | print('Directory to features path: ', features_path) 35 | print('Results filename prefix: ', output_prefix) 36 | print('GM 
mask used: ', mask_file) 37 | print('Number of parcels:', num_parcels, '/n') 38 | 39 | data_parcels = calculate_parcelwise_features(subject_filepaths, mask_file, num_parcels) 40 | 41 | features_path.mkdir(exist_ok=True, parents=True) 42 | 43 | full_filename = str(output_prefix) + '.' + str(num_parcels) 44 | filename = os.path.join(features_path, full_filename) 45 | print('filename for features created: ', filename) 46 | pickle.dump(data_parcels, open(filename, "wb"), protocol=4) 47 | data_parcels.to_csv(filename + '.csv', index=False) -------------------------------------------------------------------------------- /brainage/read_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pandas as pd 3 | 4 | def read_data_cross_site(data_file, train_status, confounds): 5 | 6 | data_df = pickle.load(open(data_file, 'rb')) 7 | X = [col for col in data_df if col.startswith('f_')] 8 | y = 'age' 9 | data_df['age'] = data_df['age'].round().astype(int) # round off age and convert to integer 10 | data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True) 11 | duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject) 12 | data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True) # remove duplicated subjects 13 | 14 | if confounds is not None: # convert sites in numbers to perform confound removal 15 | if train_status == 'train': 16 | site_name = data_df['site'].unique() 17 | if type(site_name[0]) == str: 18 | site_dict = {k: idx for idx, k in enumerate(site_name)} 19 | data_df['site'] = data_df['site'].replace(site_dict) 20 | 21 | elif train_status == 'test': # add site to features & convert site in a number to predict with model trained with confound removal 22 | X.append(confounds) 23 | site_name = data_df['site'].unique()[0,] 24 | if type(site_name) == str: 25 | data_df['site'] = 10 26 | return data_df, X, y 27 | 28 | 29 | 30 | def read_data(features_file, demographics_file): 31 | data_df = pickle.load(open(features_file, 'rb')) # read the data 32 | demo = pd.read_csv(demographics_file) # read demographics file 33 | data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them 34 | 35 | print('Data columns:', data_df.columns) 36 | print('Data Index:', data_df.index) 37 | 38 | X = [col for col in data_df if col.startswith('f_')] 39 | y = 'age' 40 | data_df['age'] = data_df['age'].round().astype(int) # round off age and convert to integer 41 | data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True) 42 | data_df.sort_values(by='age', inplace=True, ignore_index=True) # sort by age 43 | duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject) 44 | data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True) # remove duplicated subjects 45 | return data_df, X, y 46 | 47 | -------------------------------------------------------------------------------- /codes/calculate_features_voxelwise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | from pathlib import Path 5 | from brainage import calculate_voxelwise_features 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 10 | # 
parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 11 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 12 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 13 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 14 | default='../masks/brainmask_12.8.nii') 15 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 16 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 17 | 18 | # python3 calculate_features_voxelwise.py --features_path ../data/ixi/ --subject_filepaths ../data/ixi/ixi_paths_cat12.8.csv --output_prefix ixi --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 19 | 20 | # example inputs 21 | # features_path = Path('../data/ixi/') 22 | # subject_filepaths = '../data/ixi_paths_cat12.8.csv' 23 | # output_prefix = 'ixi' 24 | # mask_file = '../masks/brainmask_12.8.nii' 25 | # smooth_fwhm = 4 26 | # resample_size = 8 27 | 28 | args = parser.parse_args() 29 | features_path = Path(args.features_path) 30 | subject_filepaths = args.subject_filepaths 31 | output_prefix = args.output_prefix 32 | mask_file = args.mask_file 33 | smooth_fwhm = args.smooth_fwhm 34 | resample_size = args.resample_size 35 | 36 | print('Subjects filepaths: ', subject_filepaths) 37 | print('Directory to features path: ', features_path) 38 | print('Results filename prefix: ', output_prefix) 39 | print('GM mask used: ', mask_file) 40 | print('smooth_fwhm:', smooth_fwhm) 41 | print('resample_size:', resample_size, '/n') 42 | 43 | data_resampled = calculate_voxelwise_features(subject_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 44 | 45 | features_path.mkdir(exist_ok=True, parents=True) 46 | 47 | full_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 48 | filename = os.path.join(features_path, full_filename) 49 | print('filename for features created: ', filename) 50 | pickle.dump(data_resampled, open(filename, "wb"), protocol=4) 51 | data_resampled.to_csv(filename + '.csv', index=False) 52 | -------------------------------------------------------------------------------- /codes/cross_site_combine_features.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pandas as pd 3 | import os.path 4 | 5 | if __name__ == '__main__': 6 | 7 | results_folder = '../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.' 8 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 9 | 10 | results_folder = '../data/ixi_camcan_enki/ixi_camcan_enki.' 11 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/enki/enki.'] 12 | 13 | results_folder = '../data/ixi_camcan_1000brains/ixi_camcan_1000brains.' 14 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/1000brains/1000brains.'] 15 | 16 | results_folder = '../data/camcan_enki_1000brains/camcan_enki_1000brains_' 17 | data_list = ['../data/camcan/camcan.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 18 | 19 | results_folder = '../data/ixi_enki_1000brains/ixi_enki_1000brains.' 
20 | data_list = ['../data/ixi/ixi.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 21 | 22 | feature_list = ['173', '473', '873', '1273', 'S0_R4', 'S4_R4', 'S8_R4', 'S0_R8', 'S4_R8', 'S8_R8'] 23 | 24 | combined_data_df = pd.DataFrame() 25 | combined_demo_df = pd.DataFrame() 26 | 27 | for feature_item in feature_list: 28 | print(feature_item) 29 | 30 | combined_data_df = pd.DataFrame() 31 | combined_demo_df = pd.DataFrame() 32 | 33 | for data_item in data_list: 34 | datafile_name = data_item + feature_item + '.csv' 35 | demofile_name = data_item + 'subject_list_cat12.8.csv' 36 | print(datafile_name, demofile_name) 37 | 38 | if os.path.exists(datafile_name): 39 | 40 | data_df, demo_df = pd.read_csv(datafile_name), pd.read_csv(demofile_name) 41 | print(data_df.shape, demo_df.shape) 42 | 43 | if 'session' not in demo_df.columns: 44 | demo_df['session'] = 'ses-1' 45 | 46 | combined_data_df = pd.concat([combined_data_df, data_df]) 47 | combined_demo_df = pd.concat([combined_demo_df, demo_df]) 48 | else: 49 | break 50 | 51 | combined_data_df = combined_data_df.reset_index(drop=True) 52 | combined_demo_df = combined_demo_df.reset_index(drop=True) 53 | 54 | print(combined_data_df.shape, combined_demo_df.shape) 55 | 56 | # demographic_file = results_folder + 'subject_list_cat12.8.csv' 57 | # features_file = results_folder + feature_item 58 | # print(demographic_file, features_file) 59 | # 60 | # combined_demo_df.to_csv(demographic_file, index=False) 61 | # combined_data_df.to_csv(features_file + '.csv', index=False) 62 | # pickle.dump(combined_data_df, open(features_file, 'wb'), protocol=4) 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /brainage/define_models.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from skrvm import RVR 3 | from glmnet import ElasticNet 4 | import sklearn.gaussian_process as gp 5 | from sklearn.kernel_ridge import KernelRidge 6 | from sklearn.decomposition import PCA 7 | from brainage import XGBoostAdapted 8 | from sklearn.feature_selection import VarianceThreshold 9 | 10 | def define_models(): 11 | # Define all models and model parameters 12 | rvr_linear = RVR() 13 | rvr_poly = RVR() 14 | kernel_ridge = KernelRidge() 15 | lasso = ElasticNet(alpha=1, standardize=False) 16 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 17 | ridge = ElasticNet(alpha=0, standardize=False) 18 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 19 | pca = PCA(n_components=None) # max as many components as sample size 20 | 21 | 22 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 23 | model_para_list = [ 24 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 25 | 26 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 27 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 28 | 'rf__random_state': rand_seed}, 29 | 30 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 31 | 'rvr__random_state': rand_seed}, 32 | 33 | {'variancethreshold__threshold': var_threshold, 34 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 35 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5}, 36 | 37 | {'variancethreshold__threshold': var_threshold, 38 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 
'gauss__n_restarts_optimizer': 100, 39 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 40 | 41 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 42 | 43 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 44 | 45 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 46 | 'rvr__random_state': rand_seed}, 47 | 48 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 49 | 'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100, 50 | 'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2], 51 | 'xgboostadapted__random_seed': rand_seed, 'cv': 5}] # 'search_params':{'n_jobs': 5}] 52 | 53 | return model_list, model_para_list 54 | -------------------------------------------------------------------------------- /codes/calculate_features.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | # Execution 10 | initial_dir = . 11 | executable = $(initial_dir)/run_in_venv.sh 12 | 13 | # Job 14 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 15 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 16 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 17 | 18 | log = $(initial_dir)/../logs/$(data_name).$(Process).log 19 | output = $(initial_dir)/../logs/$(data_name).$(Process).out 20 | error = $(initial_dir)/../logs/$(data_name).$(Process).err 21 | 22 | # 1000brains (change data_name and subject_filepaths_csv to run for different dataset) 23 | data_name = 1000brains 24 | subject_filepaths_csv = 1000brains.paths_cat12.8.csv 25 | 26 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 27 | queue 28 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 29 | queue 30 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 31 | queue 32 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 33 | queue 34 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 35 | queue 36 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 
--resample_size 8 37 | queue 38 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 39 | queue 40 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 41 | queue 42 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 43 | queue 44 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 45 | queue 46 | 47 | -------------------------------------------------------------------------------- /codes/within_site_bias_correction.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os.path 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import StratifiedKFold 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--input_predictions_file", type=str, help="Path to predictions csv") 12 | parser.add_argument("--BC_predictions_file", type=str, help="Path to bias corrected predictions") 13 | 14 | # python3 within_site_bias_correction.py \ 15 | # --input_predictions_file ../results/ixi/ixi.all_models_pred.csv \ 16 | # --BC_predictions_file ../results/ixi/ixi.all_models_pred_BC.csv 17 | 18 | # read arguments 19 | args = parser.parse_args() 20 | input_predictions_file = args.input_predictions_file 21 | BC_predictions_file = args.BC_predictions_file 22 | 23 | # Initialize 24 | input_df, output_df = pd.DataFrame(), pd.DataFrame() 25 | column_list, column_name_original = [], [] 26 | 27 | if os.path.exists(input_predictions_file): # if predictions exists 28 | input_df = pd.read_csv(input_predictions_file) # read predictions from all workflows 29 | print(input_df.columns) 30 | print(input_df.index) 31 | 32 | if 'session' in input_df.columns: 33 | column_list = input_df.columns[5:] # remove ['site', 'subject', 'age', 'gender''] 34 | output_df = input_df[['site', 'subject', 'age', 'gender', 'session']] 35 | else: 36 | column_list = input_df.columns[4:] # remove ['site', 'subject', 'age', 'gender''] 37 | output_df = input_df[['site', 'subject', 'age', 'gender']] 38 | 39 | # Fixed parameters from model training random seed and CV 40 | rand_seed = 200 41 | num_splits = 5 # how many train and test splits 42 | num_bins = math.floor(len(input_df)/num_splits) # num of bins to be created = num of labels created 43 | qc = pd.cut(input_df.index.tolist(), num_bins) # create bins for age 44 | cv_5fold = StratifiedKFold(n_splits=num_splits, shuffle=False, random_state=None) 45 | 46 | for column in column_list: # for each workflow, X = true age, y= predicted age 47 | results_pred = pd.DataFrame() 48 | X = ['age'] 49 | y = column 50 | print(f'worflow name: {column}') 51 | 52 | for train_idx, test_idx in 
cv_5fold.split(input_df, qc.codes): 53 | # print('test_idx', test_idx) 54 | train_df, test_df = input_df.loc[train_idx,:], input_df.loc[test_idx,:] # get test and train dataframes 55 | print('train size:', train_df.shape, 'test size:', test_df.shape) 56 | # print(test_df) 57 | 58 | train_x = train_df.loc[:, X] # true age 59 | train_y = train_df.loc[:, y] # predicted age 60 | 61 | model = LinearRegression().fit(train_x, train_y) # x = age, y = predicted age 62 | print(model.intercept_, model.coef_) 63 | corrected_pred = (test_df[y] - model.intercept_) / model.coef_ # corrected predictions 64 | 65 | if results_pred.empty: 66 | results_pred = corrected_pred 67 | else: 68 | results_pred = pd.concat([results_pred, corrected_pred], axis=0) 69 | 70 | results_pred.sort_index(axis=0, level=None, ascending=True, inplace=True) 71 | output_df = pd.concat([output_df, results_pred], axis=1) 72 | 73 | output_df.rename(columns=dict(zip(column_list, column_name_original)), inplace=True) 74 | 75 | print('ALL DONE') 76 | print(f'Corrected predictions: \n {output_df}') 77 | output_df.to_csv(BC_predictions_file, index=False) 78 | else: 79 | print(f'{input_predictions_file} not found') 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /brainage/create_splits.py: -------------------------------------------------------------------------------- 1 | #!/home/smore/.venvs/py3smore/bin/python3 2 | import math 3 | import pandas as pd 4 | from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold 5 | 6 | 7 | # def create_splits(data_df, repeats): 8 | # num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created 9 | # print('num_bins', num_bins, len(data_df)/repeats) 10 | # 11 | # qc = pd.cut(data_df.index, num_bins) 12 | # df = pd.DataFrame({'bin': qc.codes}) 13 | # 14 | # max_num = max(df['bin'].value_counts()) 15 | # print(df['bin'].value_counts()) 16 | # print(max_num, 'max_num') 17 | # 18 | # test_idx = {} 19 | # for rpt_num in range(0, repeats): 20 | # key = 'repeat_' + str(rpt_num) 21 | # test_idx[key] = [] 22 | # 23 | # if repeats == max_num: 24 | # for num in range(0, max_num): 25 | # for bin_idx in df['bin'].unique(): 26 | # test = df[df['bin'] == bin_idx] 27 | # if num < len(test): 28 | # key = 'repeat_' + str(num) 29 | # test_idx[key].append(test.index[num]) 30 | # return test_idx 31 | 32 | 33 | def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): 34 | """ 35 | :param bins_on: variable used to create bins 36 | :param num_bins: num of bins/classes to create 37 | :param data: data to create cv splits on 38 | :param num_splits: number of cv splits to create 39 | :param shuffle: shuffle the data or not 40 | :param random_state: random seed to use if shuffle=True 41 | :return: a dictionary with index 42 | """ 43 | qc = pd.cut(bins_on.tolist(), num_bins) # divides data in bins 44 | cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state) 45 | test_idx = {} 46 | rpt_num = 0 47 | for train_index, test_index in cv.split(data, qc.codes): 48 | key = 'repeat_' + str(rpt_num) 49 | test_idx[key] = test_index 50 | rpt_num = rpt_num + 1 51 | return test_idx 52 | 53 | 54 | def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state): 55 | """ 56 | :param bins_on: variable used to create bins 57 | :param data: data to create cv splits on 58 | :param num_splits: number of cv splits to create 59 | :param shuffle: 
shuffle the data or not 60 | :param random_state: random seed to use if shuffle=True 61 | :return: a dictionary with index 62 | """ 63 | cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state) 64 | test_idx = {} 65 | rpt_num = 0 66 | for train_index, test_index in cv.split(data, bins_on): 67 | key = 'repeat_' + str(rpt_num) 68 | test_idx[key] = test_index 69 | rpt_num = rpt_num + 1 70 | return test_idx 71 | 72 | 73 | # def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation() 74 | # """ 75 | # :param bins_on: variable used to create bins 76 | # :param num_bins: num of bins/classes to create 77 | # :param data: data to create cv splits on 78 | # :param num_splits: number of cv splits to create 79 | # :param shuffle: shuffle the data or not 80 | # :param random_state: random seed to use if shuffle=True 81 | # :return: cv iterator 82 | # """ 83 | # qc = pd.cut(bins_on.tolist(), num_bins) 84 | # cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes) 85 | # return cv 86 | 87 | 88 | def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state): 89 | qc = pd.cut(bins_on.tolist(), num_bins) 90 | cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) 91 | test_idx = {} 92 | rpt_num = 0 93 | for train_index, test_index in cv.split(data, qc.codes): 94 | key = 'repeat_' + str(rpt_num) 95 | test_idx[key] = test_index 96 | rpt_num = rpt_num + 1 97 | return test_idx 98 | 99 | -------------------------------------------------------------------------------- /codes/cross_site_bias_correction_using_CN.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import LinearRegression 5 | 6 | 7 | def bias_correction(train_data, test_data, x, y): 8 | 9 | # bias correction using cole's method: (Using HC from the test sample) 10 | # a, b = np.polyfit(train_data[x], train_data[y], 1) 11 | # print(a, b) 12 | # corrected_predictions = (test_data[y] - b) / a 13 | # print(corrected_predictions) 14 | 15 | # bias correction using cole's method: (Using HC from the test sample) 16 | train_x = train_data[x].to_numpy().reshape(-1, 1) # x = age 17 | train_y = train_data[y].to_numpy().reshape(-1, 1) # y = predictions 18 | 19 | lin_reg = LinearRegression().fit(train_x, train_y) 20 | print(lin_reg.intercept_, lin_reg.coef_) 21 | 22 | corrected_predictions = (test_data[y] - lin_reg.intercept_[0]) / lin_reg.coef_[0][0] 23 | 24 | return corrected_predictions 25 | 26 | 27 | if __name__ == '__main__': 28 | # Read arguments from submit file 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") # age and group is mandatory 31 | parser.add_argument("--predictions_file", type=str, help="Predictions file path") 32 | parser.add_argument("--predictions_column_name", type=str, help="Predictions", default='S4_R4_pca+gauss') 33 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name", default='.BC') # eg: 'ADNI' 34 | 35 | # read arguments 36 | args = parser.parse_args() 37 | demographics_file = args.demographics_file 38 | predictions_file = args.predictions_file 39 | predictions_column_name = args.predictions_column_name 40 | output_prefix = args.output_prefix 
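# Illustrative numbers for the correction implemented in bias_correction() above:
# if the regression of predicted age on true age in the CN subjects gave, say,
# intercept = 20 and slope = 0.6, a raw prediction of 68 years would be corrected
# to (68 - 20) / 0.6 = 80 years; the same (prediction - intercept) / slope
# transform is then applied to every row of the test sample.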
41 | 42 | # example 43 | # python3 cross_site_bias_correction_using_CN.py \ 44 | # --demographics_file ../data/ADNI/ADNI.subject_list_cat12.8.csv \ 45 | # --predictions_file ../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv \ 46 | # --predictions_column_name S4_R4_pca+gauss \ 47 | # --output_prefix _BC 48 | 49 | # creating output filename same as imput predictions file name but adding output_prefix 50 | predictions_file_name_BC = predictions_file.replace('.csv', output_prefix + '.csv') 51 | 52 | demographics = pd.read_csv(demographics_file) 53 | predictions = pd.read_csv(predictions_file) 54 | 55 | # check if predictions contains predictions_column_name column as given by the user 56 | assert predictions_column_name in predictions.columns, f"{predictions_column_name} column not found in {predictions_file}" 57 | 58 | # check if demographics contains 'age' column and 'Research Group' column (which should have 'CN' as a category) 59 | assert "Research Group" in demographics.columns, f"'Research Group' column not found in {demographics_file}" 60 | assert "age" in demographics.columns, f"'age' column not found in {demographics_file}" 61 | assert 'CN' in demographics['Research Group'].unique(), f"'CN' group is not found in 'Research Group' column in {demographics_file}" 62 | 63 | # check if the demographics and predictions are of same length 64 | assert len(demographics) == len(predictions), "Mimatch between length of demographics and predictions" 65 | combined_df = pd.concat([demographics, predictions], axis=1) 66 | 67 | train_data = combined_df[combined_df["Research Group"] == "CN"] # train only on Healthy subjects 68 | test_data = combined_df # apply on whole sample 69 | x = 'age' 70 | y = predictions_column_name 71 | 72 | corrected_predictions = bias_correction(train_data=train_data, test_data=combined_df, x=x, y=y) 73 | corrected_predictions = corrected_predictions.to_frame() 74 | corrected_predictions = corrected_predictions.rename(columns={predictions_column_name: predictions_column_name + output_prefix}) # adding prefix to the column name 75 | 76 | corrected_predictions.to_csv(predictions_file_name_BC, index=False) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,linux,python,visualstudiocode 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,linux,python,visualstudiocode 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### Python ### 50 | # 
Byte-compiled / optimized / DLL files 51 | __pycache__/ 52 | *.py[cod] 53 | *$py.class 54 | 55 | # C extensions 56 | *.so 57 | 58 | # Distribution / packaging 59 | .Python 60 | build/ 61 | develop-eggs/ 62 | dist/ 63 | downloads/ 64 | eggs/ 65 | .eggs/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | pip-wheel-metadata/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | *.py,cover 98 | .hypothesis/ 99 | .pytest_cache/ 100 | pytestdebug.log 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | db.sqlite3 110 | db.sqlite3-journal 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | doc/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # Jupyter Notebook 127 | .ipynb_checkpoints 128 | 129 | # IPython 130 | profile_default/ 131 | ipython_config.py 132 | 133 | # pyenv 134 | .python-version 135 | 136 | # pipenv 137 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 138 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 139 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 140 | # install all needed dependencies. 141 | #Pipfile.lock 142 | 143 | # poetry 144 | #poetry.lock 145 | 146 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 147 | __pypackages__/ 148 | 149 | # Celery stuff 150 | celerybeat-schedule 151 | celerybeat.pid 152 | 153 | # SageMath parsed files 154 | *.sage.py 155 | 156 | # Environments 157 | # .env 158 | .env/ 159 | .venv/ 160 | env/ 161 | venv/ 162 | ENV/ 163 | env.bak/ 164 | venv.bak/ 165 | pythonenv* 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # mkdocs documentation 175 | /site 176 | 177 | # mypy 178 | .mypy_cache/ 179 | .dmypy.json 180 | dmypy.json 181 | 182 | # Pyre type checker 183 | .pyre/ 184 | 185 | # pytype static type analyzer 186 | .pytype/ 187 | 188 | # operating system-related files 189 | # file properties cache/storage on macOS 190 | *.DS_Store 191 | # thumbnail cache on Windows 192 | Thumbs.db 193 | 194 | # profiling data 195 | .prof 196 | 197 | 198 | ### VisualStudioCode ### 199 | .vscode/* 200 | *.code-workspace 201 | 202 | ### VisualStudioCode Patch ### 203 | # Ignore all local history of files 204 | .history 205 | .ionide 206 | 207 | # End of https://www.toptal.com/developers/gitignore/api/macos,linux,python,visualstudiocode -------------------------------------------------------------------------------- /codes/within_site_combine_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import os.path 4 | import numpy as np 5 | import pandas as pd 6 | from brainage import read_data 7 | 8 | def check_predictions(data_df, test_idx, model, test_pred): 9 | 10 | all_idx = np.array(range(0, len(data_df))) 11 | train_idx = np.delete(all_idx, test_idx) 12 | train_df = data_df.loc[train_idx, :] 13 | test_df = data_df.loc[test_idx, :] 14 | 15 | if type(model) == list: 16 | train_pred = model[0].predict(train_df[X]).ravel() 17 | else: 18 | train_pred = model.predict(train_df[X]).ravel() 19 | print(train_pred.shape, train_df[y].shape) 20 | 21 | test_pred_model = model.predict(test_df[X]).ravel() 22 | assert(np.round(test_pred) == np.round(test_pred_model)).all() # check if test pred saved == test predictions using model 23 | 24 | # print('Prediction from CV models', test_pred) 25 | # print('Prediction saved during training',test_pred_model) 26 | 27 | print('Predictions match') 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 33 | parser.add_argument("--features_path", type=str, help="Features file path") 34 | parser.add_argument("--model_path", type=str, help="Path to directory where within site models of particular datasets are saved") 35 | parser.add_argument("--output_prefix", type=str, help="Output prefix for predictions filename", default='all_models_pred') 36 | 37 | # Parse the arguments 38 | args = parser.parse_args() 39 | demographics_file = args.demographics_file 40 | features_path = args.features_path 41 | model_path = args.model_path 42 | output_prefix = args.output_prefix 43 | 44 | # python3 within_site_combine_predictions.py --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv --features_path ../data/ixi/ixi. --model_path ../results/ixi/ixi. --output_prefix all_models_pred 45 | 46 | # demographics_file = '../data/ixi/ixi.subject_list_cat12.8.csv' 47 | # features_path = '../data/ixi/ixi.' 48 | # model_path = '../results/ixi/ixi.' 
49 | # output_prefix = 'all_models_pred' 50 | 51 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] #, 'xgb'] 52 | data_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4', 'S4_R4', 'S4_R4', 'S8_R4', 'S8_R4', 53 | 'S0_R8', 'S0_R8', 'S4_R8', 'S4_R8', 'S8_R8', 'S8_R8'] 54 | filenm_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 55 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 56 | 57 | df_pred_all = pd.DataFrame() 58 | df_pred = pd.DataFrame() 59 | df = pd.DataFrame() 60 | 61 | for idx, filenm_item in enumerate(filenm_list): # for each feature space 62 | for model_item in model_names: 63 | features_file = features_path + data_list[idx] # get features file 64 | result_file = model_path + filenm_item + '.' + model_item + '.results' # get results 65 | model_file = model_path + filenm_item + '.' + model_item + '.models' # get models 66 | 67 | if os.path.isfile(model_file): # if model exists 68 | print('\n') 69 | print('data file: ', features_file) 70 | print('demographic file: ', demographics_file) 71 | print('model used:', model_file, '\n') 72 | print('results file: ', result_file) 73 | 74 | # Read the results file 75 | res = pickle.load(open(result_file,'rb')) # load the saved results 76 | res_model = pickle.load(open(model_file, 'rb')) # load the saved results 77 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 78 | 79 | df = pd.DataFrame() 80 | df_pred = pd.DataFrame() 81 | 82 | for key1, value1 in res.items(): 83 | df = pd.DataFrame() 84 | for key2, value2 in value1.items(): 85 | print(key1, key2) 86 | test_idx = value2['test_idx'] # get the saved test indices for each fold and pick up demo 87 | print(value2['test_idx'].shape) 88 | df['site'] = data_df.iloc[test_idx]['site'] 89 | df['subject'] = data_df.iloc[test_idx]['subject'] 90 | df['age'] = data_df.iloc[test_idx]['age'] # should be same as value2['true'] 91 | df['gender'] = data_df.iloc[test_idx]['gender'] 92 | 93 | if 'session' in data_df.columns: 94 | df['session'] = data_df.iloc[test_idx]['session'] 95 | 96 | model = res_model[key1][key2] # get CV model for each fold 97 | test_pred = value2['predictions'] # get the saved predictions for each fold 98 | 99 | check_predictions(data_df, test_idx, model, test_pred) # get predictions using model, check if equal to saved 100 | 101 | df[filenm_item + ' + ' + key2] = value2['predictions'] # predictions 102 | 103 | df_pred = pd.concat([df_pred, df], axis=0) # concat over all CV 104 | df_pred.sort_index(axis=0, level=None, ascending=True, inplace=True) 105 | 106 | if len(df_pred_all) == 0: # concat over all workflows 107 | df_pred_all = df_pred 108 | else: 109 | df_pred_all = df_pred_all.merge(df_pred, on=list(set(data_df.columns.tolist()) - set(X)), how="left") 110 | 111 | print('\n', 'predictions dataframe:', '\n', df_pred_all) 112 | save_path = model_path + output_prefix + '.csv' 113 | print('output path:', save_path) 114 | df_pred_all.to_csv(save_path, index=False) 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /codes/predict_age.py: -------------------------------------------------------------------------------- 1 | #from read_data_mask_resampled import * 2 | from brainage import calculate_voxelwise_features 3 | from pathlib import Path 4 | import pandas as pd 5 | import argparse 6 | import pickle 7 | import os 8 | import re 9 | 10 | 11 | def 
model_pred(test_df, model_file, feature_space_str): 12 | """This functions predicts age 13 | Args: 14 | test_df (dataframe): test data 15 | model_file (pickle file): trained model file 16 | feature_space_str (string): feature space name 17 | 18 | Returns: 19 | dataframe: predictions from the model 20 | """ 21 | 22 | model = pickle.load(open(model_file, 'rb')) # load model 23 | pred = pd.DataFrame() 24 | for key, model_value in model.items(): 25 | X = data_df.columns.tolist() 26 | pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X]) # preprocessed data 27 | y_pred = model_value.predict(test_df).ravel() 28 | print(y_pred.shape) 29 | pred[feature_space_str + '+' + key] = y_pred 30 | return pred 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 36 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 37 | parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 38 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 39 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 40 | default='../masks/brainmask_12.8.nii') 41 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 42 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 43 | parser.add_argument("--model_file", type=str, help="Trained model to be used to predict", 44 | default='../trained_models/4sites.S4_R4_pca.gauss.models') 45 | # For testing 46 | # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI_paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 47 | 48 | args = parser.parse_args() 49 | features_path = Path(args.features_path) 50 | subject_filepaths = args.subject_filepaths 51 | output_path = Path(args.output_path) 52 | output_prefix = args.output_prefix 53 | smooth_fwhm = args.smooth_fwhm 54 | resample_size = args.resample_size 55 | mask_file = args.mask_file 56 | model_file = args.model_file 57 | 58 | print('\nBrain-age trained model used: ', model_file) 59 | print('Subjects filepaths (test data): ', subject_filepaths) 60 | print('Directory to features path: ', features_path) 61 | print('Results directory: ', output_path) 62 | print('Results filename prefix: ', output_prefix) 63 | print('GM mask used: ', mask_file) 64 | 65 | # get feature space name from the model file entered and 66 | # create feature space name using the input values (smoothing, resampling) 67 | # match them: they should be same 68 | 69 | # get feature space name from the model file entered in argument 70 | pipeline_name1 = model_file.split('/')[-1] 71 | feature_space = pipeline_name1.split('.')[1] 72 | model_name = pipeline_name1.split('.')[2] 73 | pipeline_name = feature_space + '.' 
+ model_name 74 | 75 | # create feature space name using the input values (smoothing, resampling) 76 | pca_string = re.findall(r"pca", feature_space) 77 | if len(pca_string) == 1: 78 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca' 79 | else: 80 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) 81 | 82 | # match them: they should be same 83 | assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})" 84 | 85 | print('Feature space: ', feature_space) 86 | print('Model name: ', model_name) 87 | 88 | # Create directories, create features if they don't exists 89 | output_path.mkdir(exist_ok=True, parents=True) 90 | features_path.mkdir(exist_ok=True, parents=True) 91 | features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 92 | features_fullfile = os.path.join(features_path, features_filename) 93 | print('\nfilename for features created: ', features_fullfile) 94 | 95 | if os.path.isfile(features_fullfile): # check if features file exists 96 | print('\n----File exists') 97 | data_df = pickle.load(open(features_fullfile, 'rb')) 98 | print('Features loaded') 99 | else: 100 | print('\n-----Extracting features') 101 | # create features 102 | data_df = calculate_voxelwise_features(subject_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 103 | # save features 104 | pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4) 105 | data_df.to_csv(features_fullfile + '.csv', index=False) 106 | print('Feature extraction done and saved') 107 | 108 | # get predictions and save 109 | try: 110 | predictions_df = model_pred(data_df, model_file, feature_space_str) 111 | # save predictions 112 | predictions_filename = str(output_prefix) + '.' 
+ pipeline_name + '.prediction.csv' 113 | predictions_fullfile = os.path.join(output_path, predictions_filename) 114 | print('\nfilename for predictions created: ', predictions_fullfile) 115 | predictions_df.to_csv(predictions_fullfile, index=False) 116 | print(predictions_df) 117 | 118 | except FileNotFoundError: 119 | print(f'{model_file} is not present') 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /codes/cross_site_bias_correction.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LinearRegression 6 | from brainage import read_data, performance_metric 7 | from sklearn.model_selection import RepeatedStratifiedKFold 8 | 9 | 10 | if __name__ == '__main__': 11 | # Read arguments from submit file 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 14 | parser.add_argument("--features_file", type=str, help="Features file path") 15 | parser.add_argument("--model_file", type=str, help="Path to saved model ", default='../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss') # path to scores-CV models file 16 | 17 | # read arguments 18 | args = parser.parse_args() 19 | demographics_file = args.demographics_file 20 | features_file = args.features_file 21 | model_file = args.model_file 22 | model_name = model_file.split('.')[-1] 23 | 24 | # python3 cross_site_bias_correction.py \ 25 | # --demographics_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.subject_list_cat12.8.csv \ 26 | # --features_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.S4_R4 \ 27 | # --model_file ../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss 28 | 29 | scores_path = model_file + '.scores' # contains CV models 30 | cv_prediction_savepath = model_file + '.predictions.csv' # save CV predictions 31 | bias_params_savepath = model_file + '.bias_params' # save BC parameters 32 | 33 | print('\nfeatures used:', features_file) 34 | print('\model_file:', model_file) 35 | print('\nscores_path:', scores_path) 36 | print('\ncv_prediction_savepath:', cv_prediction_savepath) 37 | print('\nbias_params_savepath:', bias_params_savepath) 38 | print('\nmodel used:', model_name) 39 | 40 | # Load the data which was used for training 41 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 42 | 43 | # Fixed variables, set random seed, create classes for age 44 | rand_seed, n_splits, n_repeats = 200, 5, 5 # fixed during training models 45 | qc = pd.cut(data_df['age'].tolist(), bins=5, precision=1) # create bins for train data only 46 | print('age_bins', qc.categories, 'age_codes', qc.codes) 47 | data_df['bins'] = qc.codes # add bin/classes as a column in train df 48 | 49 | # Load scores which contains CV models 50 | scores = pickle.load(open(scores_path, 'rb')) 51 | 52 | # get the exact train and test splits of CV as used during training 53 | test_idx_all = list() 54 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=rand_seed).split(data_df, data_df.bins) 55 | for train_idx, test_idx in cv: 56 | test_idx_all.append(test_idx) 57 | 58 | # Get CV predictions for each split and repeat 59 | predictions_df = pd.DataFrame() 60 | predictions_df_all = pd.DataFrame() 61 | cv_split = range(0, 25, 5) # [0, 5, 10, 15, 20, 25] get 
predictions and arrange them in diff. columns for diff. repeats 62 | 63 | for each_split in cv_split: # for each split (25 in total) 64 | print('each_split', each_split) 65 | predictions_df = pd.DataFrame() 66 | for ind in range(each_split, each_split + n_splits): # run from (0,5), (5,10), (10,15), (15,20), (20,25) 67 | print('Split number', ind) 68 | temp_df = pd.DataFrame() 69 | model_cv = scores[model_name]['estimator'][ind] # pick CV estimator 70 | test_idx = test_idx_all[ind] # pick test indices 71 | 72 | # get predictions for test data 73 | test_df = data_df.iloc[test_idx, :] # take test data from one split 74 | y_true = test_df[y] 75 | y_pred = model_cv.predict(test_df[X]).ravel() 76 | mae, mse, corr = performance_metric(y_true, y_pred) 77 | print(f' test true age size: {y_true.shape}, predicted age sixe: {y_pred.shape}') 78 | print(f'MAE: {mae}, MSE: {mse}, CoRR: {corr}') 79 | 80 | if predictions_df.empty: 81 | predictions_df['test_index'] = pd.Series(test_idx) 82 | predictions_df['predictions_' + str(each_split)] = pd.Series(y_pred) 83 | else: 84 | temp_df['test_index'] = pd.Series(test_idx) 85 | temp_df['predictions_' + str(each_split)] = pd.Series(y_pred) 86 | 87 | predictions_df = pd.concat([predictions_df, temp_df], axis=0) # append for all the splits of one repeat 88 | 89 | predictions_df.sort_values(by=['test_index'], inplace=True) 90 | 91 | if predictions_df_all.empty: 92 | predictions_df_all = predictions_df 93 | else: 94 | predictions_df_all = predictions_df_all.merge(predictions_df, on=['test_index'], how="left") # merge for all the repeats 95 | 96 | print('predictions_df_all', predictions_df_all) 97 | predictions_df_all = predictions_df_all.reset_index(drop=True) 98 | predictions_df_all = pd.concat([data_df[['site', 'subject', 'age', 'gender']], predictions_df_all], axis=1) # add subject info 99 | predictions_df_all.to_csv(cv_prediction_savepath) 100 | 101 | # Calculate bias correction parameters (m and c) from cv predictions for each column 102 | results_pred = pd.DataFrame() 103 | filter_col = [col for col in predictions_df_all if col.startswith('predictions')] 104 | print('filter_col', filter_col) 105 | 106 | model_intercept, model_coef = [], [] 107 | model_bias_params = {'c':0, 'm': 1} 108 | 109 | for column in filter_col: # for 5 repeats 110 | X_lin = 'age' 111 | y_lin = column 112 | train_x = predictions_df_all.loc[:, X_lin].to_numpy().reshape(-1, 1) # true age 113 | train_y = predictions_df_all.loc[:, y_lin].to_numpy().reshape(-1, 1) # predicted age 114 | lin_reg = LinearRegression().fit(train_x, train_y) 115 | 116 | print(f'Intercept: {lin_reg.intercept_}, slope: {lin_reg.coef_}') 117 | model_intercept.append(lin_reg.intercept_) 118 | model_coef.append(lin_reg.coef_) 119 | 120 | # use this m and c for bias correction on test data later 121 | model_bias_params['m'] = np.mean(model_coef) 122 | model_bias_params['c'] = np.mean(model_intercept) 123 | print('average slope', model_bias_params['m']) 124 | print('average intercept', model_bias_params['c']) 125 | pickle.dump(model_bias_params, open(bias_params_savepath, 'wb')) 126 | print('ALL DONE') 127 | -------------------------------------------------------------------------------- /codes/predict_age_sing.py: -------------------------------------------------------------------------------- 1 | #from read_data_mask_resampled import * 2 | from brainage import calculate_voxelwise_features 3 | from pathlib import Path 4 | import pandas as pd 5 | import argparse 6 | import pickle 7 | import os 8 | import re 9 | 10 | 11 | 
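# ---------------------------------------------------------------------------
# Editorial note (illustrative sketch, not part of the original scripts):
# cross_site_bias_correction.py above stores the averaged slope ('m') and
# intercept ('c') of predicted-vs-true age in a '.bias_params' pickle. A
# common way to apply such parameters to new predictions is the
# slope/intercept adjustment shown here; the exact formula and the file path
# in the usage line are assumptions, not confirmed by this repository.
import pickle
import numpy as np

def apply_bias_correction(y_pred, bias_params_file):
    """Bias-correct predictions using the saved slope 'm' and intercept 'c'."""
    params = pickle.load(open(bias_params_file, 'rb'))  # {'c': intercept, 'm': slope}
    return (np.asarray(y_pred) - params['c']) / params['m']

# Example usage (hypothetical path):
# corrected = apply_bias_correction(y_pred, '../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss.bias_params')
# ---------------------------------------------------------------------------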
def model_pred(test_df, model_file, feature_space_str): 12 | """This functions predicts age 13 | Args: 14 | test_df (dataframe): test data 15 | model_file (pickle file): trained model file 16 | feature_space_str (string): feature space name 17 | 18 | Returns: 19 | dataframe: predictions from the model 20 | """ 21 | 22 | model = pickle.load(open(model_file, 'rb')) # load model 23 | pred = pd.DataFrame() 24 | for key, model_value in model.items(): 25 | X = data_df.columns.tolist() 26 | pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X]) # preprocessed data 27 | y_pred = model_value.predict(test_df).ravel() 28 | print(y_pred.shape) 29 | pred[feature_space_str + '+' + key] = y_pred 30 | return pred 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 36 | parser.add_argument("--data_dir", type=str, help="path to data dir") # 37 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI.paths_cat12.8.csv' 38 | parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 39 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 40 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 41 | default='../masks/brainmask_12.8.nii') 42 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 43 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 44 | parser.add_argument("--model_file", type=str, help="Trained model to be used to predict", 45 | default='../trained_models/4sites.S4_R4_pca.gauss.models') 46 | # For testing 47 | # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 48 | 49 | args = parser.parse_args() 50 | features_path = args.features_path 51 | data_dir = args.data_dir 52 | subject_filepaths = args.subject_filepaths 53 | output_path = args.output_path 54 | output_prefix = args.output_prefix 55 | smooth_fwhm = args.smooth_fwhm 56 | resample_size = args.resample_size 57 | mask_file = args.mask_file 58 | model_file = args.model_file 59 | 60 | print('\nBrain-age trained model used: ', model_file) 61 | print('Data directory (test data): ', data_dir) 62 | print('Subjects filepaths (test data): ', subject_filepaths) 63 | print('Directory to features path: ', features_path) 64 | print('Results directory: ', output_path) 65 | print('Results filename prefix: ', output_prefix) 66 | print('GM mask used: ', mask_file) 67 | 68 | # create full filename for the nii files of the subjects and save as csv in features_path 69 | subject_filepaths_nii = pd.read_csv(subject_filepaths, header=None) 70 | subject_filepaths_nii = data_dir + '/' +subject_filepaths_nii 71 | print(subject_filepaths_nii) 72 | subject_full_filepaths = os.path.join(features_path, 'subject_full_filepaths.csv') 73 | print(subject_full_filepaths) 74 | subject_filepaths_nii.to_csv(subject_full_filepaths, header=False, index=False) 75 | 76 | 77 | # get feature space name from the model file entered and 78 | # create feature space name using the input values (smoothing, 
resampling) 79 | # match them: they should be same 80 | 81 | # get feature space name from the model file entered in argument 82 | pipeline_name1 = model_file.split('/')[-1] 83 | feature_space = pipeline_name1.split('.')[1] 84 | model_name = pipeline_name1.split('.')[2] 85 | pipeline_name = feature_space + '.' + model_name 86 | 87 | # create feature space name using the input values (smoothing, resampling) 88 | pca_string = re.findall(r"pca", feature_space) 89 | if len(pca_string) == 1: 90 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca' 91 | else: 92 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) 93 | 94 | # match them: they should be same 95 | assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})" 96 | 97 | print('Feature space: ', feature_space) 98 | print('Model name: ', model_name) 99 | 100 | # Create directories, create features if they don't exists 101 | Path(output_path).mkdir(exist_ok=True, parents=True) 102 | Path(features_path).mkdir(exist_ok=True, parents=True) 103 | features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 104 | features_fullfile = os.path.join(features_path, features_filename) 105 | print('\nfilename for features created: ', features_fullfile) 106 | 107 | if os.path.isfile(features_fullfile): # check if features file exists 108 | print('\n----File exists') 109 | data_df = pickle.load(open(features_fullfile, 'rb')) 110 | print('Features loaded') 111 | else: 112 | print('\n-----Extracting features') 113 | # create features 114 | data_df = calculate_voxelwise_features(subject_full_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 115 | # save features 116 | pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4) 117 | data_df.to_csv(features_fullfile + '.csv', index=False) 118 | print('Feature extraction done and saved') 119 | 120 | # get predictions and save 121 | try: 122 | predictions_df = model_pred(data_df, model_file, feature_space_str) 123 | # save predictions 124 | predictions_filename = str(output_prefix) + '.' 
+ pipeline_name + '.prediction.csv' 125 | predictions_fullfile = os.path.join(output_path, predictions_filename) 126 | print('\nfilename for predictions created: ', predictions_fullfile) 127 | predictions_df.to_csv(predictions_fullfile, index=False) 128 | print(predictions_df) 129 | 130 | except FileNotFoundError: 131 | print(f'{model_file} is not present') 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /codes/cross_site_combine_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import os.path 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error 7 | 8 | def model_pred(test_df, X, y, model_file, workflow_name): 9 | 10 | # load the model 11 | model = pickle.load(open(model_file, 'rb')) 12 | y_true = test_df[y].reset_index(drop=True) 13 | 14 | # Initialize dataframe for saving output 15 | pred = pd.DataFrame() 16 | mae_corr = pd.DataFrame() 17 | 18 | for key, model_value in model.items(): 19 | X_preprocessed, _ = model_value.preprocess(test_df[X], y_true, until='variancethreshold')# until='zscore' 20 | # print('X_preprocessed shape after variancethreshold', X_preprocessed.shape) 21 | 22 | # predict test data 23 | y_pred = model_value.predict(test_df[X]).ravel() 24 | print('age and predicted age sizes', y_true.shape, y_pred.shape) 25 | mae = np.round(mean_absolute_error(y_true, y_pred), 3) 26 | mse = np.round(mean_squared_error(y_true, y_pred), 2) 27 | corr = np.round(np.corrcoef(y_pred, y_true)[1, 0], 2) 28 | 29 | print('MAE:', mae, 'MSE:', mse, 'CoRR:', corr) 30 | print('workflow_name:', workflow_name, key) 31 | 32 | pred[workflow_name] = y_pred # add column for predictions 33 | mae_corr = pd.concat([mae_corr, pd.DataFrame([{'mae': mae, 'mse': mse, 'corr': corr}], index=[workflow_name])], axis=0) 34 | 35 | return pred, y_true, mae_corr 36 | 37 | 38 | def read_data(features_file, demographics_file): 39 | demo_df = pd.read_csv(open(demographics_file, 'rb')) 40 | data_df = pickle.load(open(features_file, 'rb')) 41 | data_df = pd.concat([demo_df, data_df], axis=1) 42 | # data_df = data_df.drop(columns='file_path_cat12.8') 43 | data_df.rename(columns=lambda X: str(X), inplace=True) # convert numbers to strings as column names 44 | X = [col for col in data_df if col.startswith('f_')] 45 | y = 'age' 46 | age = data_df[y].round().astype(int) # round off age and convert to integer 47 | data_df[y] = age 48 | return data_df, X, y 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 54 | parser.add_argument("--features_path", type=str, help="Features file path") 55 | parser.add_argument("--model_path", type=str, help="Path to directory where within site models of particular datasets are saved") 56 | parser.add_argument("--output_prefix", type=str, help="Output prefix for predictions filename", default='pred_1000brains_all') 57 | 58 | # Parse the arguments 59 | args = parser.parse_args() 60 | demographics_file = args.demographics_file 61 | features_path = args.features_path 62 | model_path = args.model_path 63 | output_prefix = args.output_prefix 64 | 65 | # python3 cross_site_combine_predictions.py --demographics_file ../data/1000brains/1000brains.subject_list_cat12.8.csv --features_path ../data/1000brains/1000brains. --model_path ../results/ixi_camcan_enki/ixi_camcan_enki. 
--output_prefix pred_1000brains_all 66 | 67 | # demographics_file = '../data/1000brains/1000brains.subject_list_cat12.8.csv' 68 | # features_path = '../data/1000brains/1000brains.' 69 | # model_path = '../results/ixi_camcan_enki/ixi_camcan_enki.' 70 | # output_prefix = 'pred_1000brains_all' 71 | 72 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] 73 | data_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4', 'S4_R4', 'S4_R4', 'S8_R4', 'S8_R4', 74 | 'S0_R8', 'S0_R8', 'S4_R8', 'S4_R8', 'S8_R8', 'S8_R8'] 75 | filenm_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 76 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 77 | 78 | output_df = pd.DataFrame() 79 | mae_corr_df = pd.DataFrame() 80 | 81 | for idx, data_item in enumerate(filenm_list): # for each feature space 82 | for model_item in model_names: 83 | features_file = features_path + data_list[idx] # get test features 84 | model_file = model_path + data_item + '.' + model_item + '.models' # get models 85 | 86 | if os.path.exists(model_file) and os.path.exists(features_file): # if test data and trained model exists 87 | print('\n') 88 | print('test data', features_file) 89 | print('demographic file: ', demographics_file) 90 | print('model used', model_file) 91 | print("model and data exists") 92 | 93 | test_df, test_X, test_y = read_data(features_file, demographics_file) # load test data, read data and demo both 94 | y_pred1, y_true1, mae_corr1 = model_pred(test_df, test_X, test_y, model_file, 95 | str(data_item + ' + ' + model_item)) # predict test data 96 | 97 | if output_df.empty: 98 | needed_cols = test_df.columns[~test_df.columns.isin(test_X)].tolist() 99 | output_df = test_df[needed_cols].copy() 100 | 101 | output_df = pd.concat([output_df, y_pred1], axis=1) # concat for all workflows 102 | mae_corr_df = pd.concat([mae_corr_df, mae_corr1], axis=0) 103 | 104 | print('\n', 'predictions dataframe:', '\n', output_df) 105 | 106 | mae_corr_df.to_csv(model_path + output_prefix + '_temp.csv') 107 | output_df.to_csv(model_path + output_prefix + '.csv', index=False) 108 | 109 | # keep predictions from 32 selected workdlows (we trained more than 32) 110 | selected_workflows_df = ['site', 'subject', 'age', 'gender', 111 | '173 + rf', '173 + gauss', '173 + lasso', 112 | '473 + lasso', '473 + rvr_poly', 113 | '873 + gauss', '873 + elasticnet', 114 | '1273 + gauss', '1273 + rvr_poly', 115 | 'S0_R4 + lasso', 116 | 'S4_R4 + ridge', 'S4_R4 + rvr_lin', 'S4_R4 + gauss', 117 | 'S4_R4_pca + ridge', 'S4_R4_pca + rf', 'S4_R4_pca + rvr_lin', 'S4_R4_pca + gauss', 118 | 'S8_R4 + kernel_ridge', 119 | 'S8_R4_pca + rvr_lin', 'S8_R4_pca + gauss', 'S8_R4_pca + lasso', 'S8_R4_pca + rvr_poly', 120 | 'S0_R8 + rvr_poly', 'S0_R8_pca + lasso', 'S0_R8_pca + elasticnet', 'S0_R8_pca + rvr_poly', 121 | 'S4_R8 + ridge', 'S4_R8 + rvr_lin', 'S4_R8 + lasso', 122 | 'S8_R8 + ridge', 'S8_R8 + kernel_ridge', 123 | 'S8_R8_pca + elasticnet'] 124 | 125 | if 'session' in output_df.columns: 126 | selected_workflows_df.insert(4, 'session') 127 | 128 | output_df = output_df.reindex(columns=selected_workflows_df) 129 | output_df.to_csv( model_path + output_prefix + '_selected' + '.csv', index=False) 130 | 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 1. 
**Set up** 2 | 3 | ``` 4 | git clone https://github.com/juaml/brainage_estimation.git 5 | cd brainage_estimation 6 | python3 -m venv brainage_env 7 | source brainage_env/bin/activate 8 | pip install -r requirements.txt 9 | # install other packages 10 | pip install https://github.com/JamesRitchie/scikit-rvm/archive/master.zip 11 | #brew install gcc # for Mac users in case you don't have it 12 | pip install glmnet 13 | ``` 14 | 15 | After the setup, the following scripts can be run from the `codes` directory. 16 | 17 | 2. **Get predictions** 18 | 19 | We provide pretrained models that can be used to obtain predictions on new samples. 20 | 21 | ``` 22 | python3 predict_age.py \ 23 | --features_path path_to_features_dir \ 24 | --subject_filepaths path_to_txt_file \ 25 | --output_path path_to_output_dir \ 26 | --output_prefix PREFIX \ 27 | --mask_file ../masks/brainmask_12.8.nii \ 28 | --smooth_fwhm 4 \ 29 | --resample_size 4 \ 30 | --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 31 | ``` 32 | 33 | The arguments are: 34 | - `--features_path` should point to a directory where calculated features are stored as a `pickle` file. 35 | - `--subject_filepaths` should point to a text file containing the path to each subject's CAT12.8 `mwp1` file, one path per line. 36 | - `--output_path` points to a directory where the predictions will be saved. 37 | - `--output_prefix` prefix for the output files. 38 | - `--mask_file` points to the GM mask to be used (defaults to `../masks/brainmask_12.8.nii`) 39 | - `--smooth_fwhm` smoothing kernel size to be used (defaults to `4`) 40 | - `--resample_size` isotropic voxel size to resample to (defaults to `4`) 41 | - `--model_file` should point to an already trained model (defaults to `../trained_models/4sites.S4_R4_pca.gauss.models`) 42 | 43 | This will calculate features with 4mm smoothing and 4mm resampling (`S4_R4`) for all subjects in the file provided via `--subject_filepaths`. 44 | The predictions will be performed using the S4_R4_pca+gauss model. 45 | `PCA` is applied only when the chosen model was trained with PCA features (indicated by `_pca` in the model name). 46 | Note that if the features are already available in the `--features_path` they will not be recalculated. 47 | 48 | 3. **Calculate features: voxel-wise and parcel-wise features** 49 | 50 | It is possible to calculate features from a list of CAT12.8 files. 51 | 52 | Voxel-wise features 53 | ``` 54 | python3 calculate_features_voxelwise.py \ 55 | --features_path ../data/ADNI/ \ 56 | --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv \ 57 | --output_prefix ADNI \ 58 | --mask_file ../masks/brainmask_12.8.nii \ 59 | --smooth_fwhm 4 \ 60 | --resample_size 8 61 | ``` 62 | 63 | Parcel-wise features 64 | ``` 65 | python3 calculate_features_parcelwise.py \ 66 | --features_path ../data/ADNI/ \ 67 | --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv \ 68 | --output_prefix ADNI \ 69 | --mask_file ../masks/BSF_173.nii \ 70 | --num_parcels 173 71 | ``` 72 | 73 | 4. **Within-site: Train models** 74 | 75 | ``` 76 | python3 within_site_train.py \ 77 | --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv \ 78 | --features_file ../data/ixi/ixi.173 \ 79 | --output_path ../results/ixi \ 80 | --output_prefix ixi.173 \ 81 | --models rvr_lin \ 82 | --pca_status 0 83 | ``` 84 | 85 | The arguments are: 86 | - `--demographics_file` should point to a `csv` file with four columns `{'subject', 'site', 'age', 'gender'}` (an illustrative example is shown below). 87 | - `--features_file` should point to a `pickle` file with features.
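For illustration, a minimal demographics `csv` for the arguments above could look like the following; the subject IDs and values are hypothetical, and the exact coding of `gender` may differ in your data:

```
subject,site,age,gender
sub-0001,ixi,34.5,F
sub-0002,ixi,61.2,M
sub-0003,ixi,47.8,F
```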
88 | - `--output_path` points to a directory where the models, scores and results will be saved. 89 | - `--output_prefix` prefix for the output files; it is used to create three files: `.models`, `.scores`, and `.results`. 90 | - `--models` one or more models to train; multiple models can be provided as a comma-separated list. 91 | - `--pca_status` either 0 (no PCA) or 1 (for PCA retaining 100% variance). 92 | 93 | This will run outer 5-fold and inner 5x5-fold cross-validation. 94 | 95 | In case you are using `HTCondor`, you can also use the provided submit file. 96 | 97 | `condor_submit within_site_ixi.submit` 98 | 99 | 100 | 5. **Within-site: Read results from saved models** 101 | 102 | `python3 within_site_read_results.py --data_nm ../results/ixi/ixi.` 103 | 104 | 105 | 6. **Within-site: Get predictions from 128 workflows** 106 | 107 | ``` 108 | python3 within_site_combine_predictions.py \ 109 | --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv \ 110 | --features_path ../data/ixi/ixi. \ 111 | --model_path ../results/ixi/ixi. \ 112 | --output_prefix all_models_pred 113 | ``` 114 | 115 | 7. **Within-site: Bias correction** 116 | 117 | ``` 118 | python3 within_site_bias_correction.py \ 119 | --input_predictions_file ../results/ixi/ixi.all_models_pred.csv \ 120 | --BC_predictions_file ../results/ixi/ixi.all_models_pred_BC.csv 121 | ``` 122 | 123 | 124 | 8. **Cross-site: Train and test** 125 | 126 | First train a model with three sites. 127 | ``` 128 | python3 cross_site_train.py \ 129 | --demographics_file ../data/ixi_camcan_enki/ixi_camcan_enki_subject_list_cat12.8.csv \ 130 | --features_file ../data/ixi_camcan_enki/ixi_camcan_enki.173 \ 131 | --output_path ../results/ixi_camcan_enki \ 132 | --output_prefix ixi_camcan_enki.173 \ 133 | --models rvr_lin \ 134 | --pca_status 0 135 | ``` 136 | 137 | Now we can make predictions on the hold-out site using all models available in the `--model_path`. 138 | ``` 139 | python3 cross_site_combine_predictions.py \ 140 | --demographics_file ../data/1000brains/1000brains.subject_list_cat12.8.csv \ 141 | --features_path ../data/1000brains/1000brains. \ 142 | --model_path ../results/ixi_camcan_enki/ixi_camcan_enki. \ 143 | --output_prefix pred_1000brains_all 144 | 145 | ``` 146 | 147 | 9. **Cross-site: Read results from saved models** 148 | 149 | Create cross-validation scores from cross-site predictions. 150 | 151 | `python3 cross_site_read_results.py --data_nm ../results/ixi_camcan_enki/ixi_camcan_enki.` 152 | 153 | 154 | 10. **Cross-site: Bias correction** 155 | 156 | Using the CV predictions from the training data: 157 | 158 | ``` 159 | python3 cross_site_bias_correction.py \ 160 | --demographics_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.subject_list_cat12.8.csv \ 161 | --features_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.S4_R4 \ 162 | --model_file ../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss 163 | ``` 164 | 165 | Using the control subjects from the testing data: 166 | 167 | This script trains a bias-correction model using the predictions and age of the control (`CN`) group and then applies it to the full sample. It needs a `demographics_file` that contains `age` and `Research Group` columns, where the `Research Group` column includes the `CN` category. `predictions_file` should contain a column of predictions named by `predictions_column_name`.
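A minimal sketch of that logic is shown below, using the example file and column names from the command that follows. It assumes the standard slope/intercept correction (`corrected = (predicted - c) / m`) and that the rows of the demographics and predictions files are aligned; the actual implementation in `cross_site_bias_correction_using_CN.py` may differ.

```
import pandas as pd
from sklearn.linear_model import LinearRegression

demo = pd.read_csv('../data/ADNI/ADNI.subject_list_cat12.8.csv')            # must contain 'age' and 'Research Group'
preds = pd.read_csv('../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv')  # must contain the predictions column

cn = (demo['Research Group'] == 'CN').to_numpy()                            # fit only on control subjects
reg = LinearRegression().fit(demo.loc[cn, ['age']], preds.loc[cn, 'S4_R4_pca+gauss'])
m, c = reg.coef_[0], reg.intercept_

# apply the CN-derived correction to the full sample
preds['S4_R4_pca+gauss_BC'] = (preds['S4_R4_pca+gauss'] - c) / m
```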
The bias corrected predictions will be saved in the same location as `predictions_file` with a prefix defined by `output_prefix`. 168 | 169 | ``` 170 | python3 cross_site_bias_correction_using_CN.py \ 171 | --demographics_file ../data/ADNI/ADNI.subject_list_cat12.8.csv \ 172 | --predictions_file ../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv \ 173 | --predictions_column_name S4_R4_pca+gauss \ 174 | --output_prefix _BC 175 | ``` 176 | -------------------------------------------------------------------------------- /codes/cross_site_read_results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os.path 3 | import argparse 4 | import pandas as pd 5 | 6 | # all possible inputs 7 | ## cross site (3 sites) 8 | # data_nm = '..results/camcan_enki_1000brains/camcan_enki_1000brains_' 9 | # data_nm = '..results/ixi_enki_1000brains/ixi_enki_1000brains_' 10 | # data_nm = '..results/ixi_camcan_enki/ixi_camcan_enki_' 11 | # data_nm = '..results/ixi_camcan_1000brains/ixi_camcan_1000brains_' 12 | ## cross-site (4 sites) 13 | # data_nm = '..results/ixi_camcan_enki_1000brains/4sites_' 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--data_nm", type=str, help="Output path for one dataset") 18 | 19 | args = parser.parse_args() 20 | data_nm = args.data_nm 21 | 22 | # Filename to save results 23 | cv_file_ext = 'cv_scores.csv' 24 | cv_file_ext_selected = 'cv_scores_selected.csv' 25 | 26 | # Complete results filepaths 27 | cv_filename = data_nm + cv_file_ext 28 | cv_filename_selected = data_nm + cv_file_ext_selected 29 | 30 | 31 | # all model names 32 | model_names = ['lin_reg', 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] #'xgb' 33 | model_names_new = ['LiR', 'RR', 'RFR', 'RVRlin', 'KRR', 'GPR', 'LR', 'ENR', 'RVRpoly'] # 'XGB' 34 | 35 | # all feature spaces names 36 | data_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 37 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 38 | data_list_new = ['173', '473', '873','1273', 'S0_R4', 'S0_R4 + PCA', 'S4_R4', 'S4_R4 + PCA', 'S8_R4', 'S8_R4 + PCA', 39 | 'S0_R8', 'S0_R8 + PCA', 'S4_R8', 'S4_R8 + PCA', 'S8_R8', 'S8_R8 + PCA'] 40 | 41 | # check which scores file is missing 42 | missing_outs = [] 43 | for data_item in data_list: 44 | for model_item in model_names: 45 | scores_item = data_nm + data_item + '.' + model_item + '.scores' # create the complete path to scores file 46 | if os.path.isfile(scores_item): 47 | print('yes') 48 | else: 49 | missing_outs.append(scores_item) 50 | print('Missing files:\n', missing_outs) 51 | 52 | # get the saved cv scores 53 | df = pd.DataFrame() 54 | df_cv = pd.DataFrame() 55 | for data_item in data_list: 56 | for model_item in model_names: 57 | scores_item = data_nm + data_item + '.' 
+ model_item + '.scores' # create the complete path to scores file 58 | if os.path.isfile(scores_item): 59 | res = pickle.load(open(scores_item, 'rb')) 60 | df = pd.DataFrame() 61 | mae_all, mse_all, corr_all, corr_delta_all, key_all = list(), list(), list(), list(), list() 62 | for key, value in res.items(): 63 | mae = round(value['test_neg_mean_absolute_error'].mean() * -1, 3) 64 | mse = round(value['test_neg_mean_squared_error'].mean() * -1, 3) 65 | corr = round(value['test_r2'].mean(), 3) 66 | mae_all.append(mae) 67 | mse_all.append(mse) 68 | corr_all.append(corr) 69 | key_all.append(key) 70 | 71 | df['model'] = key_all 72 | df['data'] = len(mae_all) * [data_item] 73 | df['cv_mae'] = mae_all 74 | df['cv_mse'] = mse_all 75 | df['cv_corr'] = corr_all 76 | # print(df) 77 | df_cv = pd.concat([df_cv, df], axis=0) 78 | 79 | df_cv = df_cv.reset_index(drop=True) 80 | df_cv['workflow_name'] = df_cv['data'] + ' + ' + df_cv['model'] 81 | df_cv['data'] = df_cv['data'].replace(data_list, data_list_new) 82 | df_cv['model'] = df_cv['model'].replace(model_names, model_names_new) 83 | df_cv['workflow_name_updated'] = df_cv['data'] + ' + ' + df_cv['model'] 84 | df_cv.reset_index(drop=True, inplace=True) 85 | 86 | # selected 32 workflows (since we have more then 32 selected workflows) 87 | selected_workflows_df = pd.DataFrame([ 88 | '173 + GPR', 89 | '473 + LR', 90 | '473 + RVRpoly', 91 | '1273 + GPR', 92 | 'S4_R4 + RR', 93 | 'S4_R4 + GPR', 94 | 'S4_R4 + PCA + RFR', 95 | 'S4_R4 + PCA + RVRlin', 96 | 'S8_R4 + PCA + RVRlin', 97 | 'S8_R4 + PCA + GPR', 98 | 'S0_R8 + PCA + ENR', 99 | 'S0_R8 + PCA + RVRpoly', 100 | 'S4_R8 + RR', 101 | 'S8_R8 + RR', 102 | 'S8_R8 + KRR', 103 | 'S8_R8 + PCA + ENR', 104 | 'S4_R4 + PCA + GPR', 105 | 'S4_R4 + RVRlin', 106 | 'S4_R4 + PCA + RR', 107 | 'S4_R8 + RVRlin', 108 | 'S8_R4 + KRR', 109 | 'S0_R4 + LR', 110 | 'S8_R4 + PCA + RVRpoly', 111 | 'S0_R8 + RVRpoly', 112 | 'S4_R8 + LR', 113 | '873 + GPR', 114 | 'S8_R4 + PCA + LR', 115 | '1273 + RVRpoly', 116 | '873 + ENR', 117 | '173 + LR', 118 | 'S0_R8 + PCA + LR', 119 | '173 + RFR'], columns=['workflow_name_updated']) 120 | 121 | df_final = df_cv.merge(selected_workflows_df, how='inner', on=['workflow_name_updated']) 122 | 123 | # save the csv files 124 | print('\n cv results file:', cv_filename) 125 | print(df_cv) 126 | print('\n selected results file:', cv_filename_selected) 127 | print(df_final) 128 | df_cv.to_csv(cv_filename, index=False) 129 | df_final.to_csv(cv_filename_selected, index=False) 130 | 131 | 132 | # # check model parameters 133 | print('\n Model Parameters') 134 | error_models = list() 135 | for data_item in data_list: 136 | for model_item in model_names: 137 | model_item = data_nm + data_item + '.' 
+ model_item + '.models' # get models 138 | # print('\n','model filename', model_item) 139 | if os.path.isfile(model_item): 140 | print('\n', 'model filename', model_item) 141 | res = pickle.load(open(model_item, 'rb')) 142 | # print(res) 143 | for key, value in res.items(): 144 | print(key) 145 | 146 | if key == 'gauss': 147 | model = res['gauss']['gauss'] 148 | # print(model.get_params()) 149 | print(model.kernel_.get_params()) 150 | 151 | elif key == 'kernel_ridge': 152 | model = res['kernel_ridge']['kernelridge'] 153 | print(model) 154 | # print(model.get_params()) 155 | 156 | elif key == 'rvr_lin': 157 | model = res['rvr_lin']['rvr'] 158 | print(model) 159 | # print(model.get_params()) 160 | 161 | elif key == 'rvr_poly': 162 | model = res['rvr_poly']['rvr'] 163 | print(model) 164 | # print(model.get_params()) 165 | 166 | elif key == 'rf': 167 | model = res['rf']['rf'] 168 | print(model) 169 | # print(model.get_params()) 170 | 171 | else: 172 | model = res[key]['elasticnet'] 173 | # print(model.get_params()) 174 | print(model.lambda_best_) 175 | 176 | else: 177 | error_models.append(model_item) 178 | -------------------------------------------------------------------------------- /brainage/calculate_features.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import nilearn 3 | from nilearn import image 4 | import numpy as np 5 | import pandas as pd 6 | import nibabel as nib 7 | import nibabel.processing as npr 8 | 9 | def subsample_img(img, f): 10 | """Reduce resample_to_img features of a 3D array by a given factor f.""" 11 | 12 | data = img.get_fdata() 13 | mask = np.zeros(img.shape) 14 | mask[::f, ::f, ::f] = 1 15 | data = data * mask 16 | return nib.Nifti1Image(data, img.affine, img.header) 17 | 18 | def binarize_3d(img, threshold): 19 | """binarize 3D spatial image""" 20 | return nib.Nifti1Image( 21 | np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header 22 | ) 23 | 24 | def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size): 25 | """Calculate voxelwise features for the subjects 26 | 27 | Args: 28 | phenotype_file (csv or txt): A csv or text file with path to subject images 29 | mask_file (nii): The GM mask file to be used to extract features 30 | smooth_fwhm (int): Smooth images by applying a Gaussian filter by given FWHM (mm) 31 | resample_size (int): Resample image to given voxel size 32 | 33 | Returns: 34 | data_resampled (dataframe): pandas dataframe of features (N subjects by M features) 35 | """ 36 | 37 | phenotype = pd.read_csv(phenotype_file, header=None) 38 | 39 | # don't need this anymore 40 | # filename, file_extension = os.path.splitext(phenotype_file) 41 | # if file_extension == ".txt": 42 | # phenotype = pd.read_csv(phenotype_file, header=None) 43 | # elif file_extension == ".csv": 44 | # phenotype = pd.read_csv(phenotype_file, sep=",", header=None) 45 | # else: 46 | # raise ValueError("Wrong file. 
Please imput either a csv or text file") 47 | 48 | print(phenotype.shape) 49 | print(phenotype.head()) 50 | 51 | # phenotype = phenotype.iloc[0:15] 52 | 53 | data_resampled = np.array([]) # array to save resampled features from subjects mri 54 | count = 0 55 | for index, row in phenotype.iterrows(): # iterate over each row 56 | sub_file = row.values[0] 57 | 58 | if os.path.exists(sub_file): 59 | print(f"\n-----Processing subject number {count}------") 60 | sub_img = nib.load(sub_file) # load subject image 61 | mask_img = nib.load(mask_file) # load mask image 62 | print("Subject and mask image loaded") 63 | print("sub affine original \n", sub_img.affine, sub_img.shape) 64 | print("mask affine original \n", mask_img.affine, mask_img.shape) 65 | 66 | print("Perform smoothing") 67 | sub_img = image.smooth_img( 68 | sub_img, smooth_fwhm 69 | ) # smooth the image with 4 mm FWHM 70 | 71 | print("Perform resampling") 72 | # trying to match Gaser 73 | mask_img_rs = npr.resample_to_output( 74 | mask_img, [resample_size] * len(mask_img.shape), order=1 75 | ) # resample mask 76 | print( 77 | "mask affine after resampling\n", 78 | mask_img_rs.affine, 79 | mask_img_rs.shape, 80 | ) 81 | 82 | sub_img_rs = image.resample_to_img( 83 | sub_img, mask_img_rs, interpolation="linear" 84 | ) # resample subject 85 | print( 86 | "sub affine after resampling\n", 87 | sub_img_rs.affine, 88 | sub_img_rs.shape, 89 | ) 90 | 91 | binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5) # binarize the mask 92 | mask_rs = binary_mask_img_rs.get_fdata().astype(bool) 93 | 94 | sub_data_rs = sub_img_rs.get_fdata()[ 95 | mask_rs 96 | ] # extract voxel using the binarized mask 97 | sub_data_rs = sub_data_rs.reshape(1, -1) 98 | 99 | if data_resampled.size == 0: 100 | data_resampled = sub_data_rs 101 | else: 102 | data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0) 103 | count = count + 1 104 | print(data_resampled.shape) 105 | 106 | print("\n *** Feature extraction done ***") 107 | 108 | # renaming the columns and convering to dataframe 109 | data_resampled = pd.DataFrame(data_resampled) 110 | data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True) 111 | print('Feature names:', data_resampled.columns) 112 | 113 | print(f"The size of the feature space is {data_resampled.shape}") 114 | 115 | return data_resampled 116 | 117 | 118 | 119 | def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels): 120 | """Calculate parcelwise features for the subjects 121 | 122 | Args: 123 | phenotype_file (csv or text): A csv or text file with path to subject images 124 | mask_dir (_type_): The GM mask file to be used to extract features 125 | num_parcels (_type_): Number of parcels 126 | 127 | Returns: 128 | data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels) 129 | """ 130 | 131 | phenotype = pd.read_csv(phenotype_file, header=None) 132 | 133 | # filename, file_extension = os.path.splitext(phenotype_file) 134 | 135 | # if file_extension == '.txt': 136 | # phenotype = pd.read_csv(phenotype_file, header=None) 137 | # elif file_extension == '.csv': 138 | # phenotype = pd.read_csv(phenotype_file, sep=',', header=None) 139 | # else: 140 | # raise ValueError("Wrong file. 
Please imput either a csv or text file") 141 | 142 | print(phenotype.shape) 143 | print(phenotype.head()) 144 | # phenotype = phenotype.iloc[0:15] 145 | 146 | data_parcels = [] #np.array([]) # array to save resampled features from subjects mri 147 | count = 0 148 | 149 | for index, row in phenotype.iterrows(): # iterate over each row 150 | sub_file = row.values[0] 151 | 152 | if os.path.exists(sub_file): 153 | print(f'\nProcessing subject number {count}') 154 | sub_img = nib.load(sub_file) # load subject image 155 | mask_img = nib.load(mask_dir) # load mask image 156 | print ('Subject and mask image loaded') 157 | print(sub_file, sub_img.affine, mask_img.affine) 158 | 159 | sub_data = sub_img.get_fdata() 160 | sub_data[sub_data == 0] = np.nan # replace zeros with Nan 161 | sub_data_parcels = [] 162 | 163 | if not np.array_equal(sub_img.affine, mask_img.affine): 164 | mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear') 165 | else: 166 | print("Subject and mask have same affine") 167 | 168 | for num in range(1, int(num_parcels) + 1): 169 | itemindex = np.where(mask_img.get_fdata() == num) # get indices from the mask for a parcel 170 | sub_mat = sub_data[itemindex] 171 | 172 | if np.all(np.isnan(sub_mat)): 173 | sub_agg = 0 174 | else: 175 | sub_agg = np.nanmean(sub_mat) # mean the data from the indices to get GM volume 176 | sub_data_parcels.append(sub_agg) 177 | 178 | data_parcels.append(sub_data_parcels) 179 | print(len(data_parcels)) 180 | count = count + 1 181 | 182 | print('\n *** Feature extraction done ***') 183 | data_parcels = pd.DataFrame(data_parcels) 184 | data_parcels.rename(columns=lambda X :'f_' + str(X), inplace=True) 185 | print(data_parcels.columns) 186 | 187 | print('final dataframe shape', data_parcels.shape) 188 | return data_parcels 189 | -------------------------------------------------------------------------------- /codes/within_site_ixi.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | # Logs 16 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 17 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 18 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 19 | 20 | log = $(initial_dir)/../logs/$(result_prefix).$(model).log 21 | output = $(initial_dir)/../logs/$(result_prefix).$(model).out 22 | error = $(initial_dir)/../logs/$(result_prefix).$(model).err 23 | 24 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 25 | # --pca_status: 0 or 1 26 | 27 | # enki (change data_name and subject_filepaths_csv to run for different dataset) 28 | data_name = ixi 29 | subject_filepaths_csv = ixi.subject_list_cat12.8.csv 30 | 31 | arguments = test_package_env python3 within_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 32 | 33 | ########## 173 parcels 34 | feature_name = $(data_name).173 35 | result_prefix = $(data_name).173 36 | 37 | pca = 0 38 | model = ridge 39 | queue 40 | model = rf 41 | queue 42 | model = rvr_lin 43 | queue 44 | model = kernel_ridge 45 | queue 46 | model = gauss 47 | queue 48 | model = lasso 49 | queue 50 | model = elasticnet 51 | queue 52 | model = rvr_poly 53 | queue 54 | #model = xgb 55 | #queue 56 | 57 | 58 | 59 | ########## 473 parcels 60 | feature_name = $(data_name).473 61 | result_prefix = $(data_name).473 62 | pca = 0 63 | model = ridge 64 | queue 65 | model = rf 66 | queue 67 | model = rvr_lin 68 | queue 69 | model = kernel_ridge 70 | queue 71 | model = gauss 72 | queue 73 | model = lasso 74 | queue 75 | model = elasticnet 76 | queue 77 | model = rvr_poly 78 | queue 79 | #model = xgb 80 | #queue 81 | 82 | 83 | 84 | ########## 873 parcels 85 | feature_name = $(data_name).873 86 | result_prefix = $(data_name).873 87 | pca = 0 88 | model = ridge 89 | queue 90 | model = rf 91 | queue 92 | model = rvr_lin 93 | queue 94 | model = kernel_ridge 95 | queue 96 | model = gauss 97 | queue 98 | model = lasso 99 | queue 100 | model = elasticnet 101 | queue 102 | model = rvr_poly 103 | queue 104 | #model = xgb 105 | #queue 106 | 107 | 108 | 109 | ########## 1273 parcels 110 | feature_name = $(data_name).1273 111 | result_prefix = $(data_name).1273 112 | pca = 0 113 | model = ridge 114 | queue 115 | model = rf 116 | queue 117 | model = rvr_lin 118 | queue 119 | model = kernel_ridge 120 | queue 121 | model = gauss 122 | queue 123 | model = lasso 124 | queue 125 | model = elasticnet 126 | queue 127 | model = rvr_poly 128 | queue 129 | #model = xgb 130 | #queue 131 | 132 | 133 | 134 | ########## S0_R4 135 | feature_name = $(data_name).S0_R4 136 | result_prefix = $(data_name).S0_R4 137 | pca = 0 138 | model = ridge 139 | queue 140 | model = rf 141 | queue 142 | model = rvr_lin 143 | queue 144 | model = kernel_ridge 145 | queue 146 | model = gauss 147 | queue 148 | model = lasso 149 | queue 150 | model = elasticnet 151 | queue 152 | model = rvr_poly 153 | queue 154 | #model = xgb 155 | #queue 156 | 157 | 158 | 159 | ########## S0_R8 160 | feature_name = $(data_name).S0_R8 161 | result_prefix = $(data_name).S0_R8 162 | pca = 0 163 | model = ridge 164 | queue 165 | model = rf 166 | queue 167 | model = rvr_lin 168 | queue 169 | model = kernel_ridge 170 | queue 171 | model = gauss 172 | queue 173 | model = lasso 174 | 
queue 175 | model = elasticnet 176 | queue 177 | model = rvr_poly 178 | queue 179 | #model = xgb 180 | #queue 181 | 182 | 183 | 184 | ########## S4_R4 185 | feature_name = $(data_name).S4_R4 186 | result_prefix = $(data_name).S4_R4 187 | pca = 0 188 | model = ridge 189 | queue 190 | model = rf 191 | queue 192 | model = rvr_lin 193 | queue 194 | model = kernel_ridge 195 | queue 196 | model = gauss 197 | queue 198 | model = lasso 199 | queue 200 | model = elasticnet 201 | queue 202 | model = rvr_poly 203 | queue 204 | #model = xgb 205 | #queue 206 | 207 | 208 | 209 | ########## S4_R8 210 | feature_name = $(data_name).S4_R8 211 | result_prefix = $(data_name).S4_R8 212 | pca = 0 213 | model = ridge 214 | queue 215 | model = rf 216 | queue 217 | model = rvr_lin 218 | queue 219 | model = kernel_ridge 220 | queue 221 | model = gauss 222 | queue 223 | model = lasso 224 | queue 225 | model = elasticnet 226 | queue 227 | model = rvr_poly 228 | queue 229 | #model = xgb 230 | #queue 231 | 232 | 233 | ########## S8_R4 234 | feature_name = $(data_name).S8_R4 235 | result_prefix = $(data_name).S8_R4 236 | pca = 0 237 | model = ridge 238 | queue 239 | model = rf 240 | queue 241 | model = rvr_lin 242 | queue 243 | model = kernel_ridge 244 | queue 245 | model = gauss 246 | queue 247 | model = lasso 248 | queue 249 | model = elasticnet 250 | queue 251 | model = rvr_poly 252 | queue 253 | #model = xgb 254 | #queue 255 | 256 | 257 | 258 | 259 | ########## S8_R8 260 | feature_name = $(data_name).S8_R8 261 | result_prefix = $(data_name).S8_R8 262 | pca = 0 263 | model = ridge 264 | queue 265 | model = rf 266 | queue 267 | model = rvr_lin 268 | queue 269 | model = kernel_ridge 270 | queue 271 | model = gauss 272 | queue 273 | model = lasso 274 | queue 275 | model = elasticnet 276 | queue 277 | model = rvr_poly 278 | queue 279 | #model = xgb 280 | #queue 281 | 282 | ##################### PCA 283 | ########## S0_R4_pca 284 | feature_name = $(data_name).S0_R4 285 | result_prefix = $(data_name).S0_R4_pca 286 | pca = 1 287 | model = ridge 288 | queue 289 | model = rf 290 | queue 291 | model = rvr_lin 292 | queue 293 | model = kernel_ridge 294 | queue 295 | model = gauss 296 | queue 297 | model = lasso 298 | queue 299 | model = elasticnet 300 | queue 301 | model = rvr_poly 302 | queue 303 | #model = xgb 304 | #queue 305 | 306 | 307 | 308 | ########## S0_R8_pca 309 | feature_name = $(data_name).S0_R8 310 | result_prefix = $(data_name).S0_R8_pca 311 | pca = 1 312 | model = ridge 313 | queue 314 | model = rf 315 | queue 316 | model = rvr_lin 317 | queue 318 | model = kernel_ridge 319 | queue 320 | model = gauss 321 | queue 322 | model = lasso 323 | queue 324 | model = elasticnet 325 | queue 326 | model = rvr_poly 327 | queue 328 | #model = xgb 329 | #queue 330 | 331 | 332 | 333 | ########## S4_R4_pca 334 | feature_name = $(data_name).S4_R4 335 | result_prefix = $(data_name).S4_R4_pca 336 | pca = 1 337 | model = ridge 338 | queue 339 | model = rf 340 | queue 341 | model = rvr_lin 342 | queue 343 | model = kernel_ridge 344 | queue 345 | model = gauss 346 | queue 347 | model = lasso 348 | queue 349 | model = elasticnet 350 | queue 351 | model = rvr_poly 352 | queue 353 | #model = xgb 354 | #queue 355 | 356 | 357 | 358 | ########## S4_R8_pca 359 | feature_name = $(data_name).S4_R8 360 | result_prefix = $(data_name).S4_R8_pca 361 | pca = 1 362 | model = ridge 363 | queue 364 | model = rf 365 | queue 366 | model = rvr_lin 367 | queue 368 | model = kernel_ridge 369 | queue 370 | model = gauss 371 | queue 372 | model = lasso 373 
| queue 374 | model = elasticnet 375 | queue 376 | model = rvr_poly 377 | queue 378 | #model = xgb 379 | #queue 380 | 381 | ########## S8_R4_pca 382 | feature_name = $(data_name).S8_R4 383 | result_prefix = $(data_name).S8_R4_pca 384 | pca = 1 385 | model = ridge 386 | queue 387 | model = rf 388 | queue 389 | model = rvr_lin 390 | queue 391 | model = kernel_ridge 392 | queue 393 | model = gauss 394 | queue 395 | model = lasso 396 | queue 397 | model = elasticnet 398 | queue 399 | model = rvr_poly 400 | queue 401 | #model = xgb 402 | #queue 403 | 404 | 405 | 406 | 407 | ########## S8_R8 408 | feature_name = $(data_name).S8_R8 409 | result_prefix = $(data_name).S8_R8_pca 410 | pca = 1 411 | model = ridge 412 | queue 413 | model = rf 414 | queue 415 | model = rvr_lin 416 | queue 417 | model = kernel_ridge 418 | queue 419 | model = gauss 420 | queue 421 | model = lasso 422 | queue 423 | model = elasticnet 424 | queue 425 | model = rvr_poly 426 | queue 427 | #model = xgb 428 | #queue -------------------------------------------------------------------------------- /codes/calculate_features2.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | # Execution 10 | initial_dir = . 11 | executable = $(initial_dir)/run_in_venv.sh 12 | 13 | # Job 14 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 15 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 16 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 17 | 18 | log = $(initial_dir)/../logs/$(data_name).$(Process).log 19 | output = $(initial_dir)/../logs/$(data_name).$(Process).out 20 | error = $(initial_dir)/../logs/$(data_name).$(Process).err 21 | 22 | # 1000brains (change data_name and subject_filepaths_csv to run for different dataset) 23 | data_name = 1000brains 24 | subject_filepaths_csv = 1000brains.paths_cat12.8.csv 25 | 26 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 27 | queue 28 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 29 | queue 30 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 31 | queue 32 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 33 | queue 34 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 35 | queue 36 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path 
../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 37 | queue 38 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 39 | queue 40 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 41 | queue 42 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 43 | queue 44 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 45 | queue 46 | 47 | # camcan (change data_name and subject_filepaths_csv to run for different dataset) 48 | data_name = camcan 49 | subject_filepaths_csv = camcan.paths_cat12.8.csv 50 | 51 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 52 | queue 53 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 54 | queue 55 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 56 | queue 57 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 58 | queue 59 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 60 | queue 61 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 62 | queue 63 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 64 | queue 65 | arguments = test_package_env python3 
calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 66 | queue 67 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 68 | queue 69 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 70 | queue 71 | 72 | 73 | # enki (change data_name and subject_filepaths_csv to run for different dataset) 74 | data_name = enki 75 | subject_filepaths_csv = enki.paths_cat12.8.csv 76 | 77 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 78 | queue 79 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 80 | queue 81 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 82 | queue 83 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 84 | queue 85 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 86 | queue 87 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 88 | queue 89 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 90 | queue 91 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 92 | queue 93 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 94 | queue 95 | arguments = test_package_env python3 
calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 96 | queue 97 | 98 | 99 | -------------------------------------------------------------------------------- /codes/cross_site_train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import argparse 4 | import pandas as pd 5 | from pathlib import Path 6 | 7 | from brainage import read_data, XGBoostAdapted 8 | 9 | import xgboost as xgb 10 | from skrvm import RVR 11 | from glmnet import ElasticNet 12 | import sklearn.gaussian_process as gp 13 | from sklearn.kernel_ridge import KernelRidge 14 | from sklearn.decomposition import PCA 15 | from sklearn.feature_selection import VarianceThreshold 16 | from sklearn.model_selection import RepeatedStratifiedKFold 17 | 18 | from julearn import run_cross_validation 19 | from julearn.utils import configure_logging 20 | from julearn.transformers import register_transformer 21 | 22 | start_time = time.time() 23 | 24 | def none_or_str(value): 25 | if value == 'None': 26 | return None 27 | return value 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 32 | parser.add_argument("--features_file", type=str, help="Features file path") 33 | parser.add_argument("--output_path", type=str, help="Path to output directory") 34 | parser.add_argument("--output_prefix", type=str, help="Output prefix (used {dataname}.{featurename}") 35 | parser.add_argument("--models", type=str, nargs='?', const=1, default="ridge", 36 | help="models to use (comma seperated no space): ridge,rf,rvr_linear") 37 | parser.add_argument("--pca_status", type=int, default=0, 38 | help="0: no pca, 1: yes pca") 39 | parser.add_argument("--confounds", type=none_or_str, help="confounds", default=None) 40 | parser.add_argument("--n_jobs", type=int, default=1, help="Number of parallel jobs to run") 41 | 42 | configure_logging(level='INFO') 43 | 44 | # Parse the arguments 45 | args = parser.parse_args() 46 | demographics_file = args.demographics_file 47 | features_file = args.features_file 48 | output_path = Path(args.output_path) 49 | output_prefix = args.output_prefix 50 | model_required = [x.strip() for x in args.models.split(',')] # converts string into list 51 | confounds = args.confounds 52 | pca_status = bool(args.pca_status) 53 | n_jobs = args.n_jobs 54 | output_path.mkdir(exist_ok=True, parents=True) # check and create output directory 55 | 56 | # initialize random seed and create test indices 57 | rand_seed = 200 58 | n_repeats = 5 # for inner CV 59 | n_splits = 5 # how many train and test splits (both for other and inner) 60 | 61 | print('\nDemographics file: ', demographics_file) 62 | print('Features file: ', features_file) 63 | print('Ouput path : ', output_path) 64 | print('Ouput prefix: ', output_prefix) 65 | print('Model:', model_required, type(model_required)) 66 | print('PCA status : ', pca_status) 67 | print('Random seed : ', rand_seed) 68 | print('Num of splits for kfolds : ', n_splits, '\n') 69 | print('confounds:', confounds, type(confounds)) 70 | print('Num of parallel jobs initiated: ', n_jobs, '\n') 71 | 72 | # read the features, demographics and define X and y 73 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 74 | 75 | # register 
VarianceThreshold as a transformer 76 | register_transformer('variancethreshold', VarianceThreshold, returned_features='unknown', apply_to='all_features') 77 | var_threshold = 1e-5 78 | 79 | # Initialize variables, set random seed, create classes for age 80 | scores_cv, models, results = {}, {}, {} 81 | qc = pd.cut(data_df['age'].tolist(), bins=5, precision=1) # create bins for train data only 82 | print('age_bins', qc.categories, 'age_codes', qc.codes) 83 | data_df['bins'] = qc.codes # add bin/classes as a column in train df 84 | 85 | # Define all models and model parameters 86 | rvr_linear = RVR() 87 | rvr_poly = RVR() 88 | kernel_ridge = KernelRidge() # kernelridge 89 | lasso = ElasticNet(alpha=1, standardize=False) 90 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 91 | ridge = ElasticNet(alpha=0, standardize=False) 92 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 93 | pca = PCA(n_components=None) # max as many components as sample size 94 | 95 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb'] 96 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 97 | 98 | model_para_list = [{'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 99 | 'elasticnet__n_jobs': n_jobs}, 100 | 101 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 102 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 'rf__n_jobs':n_jobs, 103 | 'rf__random_state': rand_seed}, 104 | 105 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 106 | 'rvr__random_state': rand_seed}, 107 | 108 | {'variancethreshold__threshold': var_threshold, 109 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 110 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5, 111 | 'search_params': {'n_jobs': n_jobs}}, 112 | 113 | {'variancethreshold__threshold': var_threshold, 114 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100, 115 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 116 | 117 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 118 | 'elasticnet__n_jobs': n_jobs}, 119 | 120 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 121 | 'elasticnet__n_jobs': n_jobs}, 122 | 123 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 124 | 'rvr__random_state': rand_seed}, 125 | 126 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 127 | 'xgboostadapted__max_depth': [1, 2, 3, 6, 8], 'xgboostadapted__n_estimators': 100, 128 | 'xgboostadapted__reg_alpha': [0.0001, 0.01, 0.1, 1, 10], 129 | 'xgboostadapted__reg_lambda': [0.0001, 0.01, 0.1, 1, 10, 20], 130 | 'xgboostadapted__random_seed': rand_seed, 'search_params': {'n_jobs': n_jobs}}] 131 | 132 | # Define processing for X (features) 133 | if confounds is None: 134 | if pca_status: 135 | preprocess_X = ['variancethreshold', 'zscore', pca] 136 | else: 137 | preprocess_X = ['variancethreshold', 'zscore'] 138 | else: 139 | if pca_status: 140 | preprocess_X = ['variancethreshold', 'zscore', 'remove_confound', pca] 141 | else: 142 | preprocess_X = ['variancethreshold', 'zscore', 'remove_confound'] 143 | print('Preprocessing includes:', preprocess_X) 144 | 145 | # Get the model, its parameters, pca status 
and train
146 | for ind in range(0, len(model_required)):  # run only for the required models, not all
147 | print('model required index and name:', ind, model_required[ind])
148 | i = model_names.index(model_required[ind])  # find index of the required model in model_names and use this index i to access its params
149 | assert model_required[ind] == model_names[i]  # sanity check
150 | print('model picked from the list', model_names[i], model_list[i], '\n')
151 | 
152 | # initialize dictionaries to save scores and models here so that every model is saved separately
153 | scores_cv, models = {}, {}
154 | 
155 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=rand_seed).split(data_df, data_df.bins)
156 | 
157 | scores, model = run_cross_validation(X=X, y=y, data=data_df, preprocess_X=preprocess_X, confounds=confounds,
158 | problem_type='regression', model=model_list[i], cv=cv,
159 | return_estimator='all', model_params=model_para_list[i], seed=rand_seed,
160 | scoring=
161 | ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'], n_jobs=n_jobs)  # run_cross_validation adapted to accept n_jobs
162 | 
163 | scores_cv[model_names[i]] = scores
164 | 
165 | if model_names[i] == 'kernel_ridge' or model_names[i] == 'xgb':
166 | models[model_names[i]] = model.best_estimator_
167 | print('best model', model.best_estimator_)
168 | print('best para', model.best_params_)
169 | else:
170 | models[model_names[i]] = model
171 | print('best model', model)
172 | 
173 | print('Output file name')
174 | print(output_path / f'{output_prefix}.{model_names[i]}.models')
175 | pickle.dump(models, open(output_path / f'{output_prefix}.{model_names[i]}.models', "wb"))
176 | pickle.dump(scores_cv, open(output_path / f'{output_prefix}.{model_names[i]}.scores', "wb"))
177 | 
178 | print('ALL DONE')
179 | print("--- %s seconds ---" % (time.time() - start_time))
180 | print("--- %s minutes ---" % ((time.time() - start_time)/60))
181 | print("--- %s hours ---" % ((time.time() - start_time)/3600))
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
-------------------------------------------------------------------------------- /codes/within_site_read_results.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import os.path
3 | import argparse
4 | import pandas as pd
5 | import numpy as np
6 | 
7 | # all possible inputs
8 | ## within site
9 | # data_nm = '../results/ixi/ixi_'
10 | # data_nm = '../results/enki/enki_'
11 | # data_nm = '../results/camcan/camcan_'
12 | # data_nm = '../results/1000brains/1000brains_'
13 | 
14 | if __name__ == '__main__':
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--data_nm", type=str, help="Output path and filename prefix for one dataset")
17 | 
18 | args = parser.parse_args()
19 | data_nm = args.data_nm
20 | 
21 | # Filename extensions for saving results
22 | cv_file_ext = 'cv_scores.csv'
23 | test_file_ext = 'test_scores.csv'
24 | combined_file_ext = 'cv_test_scores.csv'
25 | 
26 | # Complete results filepaths
27 | cv_filename = data_nm + cv_file_ext
28 | test_filename = data_nm + test_file_ext
29 | combined_filename = data_nm + combined_file_ext
30 | 
31 | # all model names
32 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly']  # 'xgb'
33 | model_names_new = ['RR', 'RFR', 'RVRlin', 'KRR', 'GPR', 'LR', 'ENR', 'RVRpoly']  # 'XGB'
34 | 
35 | # all feature space names
36 | data_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 37 | 
'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca']
38 | data_list_new = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4 + PCA', 'S4_R4', 'S4_R4 + PCA', 'S8_R4', 'S8_R4 + PCA',
39 | 'S0_R8', 'S0_R8 + PCA', 'S4_R8', 'S4_R8 + PCA', 'S8_R8', 'S8_R8 + PCA']
40 | 
41 | # check which scores files are missing
42 | missing_outs = []
43 | for data_item in data_list:
44 | for model_item in model_names:
45 | scores_item = data_nm + data_item + '.' + model_item + '.scores'  # create the complete path to the scores file
46 | if os.path.isfile(scores_item):
47 | print('yes')
48 | else:
49 | missing_outs.append(scores_item)
50 | print('Missing files:\n', missing_outs)
51 | 
52 | # get the saved cv scores
53 | df = pd.DataFrame()
54 | df_cv = pd.DataFrame()
55 | for data_item in data_list:
56 | for model_item in model_names:
57 | scores_item = data_nm + data_item + '.' + model_item + '.scores'  # create the complete path to the scores file
58 | if os.path.isfile(scores_item):
59 | print(scores_item)
60 | res = pickle.load(open(scores_item, 'rb'))
61 | df = pd.DataFrame()
62 | for key1, value1 in res.items():
63 | print('key1', key1)
64 | mae_all, mse_all, corr_all, corr_delta_all, key_all = list(), list(), list(), list(), list()
65 | for key, value in value1.items():
66 | mae = round(value['test_neg_mean_absolute_error'].mean() * -1, 3)
67 | mse = round(value['test_neg_mean_squared_error'].mean() * -1, 3)
68 | corr = round(value['test_r2'].mean(), 3)  # note: this is the CV R2 score, stored in the '_corr' columns
69 | mae_all.append(mae)
70 | mse_all.append(mse)
71 | corr_all.append(corr)
72 | key_all.append(key)
73 | 
74 | df['model'] = key_all
75 | df['data'] = len(mae_all) * [data_item]
76 | df[key1 + '_mae'] = mae_all
77 | df[key1 + '_mse'] = mse_all
78 | df[key1 + '_corr'] = corr_all
79 | # print(df)
80 | df_cv = pd.concat([df_cv, df], axis=0)
81 | 
82 | df_cv.reset_index(drop=True, inplace=True)
83 | 
84 | xx_mae = df_cv.loc[:, df_cv.columns.str.endswith('_mae')].values  # to take the average over repeats of mae
85 | xx_mse = df_cv.loc[:, df_cv.columns.str.endswith('_mse')].values  # to take the average over repeats of mse
86 | xx_corr = df_cv.loc[:, df_cv.columns.str.endswith('_corr')].values  # to take the average over repeats of corr
87 | 
88 | df_cv['mean_cv_mae'] = np.mean(xx_mae, axis=1).round(3)
89 | df_cv['mean_cv_mse'] = np.mean(xx_mse, axis=1).round(3)
90 | df_cv['mean_cv_corr'] = np.mean(xx_corr, axis=1).round(3)
91 | 
92 | df_cv['workflow_name'] = df_cv['data'] + ' + ' + df_cv['model']
93 | df_cv['data'] = df_cv['data'].replace(data_list, data_list_new)
94 | df_cv['model'] = df_cv['model'].replace(model_names, model_names_new)
95 | 
96 | 
97 | # get the saved test scores
98 | df = pd.DataFrame()
99 | df_test = pd.DataFrame()
100 | 
101 | for data_item in data_list:
102 | for model_item in model_names:
103 | scores_item = data_nm + data_item + '.' 
+ model_item + '.results'  # create the complete path to the results file
104 | if os.path.isfile(scores_item):
105 | print(scores_item)
106 | res = pickle.load(open(scores_item, 'rb'))
107 | df = pd.DataFrame()
108 | for key1, value1 in res.items():
109 | print('key1', key1)
110 | mae_all, mse_all, corr_all, key_all = list(), list(), list(), list()
111 | for key, value in value1.items():
112 | mae = value['mae']
113 | mse = value['mse']
114 | corr = value['corr']
115 | mae_all.append(mae)
116 | mse_all.append(mse)
117 | corr_all.append(corr)
118 | key_all.append(key)
119 | df['model'] = key_all
120 | df['data'] = len(mae_all) * [data_item]
121 | df[key1 + '_mae'] = mae_all
122 | df[key1 + '_mse'] = mse_all
123 | df[key1 + '_corr'] = corr_all
124 | # print(df)
125 | df_test = pd.concat([df_test, df], axis=0)
126 | 
127 | df_test.reset_index(drop=True, inplace=True)
128 | 
129 | xx_mae = df_test.loc[:, df_test.columns.str.endswith('_mae')].values  # to take the average over repeats of mae
130 | xx_mse = df_test.loc[:, df_test.columns.str.endswith('_mse')].values  # to take the average over repeats of mse
131 | xx_corr = df_test.loc[:, df_test.columns.str.endswith('_corr')].values  # to take the average over repeats of corr
132 | 
133 | df_test['mean_test_mae'] = np.mean(xx_mae, axis=1).round(3)
134 | df_test['mean_test_mse'] = np.mean(xx_mse, axis=1).round(3)
135 | df_test['mean_test_corr'] = np.mean(xx_corr, axis=1).round(3)
136 | 
137 | df_test['workflow_name'] = df_test['data'] + ' + ' + df_test['model']
138 | df_test['data'] = df_test['data'].replace(data_list, data_list_new)
139 | df_test['model'] = df_test['model'].replace(model_names, model_names_new)
140 | 
141 | df_combined1 = df_cv[['model', 'data', 'mean_cv_mae', 'mean_cv_mse', 'mean_cv_corr', 'workflow_name']].copy()
142 | df_combined2 = df_test[['model', 'data', 'mean_test_mae', 'mean_test_mse', 'mean_test_corr', 'workflow_name']].copy()
143 | df_combined = pd.merge(df_combined1, df_combined2, how='left', on=['model', 'data', 'workflow_name'])
144 | df_combined['workflow_name_updated'] = df_combined['data'] + ' + ' + df_combined['model']
145 | df_combined.reset_index(drop=True, inplace=True)
146 | 
147 | # save the csv files
148 | print('\n cv results file:', cv_filename)
149 | print(df_cv)
150 | print('\n test results file:', test_filename)
151 | print(df_test)
152 | print('\n combined results file:', combined_filename)
153 | print(df_combined)
154 | 
155 | df_cv.to_csv(cv_filename, index=False)
156 | df_test.to_csv(test_filename, index=False)
157 | df_combined.to_csv(combined_filename, index=False)
158 | 
159 | 
160 | # check model parameters
161 | print('\n Model Parameters')
162 | error_models = list()
163 | for data_item in data_list:
164 | for model_item in model_names:
165 | model_item = data_nm + data_item + '.' 
+ model_item + '.models' # get models 166 | 167 | if os.path.isfile(model_item): 168 | print('\n', 'model filename', model_item) 169 | 170 | res = pickle.load(open(model_item, 'rb')) 171 | # print(res) 172 | 173 | for key1, value1 in res.items(): 174 | for key2, value2 in value1.items(): 175 | print(key1, key2) 176 | 177 | if key2 == 'linreg': 178 | print(res[key1]['linreg']['linreg'].intercept_, res[key1]['linreg']['linreg'].coef_) 179 | 180 | elif key2 == 'gauss': 181 | model = res[key1]['gauss']['gauss'] 182 | # print(model.get_params()) 183 | print(model.kernel_.get_params()) 184 | 185 | elif key2 == 'kernel_ridge': 186 | model = res[key1]['kernel_ridge']['kernelridge'] 187 | print(model) 188 | 189 | elif key2 == 'rvr_lin': 190 | model = res[key1]['rvr_lin']['rvr'] 191 | print(model) 192 | 193 | elif key2 == 'rvr_poly': 194 | model = res[key1]['rvr_poly']['rvr'] 195 | print(model) 196 | 197 | elif key2 == 'rf': 198 | model = res[key1]['rf']['rf'] 199 | print(model) 200 | 201 | elif key2 == 'xgb': 202 | model = res[key1]['xgb']['xgboostadapted'] 203 | print(model) 204 | 205 | else: # for lasso, ridge, elasticnet 206 | model = res[key1][key2]['elasticnet'] 207 | print(model.lambda_best_) 208 | 209 | else: 210 | error_models.append(model_item) 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /codes/within_site_train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pickle 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from pathlib import Path 8 | 9 | from brainage import stratified_splits, read_data, XGBoostAdapted, performance_metric 10 | 11 | import xgboost as xgb 12 | from skrvm import RVR 13 | from glmnet import ElasticNet 14 | import sklearn.gaussian_process as gp 15 | from sklearn.kernel_ridge import KernelRidge 16 | from sklearn.decomposition import PCA 17 | from sklearn.feature_selection import VarianceThreshold 18 | from sklearn.model_selection import RepeatedStratifiedKFold 19 | 20 | from julearn import run_cross_validation 21 | from julearn.utils import configure_logging 22 | from julearn.transformers import register_transformer 23 | 24 | start_time = time.time() 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 29 | parser.add_argument("--features_file", type=str, help="Features file path") 30 | parser.add_argument("--output_path", type=str, help="Path to output directory") 31 | parser.add_argument("--output_prefix", type=str, help="Output prefix (used {dataname}.{featurename}") 32 | parser.add_argument("--models", type=str, nargs='?', const=1, default="ridge", 33 | help="models to use (comma seperated no space): ridge,rf,rvr_linear") 34 | parser.add_argument("--pca_status", type=int, default=0, 35 | help="0: no pca, 1: yes pca") 36 | 37 | configure_logging(level='INFO') 38 | 39 | # Parse the arguments 40 | args = parser.parse_args() 41 | demographics_file = args.demographics_file 42 | features_file = args.features_file 43 | output_path = Path(args.output_path) 44 | output_prefix = args.output_prefix 45 | model_required = [x.strip() for x in args.models.split(',')] # converts string into list 46 | pca_status = bool(args.pca_status) 47 | output_path.mkdir(exist_ok=True, parents=True) # check and create output directory 48 | 49 | # initialize random seed and create test indices 50 | rand_seed = 200 51 | 
n_repeats = 5 # for inner CV 52 | num_splits = 5 # how many train and test splits (both for other and inner) 53 | 54 | print('\nDemographics file: ', demographics_file) 55 | print('Features file: ', features_file) 56 | print('Ouput path : ', output_path) 57 | print('Ouput prefix: ', output_prefix) 58 | print('Model : ', model_required) 59 | print('PCA status : ', pca_status) 60 | print('Random seed : ', rand_seed) 61 | print('Num of splits for kfolds : ', num_splits, '\n') 62 | 63 | # read the features, demographics and define X and y 64 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 65 | 66 | # register VarianceThreshold as a transformer 67 | register_transformer('variancethreshold', VarianceThreshold, returned_features='unknown', apply_to='all_features') 68 | var_threshold = 1e-5 69 | 70 | # Create stratified splits for outer CV 71 | num_bins = math.floor(len(data_df)/num_splits) # num of bins to be created = num of labels created 72 | test_indices = stratified_splits(bins_on=data_df.index, num_bins=num_bins, data=data_df, num_splits=num_splits, 73 | shuffle=False, random_state=None) # creates dictionary of test indices 74 | 75 | # Define all models and model parameters 76 | rvr_linear = RVR() 77 | rvr_poly = RVR() 78 | kernel_ridge = KernelRidge() 79 | lasso = ElasticNet(alpha=1, standardize=False) 80 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 81 | ridge = ElasticNet(alpha=0, standardize=False) 82 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 83 | pca = PCA(n_components=None) # max as many components as sample size 84 | 85 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb'] 86 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 87 | model_para_list = [{'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 88 | 89 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 90 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 91 | 'rf__random_state': rand_seed}, 92 | 93 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 94 | 'rvr__random_state': rand_seed}, 95 | 96 | {'variancethreshold__threshold': var_threshold, 97 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 98 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5}, 99 | 100 | {'variancethreshold__threshold': var_threshold, 101 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100, 102 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 103 | 104 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 105 | 106 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 107 | 108 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 109 | 'rvr__random_state': rand_seed}, 110 | 111 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 112 | 'xgboostadapted__max_depth': [6, 8, 10, 12], 'xgboostadapted__n_estimators': 100, 113 | 'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2], 114 | 'xgboostadapted__random_seed': rand_seed, 'cv': 5}] # 'search_params':{'n_jobs': 5} 115 | 116 | # Define processing for X (features) 117 | if pca_status: 118 | preprocess_X = ['variancethreshold', 'zscore', pca] 119 
| else: 120 | preprocess_X = ['variancethreshold', 'zscore'] 121 | print('Preprocessing includes:', preprocess_X) 122 | 123 | # Get the model, its parameters, pca status and train 124 | for ind in range(0, len(model_required)): 125 | print('model required:', model_required[ind]) 126 | i = model_names.index(model_required[ind]) 127 | assert model_required[ind] == model_names[i] # sanity check 128 | print('model picked from the list', model_names[i], model_list[i], '\n') 129 | 130 | # initialize dictionaries to save scores, models and results here to save every model separately 131 | scores_cv = {k: {} for k in test_indices.keys()} 132 | models = {k: {} for k in test_indices.keys()} 133 | results = {k: {} for k in test_indices.keys()} 134 | 135 | for repeat_key in test_indices.keys(): 136 | all_idx = np.array(range(0, len(data_df))) 137 | print('\n \n--Repeat', repeat_key) 138 | test_idx = test_indices[repeat_key] # get test indices 139 | train_idx = np.delete(all_idx, test_idx) # get train indices 140 | train_df, test_df = data_df.loc[train_idx,:], data_df.loc[test_idx,:] # get test and train dataframes 141 | train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True) 142 | print('train size:', train_df.shape, 'test size:', test_df.shape) 143 | qc = pd.cut(train_df[y].tolist(), bins=5) # create bins for only train set using age, use this for stratification 144 | # print('age_bins', qc.categories, 'age_codes', qc.codes) 145 | 146 | cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=n_repeats, random_state=rand_seed).split(train_df, qc.codes) 147 | 148 | scores, model = run_cross_validation(X=X, y=y, data=train_df, preprocess_X=preprocess_X, 149 | problem_type='regression', model=model_list[i], cv=cv, 150 | return_estimator='final', model_params=model_para_list[i], seed=rand_seed, 151 | scoring= 152 | ['neg_mean_absolute_error', 'neg_mean_squared_error','r2']) 153 | 154 | scores_cv[repeat_key][model_names[i]] = scores 155 | 156 | if model_names[i] == 'kernel_ridge' or model_names[i] == 'xgb': 157 | models[repeat_key][model_names[i]] = model.best_estimator_ 158 | print('best model', model.best_estimator_) 159 | print('best para', model.best_params_) 160 | else: 161 | models[repeat_key][model_names[i]] = model 162 | print('best model', model) 163 | 164 | # Predict on test split 165 | y_true = test_df[y] 166 | y_pred = model.predict(test_df[X]).ravel() 167 | y_delta = y_true - y_pred 168 | print(y_true.shape, y_pred.shape) 169 | 170 | mae, mse, corr = performance_metric(y_true, y_pred) 171 | print('MAE:', mae, 'MSE:', mse, 'CoRR', corr) 172 | results[repeat_key][model_names[i]] = {'predictions': y_pred, 'true': y_true, 'test_idx': test_idx, 173 | 'delta': y_delta, 'mae': mae, 'mse': mse, 'corr': corr} 174 | 175 | print('Output file name') 176 | print(output_path / f'{output_prefix}.{model_names[i]}.models') 177 | pickle.dump(results, open(output_path / f'{output_prefix}.{model_names[i]}.results', "wb")) 178 | pickle.dump(scores_cv, open(output_path / f'{output_prefix}.{model_names[i]}.scores', "wb")) 179 | pickle.dump(models, open(output_path / f'{output_prefix}.{model_names[i]}.models', "wb")) 180 | 181 | print('ALL DONE') 182 | print("--- %s seconds ---" % (time.time() - start_time)) 183 | print("--- %s minutes ---" % ((time.time() - start_time)/60)) 184 | print("--- %s hours ---" % ((time.time() - start_time)/3600)) 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 
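For orientation, here is a minimal, hypothetical sketch of how the pickles written by within_site_train.py above could be inspected after a run. The prefix used below is a placeholder; real files follow the {output_prefix}.{model}.results / .scores / .models pattern from the script, and the repeat keys are whatever stratified_splits produced.

# Hypothetical post-hoc inspection of within_site_train.py outputs (paths are placeholders)
import pickle

prefix = '../results/ixi/ixi_S4_R4'  # placeholder for the output_path / output_prefix of a run

with open(f'{prefix}.ridge.results', 'rb') as f:
    results = pickle.load(f)  # dict: repeat key -> model name -> predictions and metrics

for repeat_key, per_model in results.items():
    for model_name, res in per_model.items():
        # 'mae', 'mse' and 'corr' come from performance_metric() on the held-out split
        print(repeat_key, model_name, 'MAE:', res['mae'], 'MSE:', res['mse'], 'corr:', res['corr'])

with open(f'{prefix}.ridge.models', 'rb') as f:
    models = pickle.load(f)  # dict: repeat key -> model name -> fitted julearn pipeline
# a stored pipeline can predict on new rows that have the same feature columns, e.g.
# models[some_repeat_key]['ridge'].predict(new_df[X])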
-------------------------------------------------------------------------------- /codes/cat_standalone_batch-HiFi1mm.m: -------------------------------------------------------------------------------- 1 | % Batch file for CAT12 segmentation, mean GM ROI extraction to csv, 2 | % TIV & tissue volume extraction for SPM12 standalone installation 3 | % 4 | % ** eats GZIP for real now ** 5 | % 6 | % ----> CREATE INDIVIDUAL OUTPUT FOLDER BEFORE <---- 7 | % 8 | % input: 9 | % <*T1w.nii.gz> 10 | %_______________________________________________________________________ 11 | % $Id: cat_standalone_batch.m r1871 12 | %----------------------------------------------------------------------- 13 | 14 | % Used CAT12.8 r1871 15 | 16 | % INPUT FILE 17 | matlabbatch{1}.spm.tools.cat.estwrite.data(1) = ''; 18 | matlabbatch{1}.spm.tools.cat.estwrite.data_wmh = {''}; 19 | matlabbatch{1}.spm.tools.cat.estwrite.nproc = 0; 20 | matlabbatch{1}.spm.tools.cat.estwrite.useprior = ''; 21 | % Remove comments if you would like to change TPM by using additional arguments in cat_standalone.sh 22 | % or change this field manually by editing the "'}; 26 | 27 | % Affine regularisation (SPM12 default = mni) - '';'mni';'eastern';'subj';'none';'rigid' 28 | matlabbatch{1}.spm.tools.cat.estwrite.opts.affreg = 'mni'; 29 | 30 | % Strength of the bias correction that controls the biasreg and biasfwhm parameter (CAT only!) 31 | % 0 - use SPM parameter; eps - ultralight, 0.25 - light, 0.5 - medium, 0.75 - strong, and 1 - heavy corrections 32 | % job.opts.biasreg = min( 10 , max( 0 , 10^-(job.opts.biasstr*2 + 2) )); 33 | % job.opts.biasfwhm = min( inf , max( 30 , 30 + 60*job.opts.biasstr )); 34 | matlabbatch{1}.spm.tools.cat.estwrite.opts.biasstr = 0.5; 35 | %Overview of parameters: accstr: 0.50 0.75 1.00 samp: 3.00 2.00 1.00 (in mm) tol: 1e-4 1e-8 1e-16SPM default is samp 36 | matlabbatch{1}.spm.tools.cat.estwrite.opts.accstr = 0.8; 37 | % Use center-of-mass to roughly correct for differences in the position between image and template. This will internally correct the origin. 
38 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.setCOM = 1; 39 | % Affine PreProcessing (APP) with rough bias correction and brain extraction for special anatomies (nonhuman/neonates) 40 | % 0 - none; 1070 - default; [1 - light; 2 - full; 1144 - update of 1070, 5 - animal (no affreg)] 41 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.APP = 1070; 42 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.affmod = 0; 43 | % Strength of the noise correction: 0 to 1; 0 - no filter, -Inf - auto, 1 - full, 2 - ISARNLM (else SANLM), default -Inf 44 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.NCstr = -Inf; 45 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.spm_kamap = 0; 46 | % Strength of the local adaption: 0 to 1; default 0.5 47 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.LASstr = 0.5; 48 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.LASmyostr = 0; 49 | % Strength of skull-stripping: 0 - SPM approach; eps to 1 - gcut; 2 - new APRG approach; -1 - no skull-stripping (already skull-stripped); default = 2 50 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.gcutstr = 2; 51 | % Strength of the cleanup process: 0 to 1; default 0.5 52 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.cleanupstr = 0.5; 53 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.BVCstr = 0.5; 54 | % Correction of WM hyperintensities: 0 - no correction, 1 - only for Dartel/Shooting 55 | % 2 - also correct segmentation (to WM), 3 - handle as separate class; default 1 56 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.WMHC = 2; 57 | % Stroke lesion correction (SLC): 0 - no correction, 1 - handling of manual lesion that have to be set to zero! 58 | % 2 - automatic lesion detection (in development) 59 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.SLC = 0; 60 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.mrf = 1; 61 | % % resolution handling: 'native','fixed','best', 'optimal' 62 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.restypes.optimal = [1 0.1]; 63 | % Remove comments and edit entry if you would like to change the Dartel/Shooting approach 64 | % Otherwise the default value from cat_defaults.m is used. 
65 | % entry for choosing shooting approach 66 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.shootingtpm = {fullfile('/', 'templates_1mm', 'Template_0_GS1mm.nii')}; 67 | %matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.shootingtpm = {fullfile(spm('dir'),'toolbox','cat12','templates_MNI152NLin2009cAsym','Template_1_GS.nii')}; 68 | % entry for choosing dartel approach 69 | %matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.dartel.darteltpm = {fullfile(spm('dir'),'toolbox','cat12','templates_MNI152NLin2009cAsym','Template_1_Dartel.nii')}; 70 | 71 | % Strength of Shooting registration: 0 - Dartel, eps (fast), 0.5 (default) to 1 (accurate) optimized Shooting, 4 - default Shooting; default 0.5 72 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.regstr = 1; 73 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.vox = 1; 74 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.bb = 45; 75 | 76 | % surface and thickness creation: 0 - no (default), 1 - lh+rh, 2 - lh+rh+cerebellum, 77 | % 3 - lh, 4 - rh, 5 - lh+rh (fast, no registration, only for quick quality check and not for analysis), 78 | % 6 - lh+rh+cerebellum (fast, no registration, only for quick quality check and not for analysis) 79 | % 9 - thickness only (for ROI analysis, experimental!) 80 | % +10 to estimate WM and CSF width/depth/thickness (experimental!) 81 | matlabbatch{1}.spm.tools.cat.estwrite.output.surface = 1; 82 | matlabbatch{1}.spm.tools.cat.estwrite.output.surf_measures = 1; 83 | % surface options 84 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.pbtres = 0.5; 85 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.pbtmethod = 'pbt2x'; 86 | % surface recontruction pipeline & self-intersection correction: 0/1 - CS1 without/with/with-optimized SIC; 20/21/22 - CS2 without/with/with-optimized SIC; 87 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.SRP = 22; 88 | % optimize surface sampling: 0 - PBT res. (slow); 1 - optimal res. (default); 2 - internal res.; 3 - SPM init; 4 - MATLAB init; 5 - SPM full; 89 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.reduce_mesh = 1; 90 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.vdist = 2; 91 | % % reduce myelination effects (experimental, not yet working properly!) 
92 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.pbtlas = 0; 93 | % % distance method for estimating thickness: 1 - Tfs: Freesurfer method using mean(Tnear1,Tnear2) (default in 12.7+); 0 - Tlink: linked distance (used before 12.7) 94 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.thick_measure = 1; 95 | % % upper limit for Tfs thickness measure similar to Freesurfer (only valid if cat.extopts.thick_measure is set to "1" 96 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.thick_limit = 5; 97 | 98 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.scale_cortex = 0.7; 99 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.add_parahipp = 0.1; 100 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.close_parahipp = 1; 101 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.experimental = 0; 102 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.new_release = 0; 103 | % set this to 1 for skipping preprocessing if already processed data exist 104 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.lazy = 0; 105 | % catch errors: 0 - stop with error (default); 1 - catch preprocessing errors (requires MATLAB 2008 or higher); 106 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.ignoreErrors = 1; 107 | % verbose output: 1 - default; 2 - details; 3 - write debugging files 108 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.verb = 2; 109 | % display and print out pdf-file of results: 0 - off, 1 - volume only, 2 - volume and surface (default) 110 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.print = 2; 111 | matlabbatch{1}.spm.tools.cat.estwrite.output.BIDS.BIDSno = 1; 112 | 113 | % define here volume atlases 114 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.neuromorphometrics = 1; 115 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.lpba40 = 1; 116 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.cobra = 1; 117 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.hammers = 1; 118 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.ibsr = 1; 119 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.aal3 = 1; 120 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.mori = 1; 121 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.anatomy3 = 1; 122 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.julichbrain = 1; 123 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_100Parcels_17Networks_order = 1; 124 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_200Parcels_17Networks_order = 1; 125 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_400Parcels_17Networks_order = 1; 126 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_600Parcels_17Networks_order = 1; 127 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.ownatlas = {''}; 128 | 129 | % % { name fileid GUIlevel use } - in development 130 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.satlas = { ... 131 | % 'Desikan' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_a2009s.freesurfer.annot') 1 1; 132 | % 'Destrieux' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_DK40.freesurfer.annot') 1 1; 133 | % 'HCP' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_HCP_MMP1.freesurfer.annot') 1 1; 134 | % ... Schaefer atlases ... 
135 | % 'Schaefer2018_100P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_100Parcels_17Networks_order.annot') 1 1; 136 | % 'Schaefer2018_200P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_200Parcels_17Networks_order.annot') 1 1; 137 | % 'Schaefer2018_400P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_400Parcels_17Networks_order.annot') 1 1; 138 | % 'Schaefer2018_600P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_600Parcels_17Networks_order.annot') 1 1; 139 | % }; 140 | 141 | 142 | % Writing options (see cat_defaults for the description of parameters) 143 | % native 0/1 (none/yes) 144 | % warped 0/1 (none/yes) 145 | % mod 0/1/2/3 (none/affine+nonlinear/nonlinear only/both) 146 | % dartel 0/1/2/3 (none/rigid/affine/both) 147 | 148 | % GM/WM/CSF/WMH 149 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.native = 0; 150 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.warped = 0; 151 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.mod = 3; 152 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.dartel = 0; 153 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.native = 0; 154 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.warped = 0; 155 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.mod = 0; 156 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.dartel = 0; 157 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.native = 0; 158 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.warped = 0; 159 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.mod = 0; 160 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.dartel = 0; 161 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.native = 0; 162 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.warped = 0; 163 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.dartel = 0; 164 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.native = 0; 165 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.warped = 0; 166 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.dartel = 0; 167 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.native = 0; 168 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.warped = 0; 169 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.mod = 0; 170 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.dartel = 0; 171 | 172 | % stroke lesion tissue maps (only for opt.extopts.SLC>0) - in development 173 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.native = 0; 174 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.warped = 0; 175 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.mod = 0; 176 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.dartel = 0; 177 | 178 | % Tissue classes 4-6 to create own TPMs 179 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.native = 0; 180 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.warped = 0; 181 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.mod = 0; 182 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.dartel = 0; 183 | 184 | % atlas maps (for evaluation) 185 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.native = 1; 186 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.warped = 0; 187 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.dartel = 0; 188 | 189 | % label 190 | % background=0, CSF=1, GM=2, WM=3, WMH=4 (if opt.extopts.WMHC==3), SL=1.5 (if opt.extopts.SLC>0)matlabbatch{1}.spm.tools.cat.estwrite.output.label.native = 0; 191 | matlabbatch{1}.spm.tools.cat.estwrite.output.label.native = 1; 192 | 
matlabbatch{1}.spm.tools.cat.estwrite.output.label.warped = 1; 193 | matlabbatch{1}.spm.tools.cat.estwrite.output.label.dartel = 0; 194 | 195 | % bias and noise corrected, global intensity normalized 196 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.native = 0; 197 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.warped = 0; 198 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.dartel = 0; 199 | 200 | % bias and noise corrected, (locally - if LAS>0) intensity normalized 201 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.native = 0; 202 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.warped = 0; 203 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.dartel = 0; 204 | 205 | % jacobian determinant 0/1 (none/yes) 206 | matlabbatch{1}.spm.tools.cat.estwrite.output.jacobianwarped = 1; 207 | 208 | % deformations, order is [forward inverse] 209 | matlabbatch{1}.spm.tools.cat.estwrite.output.warps = [1 1]; 210 | 211 | % deformation matrices (affine and rigid) 212 | matlabbatch{1}.spm.tools.cat.estwrite.output.rmat = 1; 213 | --------------------------------------------------------------------------------
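The CAT12 batch above writes, among other outputs, modulated warped grey-matter segments, which are presumably the inputs listed in the *.paths_cat12.8.csv files consumed by the calculate_features_* scripts earlier in the repository. As a rough, hypothetical illustration only (this is not the repository's calculate_features_voxelwise.py; the function, defaults and file names below are assumptions), one such GM image could be smoothed, resampled and masked into a voxelwise feature vector with nilearn roughly like this:

# Hypothetical sketch: one CAT12 GM segment -> voxelwise feature vector
# (mirrors the --smooth_fwhm / --resample_size / --mask_file options used in the submit files)
import numpy as np
from nilearn import image, masking

def gm_to_features(gm_file, mask_file, smooth_fwhm=4, resample_size=8):
    img = image.smooth_img(gm_file, fwhm=smooth_fwhm)  # Gaussian smoothing in mm
    img = image.resample_img(img, target_affine=np.eye(3) * resample_size)  # resample to isotropic voxels
    mask = image.resample_to_img(mask_file, img, interpolation='nearest')  # bring the mask onto the same grid
    mask = image.math_img('img > 0.5', img=mask)  # re-binarize the mask after resampling
    return masking.apply_mask(img, mask)  # 1D array of GM values inside the mask

# Example call (placeholder filenames):
# features = gm_to_features('mwp1sub-01_T1w.nii', '../masks/brainmask_12.8.nii',
#                           smooth_fwhm=8, resample_size=8)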