├── data
│   └── .gitkeep
├── logs
│   └── .gitkeep
├── masks
│   └── .gitkeep
├── results
│   └── .gitkeep
├── trained_models
│   └── .gitkeep
├── pyproject.toml
├── entrypoint.sh
├── .flake8
├── brainage
│   ├── performance_metric.py
│   ├── __init__.py
│   ├── zscore.py
│   ├── xgboost_adapted.py
│   ├── read_data.py
│   ├── define_models.py
│   ├── create_splits.py
│   └── calculate_features.py
├── tests
│   └── test_read_data_mask_resampled.py
├── codes
│   ├── run_in_venv.sh
│   ├── cross_site_ixi_camcan_enki.submit
│   ├── cross_site_4sites.submit
│   ├── calculate_features_parcelwise.py
│   ├── calculate_features_voxelwise.py
│   ├── cross_site_combine_features.py
│   ├── calculate_features.submit
│   ├── within_site_bias_correction.py
│   ├── cross_site_bias_correction_using_CN.py
│   ├── within_site_combine_predictions.py
│   ├── predict_age.py
│   ├── cross_site_bias_correction.py
│   ├── predict_age_sing.py
│   ├── cross_site_combine_predictions.py
│   ├── cross_site_read_results.py
│   ├── within_site_ixi.submit
│   ├── calculate_features2.submit
│   ├── cross_site_train.py
│   ├── within_site_read_results.py
│   ├── within_site_train.py
│   └── cat_standalone_batch-HiFi1mm.m
├── requirements.txt
├── setup.py
├── BA_predict.recipe
├── .gitignore
└── README.md
/data/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/logs/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/masks/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/trained_models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.pytest.ini_options]
6 | addopts = "--cov=brainage"
7 | testpaths = [
8 | "tests",
9 | ]
--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd codes
3 | python3 predict_age_sing.py --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4 --output_prefix $5 --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm $6 --resample_size $7 --model_file $8
4 | 
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = __init__.py,*externals*,constants.py,fixes.py,resources.py,nilearn_cache,venv,docs/auto_examples,docs/_build/,.eggs/
3 | ignore = W503,W504,I100,I101,I201,N806,E201,E202,E221,E222,E241,F541,E999,E402
4 | # We add A for the array-spacing plugin, and ignore the E ones it covers above
5 | select = A,E,F,W,C
--------------------------------------------------------------------------------
/brainage/performance_metric.py:
--------------------------------------------------------------------------------
1 | from
sklearn.metrics import mean_absolute_error, mean_squared_error 2 | import numpy as np 3 | 4 | def performance_metric(y_true, y_pred): 5 | mae = round(mean_absolute_error(y_true, y_pred), 3) 6 | mse = round(mean_squared_error(y_true, y_pred), 3) 7 | corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3) 8 | return mae, mse, corr 9 | -------------------------------------------------------------------------------- /brainage/__init__.py: -------------------------------------------------------------------------------- 1 | from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features 2 | from .create_splits import stratified_splits 3 | from .xgboost_adapted import XGBoostAdapted 4 | from .zscore import ZScoreSubwise, ZScore 5 | from .create_splits import repeated_stratified_splits 6 | from .read_data import read_data_cross_site 7 | from .read_data import read_data 8 | from .define_models import define_models 9 | from sklearn.linear_model import LinearRegression 10 | from .performance_metric import performance_metric 11 | 12 | -------------------------------------------------------------------------------- /tests/test_read_data_mask_resampled.py: -------------------------------------------------------------------------------- 1 | from brainage import binarize_3d 2 | from nibabel import Nifti1Image 3 | import numpy as np 4 | 5 | def _make_image(): 6 | return Nifti1Image( 7 | np.random.default_rng(seed=5).integers(low=0, high=5, size=(5, 5, 2)), 8 | np.eye(4), 9 | ) 10 | 11 | def test_binarize_3d(): 12 | img = _make_image() 13 | bin_img = binarize_3d(img, threshold=2) 14 | 15 | assert np.min(bin_img.get_fdata()) == 0 16 | assert np.max(bin_img.get_fdata()) == 1 17 | np.testing.assert_array_equal(np.unique(bin_img.get_fdata()),np.array([0, 1])) 18 | -------------------------------------------------------------------------------- /codes/run_in_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source ~/.venvs/py3smore/bin/activate 3 | #OMP_NUM_THREADS=5 python3 $@ 4 | #deactivate 5 | 6 | if [ $# -lt 2 ]; then 7 | echo "This script is meant to run a command within a python environment" 8 | echo "It needs at least 2 parameters." 9 | echo "The first one must be the environment name." 
10 | echo "The rest will be the command" 11 | exit 1 12 | fi 13 | 14 | env_name=$1 15 | echo "Activating ${env_name}" 16 | source ~/.venvs/${env_name}/bin/activate 17 | shift 1 18 | echo "Running ${@} in virtual environment" 19 | 20 | export MKL_NUM_THREADS=1 21 | export OPENBLAS_NUM_THREADS=1 22 | export NUMEXPR_NUM_THREADS=1 23 | export OMP_NUM_THREADS=1 24 | 25 | $@ 26 | 27 | 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.0.5 2 | backcall==0.2.0 3 | certifi==2021.10.8 4 | charset-normalizer==2.0.12 5 | commonmark==0.9.1 6 | convertdate==2.4.0 7 | cycler==0.11.0 8 | decorator==5.1.1 9 | executing==0.8.3 10 | fonttools==4.31.2 11 | numpy==1.22.3 12 | idna==3.3 13 | ipython==8.2.0 14 | jedi==0.18.1 15 | joblib==1.1.0 16 | julearn==0.2.5 17 | kiwisolver==1.4.2 18 | lunardate==0.2.0 19 | lxml==4.8.0 20 | matplotlib==3.5.1 21 | matplotlib-inline==0.1.3 22 | natsort==8.1.0 23 | nibabel==3.2.2 24 | nilearn==0.9.1 25 | packaging==21.3 26 | pandas==1.4.2 27 | parso==0.8.3 28 | pexpect==4.8.0 29 | pickleshare==0.7.5 30 | Pillow==9.1.0 31 | prompt-toolkit==3.0.29 32 | ptyprocess==0.7.0 33 | pure-eval==0.2.2 34 | Pygments==2.11.2 35 | pyluach==1.4.1 36 | PyMeeus==0.5.11 37 | pyparsing==3.0.7 38 | python-dateutil==2.8.2 39 | pytz==2022.1 40 | requests==2.27.1 41 | rich==12.2.0 42 | scikit-learn==1.0.2 43 | scipy==1.8.0 44 | seaborn==0.11.2 45 | six==1.16.0 46 | stack-data==0.2.0 47 | threadpoolctl==3.1.0 48 | traitlets==5.1.1 49 | urllib3==1.26.9 50 | wcwidth==0.2.5 51 | workalendar==16.3.0 52 | xgboost==1.6.0 53 | 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # requirements = [] 4 | # with open("requirements.txt", "r") as f: 5 | # for line in f: 6 | # requirements.append(line) 7 | 8 | setup( 9 | name="brainage", 10 | version="0.1.0", 11 | description="Brainage prediction project", 12 | url="https://github.com/juaml/brainage_estimation", 13 | author="Applied Machine Learning FZJ", 14 | packages=find_packages(), 15 | # install_requires=requirements, 16 | classifiers=[ 17 | "Development Status :: 1 - Planning", 18 | "Intended Audience :: Science/Research", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: POSIX :: Linux", 21 | "Programming Language :: Python :: 2", 22 | "Programming Language :: Python :: 2.7", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.4", 25 | "Programming Language :: Python :: 3.5", 26 | ], 27 | python_requires=">=3.6", 28 | include_package_data=True, 29 | package_data={"": ["data/*"]}, 30 | ) 31 | 32 | 33 | -------------------------------------------------------------------------------- /brainage/zscore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | from sklearn.utils import check_array 4 | from scipy.stats import zscore 5 | 6 | 7 | class ZScore(BaseEstimator, TransformerMixin): 8 | 9 | def __init__(self, axis=0): 10 | self.axis = axis 11 | 12 | def fit(self, X, y=None): 13 | X = check_array(X) 14 | self.mean_ = np.mean(X, axis=self.axis) 15 | self.std_ = np.std(X, axis=self.axis) 16 | return self 17 | 18 | def transform(self, X): 19 | X = check_array(X) 20 
| mean = ( 21 | self.mean_.reshape(-1, 1) 22 | if self.axis 23 | else self.mean_ 24 | ) 25 | 26 | std = ( 27 | self.std_.reshape(-1, 1) 28 | if self.axis 29 | else self.std_ 30 | ) 31 | # print(f"{X.shape = }") 32 | # print(f"{mean.shape = }") 33 | # print(f"{std.shape = }") 34 | 35 | return (X - mean) / std 36 | 37 | 38 | class ZScoreSubwise(BaseEstimator, TransformerMixin): 39 | 40 | def __init__(self, axis=0): 41 | self.axis = axis 42 | 43 | def fit(self, X, y=None): 44 | return self 45 | 46 | def transform(self, X): 47 | X = check_array(X) 48 | return zscore(X, axis=self.axis) 49 | 50 | -------------------------------------------------------------------------------- /BA_predict.recipe: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: continuumio/miniconda3:latest 3 | 4 | %files 5 | # copy brainage module and related files 6 | brainage/ /opt/src/brainage/ 7 | setup.py /opt/src/ 8 | pyproject.toml /opt/src/ 9 | requirements.txt /opt/src/ 10 | entrypoint.sh /opt/scripts/ 11 | 12 | %post 13 | 14 | export PATH=/opt/conda/bin:$PATH 15 | 16 | chmod 777 /tmp/ 17 | chmod 777 /opt/scripts/entrypoint.sh 18 | 19 | apt-get update --allow-releaseinfo-change 20 | apt-get install -y --fix-missing libgomp1 wget dpkg 21 | 22 | 23 | NOW=`date` 24 | 25 | # Initialize conda 26 | conda --version 27 | 28 | conda create --name BA_env -c conda-forge python=3.9.1 numpy==1.22.3 matplotlib==3.5.1 nibabel==3.2.2 nilearn==0.9.1 pandas==1.4.2 scipy==1.8.0 seaborn==0.11.2 xgboost==1.6.1 scikit-learn==1.0.2 glmnet 29 | . /opt/conda/etc/profile.d/conda.sh 30 | conda activate BA_env 31 | pip install "julearn==0.2.5" 32 | pip install git+https://github.com/JamesRitchie/scikit-rvm.git@master 33 | cd /opt/src && pip install -e . 34 | 35 | 36 | %runscript 37 | echo "Container was created $NOW" 38 | echo "Arguments received: $*" 39 | 40 | # Activate environment 41 | conda init 42 | . /opt/conda/etc/profile.d/conda.sh 43 | conda activate BA_env 44 | 45 | # Running entrypoint.sh 46 | /opt/scripts/entrypoint.sh "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" 47 | echo "Computation finished!" 
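# Illustrative usage (assumed image name, placeholder paths): entrypoint.sh forwards the
# eight positional arguments to predict_age_sing.py as
#   --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4
#   --output_prefix $5 --smooth_fwhm $6 --resample_size $7 --model_file $8
# so a hypothetical run of the built container could look like:
#   singularity run BA_predict.sif ../data/ADNI /data ../data/ADNI/ADNI_paths_cat12.8.csv \
#       ../results/ADNI ADNI 4 4 ../trained_models/4sites.S4_R4_pca.gauss.models
# (the .sif name and all paths are placeholders; bind-mount or adjust them for your setup)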
48 | -------------------------------------------------------------------------------- /brainage/xgboost_adapted.py: -------------------------------------------------------------------------------- 1 | from xgboost import XGBRegressor 2 | from sklearn.base import BaseEstimator 3 | from sklearn.model_selection import train_test_split 4 | import numpy as np 5 | 6 | class XGBoostAdapted(BaseEstimator): 7 | 8 | def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0): 9 | self.early_stopping_rounds = early_stopping_rounds 10 | self.eval_metric = eval_metric 11 | self.eval_set_percent = eval_set_percent 12 | self.random_seed = random_seed 13 | self.n_jobs = n_jobs 14 | self.max_depth = max_depth 15 | self.n_estimators = n_estimators 16 | self.nthread = nthread 17 | self.reg_alpha = reg_alpha 18 | 19 | 20 | def fit(self, X, y): 21 | self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha) 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed) 24 | 25 | eval_set = [(X_test, y_test)] 26 | 27 | self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set) 28 | 29 | return self 30 | 31 | def score(self, X, y, sample_weight=None): 32 | return self._xgbregressor.score(X.values, y.values, sample_weight) 33 | 34 | def predict(self, X): 35 | return self._xgbregressor.predict(X.values) 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /codes/cross_site_ixi_camcan_enki.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 10 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
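# Note on the executable: run_in_venv.sh (codes/run_in_venv.sh) activates the Python
# virtual environment named by its first argument and runs the remaining arguments as a
# command with the thread counts pinned to 1; the "arguments" line below therefore starts
# with the venv name (test_package_env) followed by the actual python3 call.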
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | #Logs 16 | log = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).log 17 | output = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).out 18 | error = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).err 19 | 20 | 21 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 22 | # --pca_status: 0 or 1 23 | 24 | 25 | data_name = ixi_camcan_enki 26 | subject_filepaths_csv = ixi_camcan_enki.subject_list_cat12.8.csv 27 | 28 | arguments = test_package_env python3 cross_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 29 | 30 | 31 | ########## S4_R8 32 | feature_name = $(data_name).S4_R8 33 | result_prefix = $(data_name).S4_R8 34 | pca = 0 35 | 36 | #model = ridge 37 | #queue 38 | 39 | #model = rf 40 | #queue 41 | 42 | model = rvr_poly 43 | queue 44 | 45 | 46 | ########## 173 47 | feature_name = $(data_name).173 48 | result_prefix = $(data_name).173 49 | pca = 0 50 | 51 | #model = gauss 52 | #queue 53 | 54 | #model = rf 55 | #queue 56 | 57 | model = lasso 58 | queue 59 | 60 | model = ridge 61 | queue 62 | 63 | #model = rvr_lin 64 | #queue 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /codes/cross_site_4sites.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 10 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | #Logs 16 | log = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).log 17 | output = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).out 18 | error = $(initial_dir)/../logs/$(result_prefix).$(model).$(Cluster).$(Process).err 19 | 20 | 21 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 22 | # --pca_status: 0 or 1 23 | 24 | 25 | data_name = ixi_camcan_enki_1000brains 26 | subject_filepaths_csv = ixi_camcan_enki_1000brains.subject_list_cat12.8.csv 27 | 28 | arguments = test_package_env python3 cross_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 29 | 30 | 31 | ########## S4_R4_pca 32 | feature_name = $(data_name).S4_R4 33 | result_prefix = $(data_name).S4_R4_pca 34 | pca = 1 35 | 36 | #model = ridge 37 | #queue 38 | 39 | #model = rf 40 | #queue 41 | 42 | #model = rvr_lin 43 | #queue 44 | 45 | #model = kernel_ridge 46 | #queue 47 | 48 | #model = gauss 49 | #queue 50 | 51 | #model = lasso 52 | #queue 53 | 54 | #model = elasticnet 55 | #queue 56 | 57 | #model = rvr_poly 58 | #queue 59 | 60 | #model = xgb 61 | #queue 62 | 63 | 64 | ########## 173 65 | feature_name = $(data_name).173 66 | result_prefix = $(data_name).173 67 | pca = 0 68 | 69 | model = rvr_lin 70 | queue 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /codes/calculate_features_parcelwise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import argparse 4 | from pathlib import Path 5 | from brainage import calculate_parcelwise_features 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 11 | # parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 12 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 13 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 14 | parser.add_argument("--mask_file", type=str, help="path to mask nii file") 15 | parser.add_argument("--num_parcels", type=str, help="Number of parcels") 16 | 17 | # python3 calculate_features_parcelwise.py --features_path ../data/ixi/ --subject_filepaths ../data/ixi/ixi_paths_cat12.8.csv --output_prefix ixi --mask_file ../masks/BSF_173.nii --num_parcels 173 18 | 19 | # example inputs 20 | # features_path = Path('../data/ixi/') 21 | # subject_filepaths = '../data/ixi_paths_cat12.8.csv' 22 | # output_prefix = 'ixi' 23 | # mask_file = '../masks/BSF_173.nii' 24 | # num_parcels = 173 25 | 26 | args = parser.parse_args() 27 | features_path = Path(args.features_path) 28 | subject_filepaths = args.subject_filepaths 29 | output_prefix = args.output_prefix 30 | mask_file = args.mask_file 31 | num_parcels = args.num_parcels 32 | 33 | print('Subjects filepaths: ', subject_filepaths) 34 | print('Directory to features path: ', features_path) 35 | print('Results filename prefix: ', output_prefix) 36 | print('GM 
mask used: ', mask_file) 37 | print('Number of parcels:', num_parcels, '/n') 38 | 39 | data_parcels = calculate_parcelwise_features(subject_filepaths, mask_file, num_parcels) 40 | 41 | features_path.mkdir(exist_ok=True, parents=True) 42 | 43 | full_filename = str(output_prefix) + '.' + str(num_parcels) 44 | filename = os.path.join(features_path, full_filename) 45 | print('filename for features created: ', filename) 46 | pickle.dump(data_parcels, open(filename, "wb"), protocol=4) 47 | data_parcels.to_csv(filename + '.csv', index=False) -------------------------------------------------------------------------------- /brainage/read_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pandas as pd 3 | 4 | def read_data_cross_site(data_file, train_status, confounds): 5 | 6 | data_df = pickle.load(open(data_file, 'rb')) 7 | X = [col for col in data_df if col.startswith('f_')] 8 | y = 'age' 9 | data_df['age'] = data_df['age'].round().astype(int) # round off age and convert to integer 10 | data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True) 11 | duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject) 12 | data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True) # remove duplicated subjects 13 | 14 | if confounds is not None: # convert sites in numbers to perform confound removal 15 | if train_status == 'train': 16 | site_name = data_df['site'].unique() 17 | if type(site_name[0]) == str: 18 | site_dict = {k: idx for idx, k in enumerate(site_name)} 19 | data_df['site'] = data_df['site'].replace(site_dict) 20 | 21 | elif train_status == 'test': # add site to features & convert site in a number to predict with model trained with confound removal 22 | X.append(confounds) 23 | site_name = data_df['site'].unique()[0,] 24 | if type(site_name) == str: 25 | data_df['site'] = 10 26 | return data_df, X, y 27 | 28 | 29 | 30 | def read_data(features_file, demographics_file): 31 | data_df = pickle.load(open(features_file, 'rb')) # read the data 32 | demo = pd.read_csv(demographics_file) # read demographics file 33 | data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them 34 | 35 | print('Data columns:', data_df.columns) 36 | print('Data Index:', data_df.index) 37 | 38 | X = [col for col in data_df if col.startswith('f_')] 39 | y = 'age' 40 | data_df['age'] = data_df['age'].round().astype(int) # round off age and convert to integer 41 | data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True) 42 | data_df.sort_values(by='age', inplace=True, ignore_index=True) # sort by age 43 | duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject) 44 | data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True) # remove duplicated subjects 45 | return data_df, X, y 46 | 47 | -------------------------------------------------------------------------------- /codes/calculate_features_voxelwise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | from pathlib import Path 5 | from brainage import calculate_voxelwise_features 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 10 | # 
parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 11 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 12 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 13 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 14 | default='../masks/brainmask_12.8.nii') 15 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 16 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 17 | 18 | # python3 calculate_features_voxelwise.py --features_path ../data/ixi/ --subject_filepaths ../data/ixi/ixi_paths_cat12.8.csv --output_prefix ixi --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 19 | 20 | # example inputs 21 | # features_path = Path('../data/ixi/') 22 | # subject_filepaths = '../data/ixi_paths_cat12.8.csv' 23 | # output_prefix = 'ixi' 24 | # mask_file = '../masks/brainmask_12.8.nii' 25 | # smooth_fwhm = 4 26 | # resample_size = 8 27 | 28 | args = parser.parse_args() 29 | features_path = Path(args.features_path) 30 | subject_filepaths = args.subject_filepaths 31 | output_prefix = args.output_prefix 32 | mask_file = args.mask_file 33 | smooth_fwhm = args.smooth_fwhm 34 | resample_size = args.resample_size 35 | 36 | print('Subjects filepaths: ', subject_filepaths) 37 | print('Directory to features path: ', features_path) 38 | print('Results filename prefix: ', output_prefix) 39 | print('GM mask used: ', mask_file) 40 | print('smooth_fwhm:', smooth_fwhm) 41 | print('resample_size:', resample_size, '/n') 42 | 43 | data_resampled = calculate_voxelwise_features(subject_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 44 | 45 | features_path.mkdir(exist_ok=True, parents=True) 46 | 47 | full_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 48 | filename = os.path.join(features_path, full_filename) 49 | print('filename for features created: ', filename) 50 | pickle.dump(data_resampled, open(filename, "wb"), protocol=4) 51 | data_resampled.to_csv(filename + '.csv', index=False) 52 | -------------------------------------------------------------------------------- /codes/cross_site_combine_features.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pandas as pd 3 | import os.path 4 | 5 | if __name__ == '__main__': 6 | 7 | results_folder = '../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.' 8 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 9 | 10 | results_folder = '../data/ixi_camcan_enki/ixi_camcan_enki.' 11 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/enki/enki.'] 12 | 13 | results_folder = '../data/ixi_camcan_1000brains/ixi_camcan_1000brains.' 14 | data_list = ['../data/ixi/ixi.', '../data/camcan/camcan.', '../data/1000brains/1000brains.'] 15 | 16 | results_folder = '../data/camcan_enki_1000brains/camcan_enki_1000brains_' 17 | data_list = ['../data/camcan/camcan.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 18 | 19 | results_folder = '../data/ixi_enki_1000brains/ixi_enki_1000brains.' 
20 | data_list = ['../data/ixi/ixi.', '../data/enki/enki.', '../data/1000brains/1000brains.'] 21 | 22 | feature_list = ['173', '473', '873', '1273', 'S0_R4', 'S4_R4', 'S8_R4', 'S0_R8', 'S4_R8', 'S8_R8'] 23 | 24 | combined_data_df = pd.DataFrame() 25 | combined_demo_df = pd.DataFrame() 26 | 27 | for feature_item in feature_list: 28 | print(feature_item) 29 | 30 | combined_data_df = pd.DataFrame() 31 | combined_demo_df = pd.DataFrame() 32 | 33 | for data_item in data_list: 34 | datafile_name = data_item + feature_item + '.csv' 35 | demofile_name = data_item + 'subject_list_cat12.8.csv' 36 | print(datafile_name, demofile_name) 37 | 38 | if os.path.exists(datafile_name): 39 | 40 | data_df, demo_df = pd.read_csv(datafile_name), pd.read_csv(demofile_name) 41 | print(data_df.shape, demo_df.shape) 42 | 43 | if 'session' not in demo_df.columns: 44 | demo_df['session'] = 'ses-1' 45 | 46 | combined_data_df = pd.concat([combined_data_df, data_df]) 47 | combined_demo_df = pd.concat([combined_demo_df, demo_df]) 48 | else: 49 | break 50 | 51 | combined_data_df = combined_data_df.reset_index(drop=True) 52 | combined_demo_df = combined_demo_df.reset_index(drop=True) 53 | 54 | print(combined_data_df.shape, combined_demo_df.shape) 55 | 56 | # demographic_file = results_folder + 'subject_list_cat12.8.csv' 57 | # features_file = results_folder + feature_item 58 | # print(demographic_file, features_file) 59 | # 60 | # combined_demo_df.to_csv(demographic_file, index=False) 61 | # combined_data_df.to_csv(features_file + '.csv', index=False) 62 | # pickle.dump(combined_data_df, open(features_file, 'wb'), protocol=4) 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /brainage/define_models.py: -------------------------------------------------------------------------------- 1 | import xgboost as xgb 2 | from skrvm import RVR 3 | from glmnet import ElasticNet 4 | import sklearn.gaussian_process as gp 5 | from sklearn.kernel_ridge import KernelRidge 6 | from sklearn.decomposition import PCA 7 | from brainage import XGBoostAdapted 8 | from sklearn.feature_selection import VarianceThreshold 9 | 10 | def define_models(): 11 | # Define all models and model parameters 12 | rvr_linear = RVR() 13 | rvr_poly = RVR() 14 | kernel_ridge = KernelRidge() 15 | lasso = ElasticNet(alpha=1, standardize=False) 16 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 17 | ridge = ElasticNet(alpha=0, standardize=False) 18 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 19 | pca = PCA(n_components=None) # max as many components as sample size 20 | 21 | 22 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 23 | model_para_list = [ 24 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 25 | 26 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 27 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 28 | 'rf__random_state': rand_seed}, 29 | 30 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 31 | 'rvr__random_state': rand_seed}, 32 | 33 | {'variancethreshold__threshold': var_threshold, 34 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 35 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5}, 36 | 37 | {'variancethreshold__threshold': var_threshold, 38 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 
'gauss__n_restarts_optimizer': 100, 39 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 40 | 41 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 42 | 43 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 44 | 45 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 46 | 'rvr__random_state': rand_seed}, 47 | 48 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 49 | 'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100, 50 | 'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2], 51 | 'xgboostadapted__random_seed': rand_seed, 'cv': 5}] # 'search_params':{'n_jobs': 5}] 52 | 53 | return model_list, model_para_list 54 | -------------------------------------------------------------------------------- /codes/calculate_features.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | # Execution 10 | initial_dir = . 11 | executable = $(initial_dir)/run_in_venv.sh 12 | 13 | # Job 14 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 15 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 16 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 17 | 18 | log = $(initial_dir)/../logs/$(data_name).$(Process).log 19 | output = $(initial_dir)/../logs/$(data_name).$(Process).out 20 | error = $(initial_dir)/../logs/$(data_name).$(Process).err 21 | 22 | # 1000brains (change data_name and subject_filepaths_csv to run for different dataset) 23 | data_name = 1000brains 24 | subject_filepaths_csv = 1000brains.paths_cat12.8.csv 25 | 26 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 27 | queue 28 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 29 | queue 30 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 31 | queue 32 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 33 | queue 34 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 35 | queue 36 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 
--resample_size 8 37 | queue 38 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 39 | queue 40 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 41 | queue 42 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 43 | queue 44 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 45 | queue 46 | 47 | -------------------------------------------------------------------------------- /codes/within_site_bias_correction.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os.path 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.model_selection import StratifiedKFold 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--input_predictions_file", type=str, help="Path to predictions csv") 12 | parser.add_argument("--BC_predictions_file", type=str, help="Path to bias corrected predictions") 13 | 14 | # python3 within_site_bias_correction.py \ 15 | # --input_predictions_file ../results/ixi/ixi.all_models_pred.csv \ 16 | # --BC_predictions_file ../results/ixi/ixi.all_models_pred_BC.csv 17 | 18 | # read arguments 19 | args = parser.parse_args() 20 | input_predictions_file = args.input_predictions_file 21 | BC_predictions_file = args.BC_predictions_file 22 | 23 | # Initialize 24 | input_df, output_df = pd.DataFrame(), pd.DataFrame() 25 | column_list, column_name_original = [], [] 26 | 27 | if os.path.exists(input_predictions_file): # if predictions exists 28 | input_df = pd.read_csv(input_predictions_file) # read predictions from all workflows 29 | print(input_df.columns) 30 | print(input_df.index) 31 | 32 | if 'session' in input_df.columns: 33 | column_list = input_df.columns[5:] # remove ['site', 'subject', 'age', 'gender''] 34 | output_df = input_df[['site', 'subject', 'age', 'gender', 'session']] 35 | else: 36 | column_list = input_df.columns[4:] # remove ['site', 'subject', 'age', 'gender''] 37 | output_df = input_df[['site', 'subject', 'age', 'gender']] 38 | 39 | # Fixed parameters from model training random seed and CV 40 | rand_seed = 200 41 | num_splits = 5 # how many train and test splits 42 | num_bins = math.floor(len(input_df)/num_splits) # num of bins to be created = num of labels created 43 | qc = pd.cut(input_df.index.tolist(), num_bins) # create bins for age 44 | cv_5fold = StratifiedKFold(n_splits=num_splits, shuffle=False, random_state=None) 45 | 46 | for column in column_list: # for each workflow, X = true age, y= predicted age 47 | results_pred = pd.DataFrame() 48 | X = ['age'] 49 | y = column 50 | print(f'worflow name: {column}') 51 | 52 | for train_idx, test_idx in 
cv_5fold.split(input_df, qc.codes): 53 | # print('test_idx', test_idx) 54 | train_df, test_df = input_df.loc[train_idx,:], input_df.loc[test_idx,:] # get test and train dataframes 55 | print('train size:', train_df.shape, 'test size:', test_df.shape) 56 | # print(test_df) 57 | 58 | train_x = train_df.loc[:, X] # true age 59 | train_y = train_df.loc[:, y] # predicted age 60 | 61 | model = LinearRegression().fit(train_x, train_y) # x = age, y = predicted age 62 | print(model.intercept_, model.coef_) 63 | corrected_pred = (test_df[y] - model.intercept_) / model.coef_ # corrected predictions 64 | 65 | if results_pred.empty: 66 | results_pred = corrected_pred 67 | else: 68 | results_pred = pd.concat([results_pred, corrected_pred], axis=0) 69 | 70 | results_pred.sort_index(axis=0, level=None, ascending=True, inplace=True) 71 | output_df = pd.concat([output_df, results_pred], axis=1) 72 | 73 | output_df.rename(columns=dict(zip(column_list, column_name_original)), inplace=True) 74 | 75 | print('ALL DONE') 76 | print(f'Corrected predictions: \n {output_df}') 77 | output_df.to_csv(BC_predictions_file, index=False) 78 | else: 79 | print(f'{input_predictions_file} not found') 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /brainage/create_splits.py: -------------------------------------------------------------------------------- 1 | #!/home/smore/.venvs/py3smore/bin/python3 2 | import math 3 | import pandas as pd 4 | from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold 5 | 6 | 7 | # def create_splits(data_df, repeats): 8 | # num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created 9 | # print('num_bins', num_bins, len(data_df)/repeats) 10 | # 11 | # qc = pd.cut(data_df.index, num_bins) 12 | # df = pd.DataFrame({'bin': qc.codes}) 13 | # 14 | # max_num = max(df['bin'].value_counts()) 15 | # print(df['bin'].value_counts()) 16 | # print(max_num, 'max_num') 17 | # 18 | # test_idx = {} 19 | # for rpt_num in range(0, repeats): 20 | # key = 'repeat_' + str(rpt_num) 21 | # test_idx[key] = [] 22 | # 23 | # if repeats == max_num: 24 | # for num in range(0, max_num): 25 | # for bin_idx in df['bin'].unique(): 26 | # test = df[df['bin'] == bin_idx] 27 | # if num < len(test): 28 | # key = 'repeat_' + str(num) 29 | # test_idx[key].append(test.index[num]) 30 | # return test_idx 31 | 32 | 33 | def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): 34 | """ 35 | :param bins_on: variable used to create bins 36 | :param num_bins: num of bins/classes to create 37 | :param data: data to create cv splits on 38 | :param num_splits: number of cv splits to create 39 | :param shuffle: shuffle the data or not 40 | :param random_state: random seed to use if shuffle=True 41 | :return: a dictionary with index 42 | """ 43 | qc = pd.cut(bins_on.tolist(), num_bins) # divides data in bins 44 | cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state) 45 | test_idx = {} 46 | rpt_num = 0 47 | for train_index, test_index in cv.split(data, qc.codes): 48 | key = 'repeat_' + str(rpt_num) 49 | test_idx[key] = test_index 50 | rpt_num = rpt_num + 1 51 | return test_idx 52 | 53 | 54 | def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state): 55 | """ 56 | :param bins_on: variable used to create bins 57 | :param data: data to create cv splits on 58 | :param num_splits: number of cv splits to create 59 | :param shuffle: 
shuffle the data or not 60 | :param random_state: random seed to use if shuffle=True 61 | :return: a dictionary with index 62 | """ 63 | cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state) 64 | test_idx = {} 65 | rpt_num = 0 66 | for train_index, test_index in cv.split(data, bins_on): 67 | key = 'repeat_' + str(rpt_num) 68 | test_idx[key] = test_index 69 | rpt_num = rpt_num + 1 70 | return test_idx 71 | 72 | 73 | # def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation() 74 | # """ 75 | # :param bins_on: variable used to create bins 76 | # :param num_bins: num of bins/classes to create 77 | # :param data: data to create cv splits on 78 | # :param num_splits: number of cv splits to create 79 | # :param shuffle: shuffle the data or not 80 | # :param random_state: random seed to use if shuffle=True 81 | # :return: cv iterator 82 | # """ 83 | # qc = pd.cut(bins_on.tolist(), num_bins) 84 | # cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes) 85 | # return cv 86 | 87 | 88 | def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state): 89 | qc = pd.cut(bins_on.tolist(), num_bins) 90 | cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state) 91 | test_idx = {} 92 | rpt_num = 0 93 | for train_index, test_index in cv.split(data, qc.codes): 94 | key = 'repeat_' + str(rpt_num) 95 | test_idx[key] = test_index 96 | rpt_num = rpt_num + 1 97 | return test_idx 98 | 99 | -------------------------------------------------------------------------------- /codes/cross_site_bias_correction_using_CN.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.linear_model import LinearRegression 5 | 6 | 7 | def bias_correction(train_data, test_data, x, y): 8 | 9 | # bias correction using cole's method: (Using HC from the test sample) 10 | # a, b = np.polyfit(train_data[x], train_data[y], 1) 11 | # print(a, b) 12 | # corrected_predictions = (test_data[y] - b) / a 13 | # print(corrected_predictions) 14 | 15 | # bias correction using cole's method: (Using HC from the test sample) 16 | train_x = train_data[x].to_numpy().reshape(-1, 1) # x = age 17 | train_y = train_data[y].to_numpy().reshape(-1, 1) # y = predictions 18 | 19 | lin_reg = LinearRegression().fit(train_x, train_y) 20 | print(lin_reg.intercept_, lin_reg.coef_) 21 | 22 | corrected_predictions = (test_data[y] - lin_reg.intercept_[0]) / lin_reg.coef_[0][0] 23 | 24 | return corrected_predictions 25 | 26 | 27 | if __name__ == '__main__': 28 | # Read arguments from submit file 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") # age and group is mandatory 31 | parser.add_argument("--predictions_file", type=str, help="Predictions file path") 32 | parser.add_argument("--predictions_column_name", type=str, help="Predictions", default='S4_R4_pca+gauss') 33 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name", default='.BC') # eg: 'ADNI' 34 | 35 | # read arguments 36 | args = parser.parse_args() 37 | demographics_file = args.demographics_file 38 | predictions_file = args.predictions_file 39 | predictions_column_name = args.predictions_column_name 40 | output_prefix = args.output_prefix 
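# Illustrative numbers for the correction implemented in bias_correction() above:
# if the regression of predicted age on true age in the CN subjects gave, say,
# intercept = 20 and slope = 0.6, a raw prediction of 68 years would be corrected
# to (68 - 20) / 0.6 = 80 years; the same (prediction - intercept) / slope
# transform is then applied to every row of the test sample.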
41 | 42 | # example 43 | # python3 cross_site_bias_correction_using_CN.py \ 44 | # --demographics_file ../data/ADNI/ADNI.subject_list_cat12.8.csv \ 45 | # --predictions_file ../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv \ 46 | # --predictions_column_name S4_R4_pca+gauss \ 47 | # --output_prefix _BC 48 | 49 | # creating output filename same as imput predictions file name but adding output_prefix 50 | predictions_file_name_BC = predictions_file.replace('.csv', output_prefix + '.csv') 51 | 52 | demographics = pd.read_csv(demographics_file) 53 | predictions = pd.read_csv(predictions_file) 54 | 55 | # check if predictions contains predictions_column_name column as given by the user 56 | assert predictions_column_name in predictions.columns, f"{predictions_column_name} column not found in {predictions_file}" 57 | 58 | # check if demographics contains 'age' column and 'Research Group' column (which should have 'CN' as a category) 59 | assert "Research Group" in demographics.columns, f"'Research Group' column not found in {demographics_file}" 60 | assert "age" in demographics.columns, f"'age' column not found in {demographics_file}" 61 | assert 'CN' in demographics['Research Group'].unique(), f"'CN' group is not found in 'Research Group' column in {demographics_file}" 62 | 63 | # check if the demographics and predictions are of same length 64 | assert len(demographics) == len(predictions), "Mimatch between length of demographics and predictions" 65 | combined_df = pd.concat([demographics, predictions], axis=1) 66 | 67 | train_data = combined_df[combined_df["Research Group"] == "CN"] # train only on Healthy subjects 68 | test_data = combined_df # apply on whole sample 69 | x = 'age' 70 | y = predictions_column_name 71 | 72 | corrected_predictions = bias_correction(train_data=train_data, test_data=combined_df, x=x, y=y) 73 | corrected_predictions = corrected_predictions.to_frame() 74 | corrected_predictions = corrected_predictions.rename(columns={predictions_column_name: predictions_column_name + output_prefix}) # adding prefix to the column name 75 | 76 | corrected_predictions.to_csv(predictions_file_name_BC, index=False) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,linux,python,visualstudiocode 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,linux,python,visualstudiocode 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### Python ### 50 | # 
Byte-compiled / optimized / DLL files 51 | __pycache__/ 52 | *.py[cod] 53 | *$py.class 54 | 55 | # C extensions 56 | *.so 57 | 58 | # Distribution / packaging 59 | .Python 60 | build/ 61 | develop-eggs/ 62 | dist/ 63 | downloads/ 64 | eggs/ 65 | .eggs/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | pip-wheel-metadata/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | *.py,cover 98 | .hypothesis/ 99 | .pytest_cache/ 100 | pytestdebug.log 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | db.sqlite3 110 | db.sqlite3-journal 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | doc/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # Jupyter Notebook 127 | .ipynb_checkpoints 128 | 129 | # IPython 130 | profile_default/ 131 | ipython_config.py 132 | 133 | # pyenv 134 | .python-version 135 | 136 | # pipenv 137 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 138 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 139 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 140 | # install all needed dependencies. 141 | #Pipfile.lock 142 | 143 | # poetry 144 | #poetry.lock 145 | 146 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 147 | __pypackages__/ 148 | 149 | # Celery stuff 150 | celerybeat-schedule 151 | celerybeat.pid 152 | 153 | # SageMath parsed files 154 | *.sage.py 155 | 156 | # Environments 157 | # .env 158 | .env/ 159 | .venv/ 160 | env/ 161 | venv/ 162 | ENV/ 163 | env.bak/ 164 | venv.bak/ 165 | pythonenv* 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # mkdocs documentation 175 | /site 176 | 177 | # mypy 178 | .mypy_cache/ 179 | .dmypy.json 180 | dmypy.json 181 | 182 | # Pyre type checker 183 | .pyre/ 184 | 185 | # pytype static type analyzer 186 | .pytype/ 187 | 188 | # operating system-related files 189 | # file properties cache/storage on macOS 190 | *.DS_Store 191 | # thumbnail cache on Windows 192 | Thumbs.db 193 | 194 | # profiling data 195 | .prof 196 | 197 | 198 | ### VisualStudioCode ### 199 | .vscode/* 200 | *.code-workspace 201 | 202 | ### VisualStudioCode Patch ### 203 | # Ignore all local history of files 204 | .history 205 | .ionide 206 | 207 | # End of https://www.toptal.com/developers/gitignore/api/macos,linux,python,visualstudiocode -------------------------------------------------------------------------------- /codes/within_site_combine_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import os.path 4 | import numpy as np 5 | import pandas as pd 6 | from brainage import read_data 7 | 8 | def check_predictions(data_df, test_idx, model, test_pred): 9 | 10 | all_idx = np.array(range(0, len(data_df))) 11 | train_idx = np.delete(all_idx, test_idx) 12 | train_df = data_df.loc[train_idx, :] 13 | test_df = data_df.loc[test_idx, :] 14 | 15 | if type(model) == list: 16 | train_pred = model[0].predict(train_df[X]).ravel() 17 | else: 18 | train_pred = model.predict(train_df[X]).ravel() 19 | print(train_pred.shape, train_df[y].shape) 20 | 21 | test_pred_model = model.predict(test_df[X]).ravel() 22 | assert(np.round(test_pred) == np.round(test_pred_model)).all() # check if test pred saved == test predictions using model 23 | 24 | # print('Prediction from CV models', test_pred) 25 | # print('Prediction saved during training',test_pred_model) 26 | 27 | print('Predictions match') 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 33 | parser.add_argument("--features_path", type=str, help="Features file path") 34 | parser.add_argument("--model_path", type=str, help="Path to directory where within site models of particular datasets are saved") 35 | parser.add_argument("--output_prefix", type=str, help="Output prefix for predictions filename", default='all_models_pred') 36 | 37 | # Parse the arguments 38 | args = parser.parse_args() 39 | demographics_file = args.demographics_file 40 | features_path = args.features_path 41 | model_path = args.model_path 42 | output_prefix = args.output_prefix 43 | 44 | # python3 within_site_combine_predictions.py --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv --features_path ../data/ixi/ixi. --model_path ../results/ixi/ixi. --output_prefix all_models_pred 45 | 46 | # demographics_file = '../data/ixi/ixi.subject_list_cat12.8.csv' 47 | # features_path = '../data/ixi/ixi.' 48 | # model_path = '../results/ixi/ixi.' 
49 | # output_prefix = 'all_models_pred' 50 | 51 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] #, 'xgb'] 52 | data_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4', 'S4_R4', 'S4_R4', 'S8_R4', 'S8_R4', 53 | 'S0_R8', 'S0_R8', 'S4_R8', 'S4_R8', 'S8_R8', 'S8_R8'] 54 | filenm_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 55 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 56 | 57 | df_pred_all = pd.DataFrame() 58 | df_pred = pd.DataFrame() 59 | df = pd.DataFrame() 60 | 61 | for idx, filenm_item in enumerate(filenm_list): # for each feature space 62 | for model_item in model_names: 63 | features_file = features_path + data_list[idx] # get features file 64 | result_file = model_path + filenm_item + '.' + model_item + '.results' # get results 65 | model_file = model_path + filenm_item + '.' + model_item + '.models' # get models 66 | 67 | if os.path.isfile(model_file): # if model exists 68 | print('\n') 69 | print('data file: ', features_file) 70 | print('demographic file: ', demographics_file) 71 | print('model used:', model_file, '\n') 72 | print('results file: ', result_file) 73 | 74 | # Read the results file 75 | res = pickle.load(open(result_file,'rb')) # load the saved results 76 | res_model = pickle.load(open(model_file, 'rb')) # load the saved results 77 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 78 | 79 | df = pd.DataFrame() 80 | df_pred = pd.DataFrame() 81 | 82 | for key1, value1 in res.items(): 83 | df = pd.DataFrame() 84 | for key2, value2 in value1.items(): 85 | print(key1, key2) 86 | test_idx = value2['test_idx'] # get the saved test indices for each fold and pick up demo 87 | print(value2['test_idx'].shape) 88 | df['site'] = data_df.iloc[test_idx]['site'] 89 | df['subject'] = data_df.iloc[test_idx]['subject'] 90 | df['age'] = data_df.iloc[test_idx]['age'] # should be same as value2['true'] 91 | df['gender'] = data_df.iloc[test_idx]['gender'] 92 | 93 | if 'session' in data_df.columns: 94 | df['session'] = data_df.iloc[test_idx]['session'] 95 | 96 | model = res_model[key1][key2] # get CV model for each fold 97 | test_pred = value2['predictions'] # get the saved predictions for each fold 98 | 99 | check_predictions(data_df, test_idx, model, test_pred) # get predictions using model, check if equal to saved 100 | 101 | df[filenm_item + ' + ' + key2] = value2['predictions'] # predictions 102 | 103 | df_pred = pd.concat([df_pred, df], axis=0) # concat over all CV 104 | df_pred.sort_index(axis=0, level=None, ascending=True, inplace=True) 105 | 106 | if len(df_pred_all) == 0: # concat over all workflows 107 | df_pred_all = df_pred 108 | else: 109 | df_pred_all = df_pred_all.merge(df_pred, on=list(set(data_df.columns.tolist()) - set(X)), how="left") 110 | 111 | print('\n', 'predictions dataframe:', '\n', df_pred_all) 112 | save_path = model_path + output_prefix + '.csv' 113 | print('output path:', save_path) 114 | df_pred_all.to_csv(save_path, index=False) 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /codes/predict_age.py: -------------------------------------------------------------------------------- 1 | #from read_data_mask_resampled import * 2 | from brainage import calculate_voxelwise_features 3 | from pathlib import Path 4 | import pandas as pd 5 | import argparse 6 | import pickle 7 | import os 8 | import re 9 | 10 | 11 | def 
model_pred(test_df, model_file, feature_space_str): 12 | """This functions predicts age 13 | Args: 14 | test_df (dataframe): test data 15 | model_file (pickle file): trained model file 16 | feature_space_str (string): feature space name 17 | 18 | Returns: 19 | dataframe: predictions from the model 20 | """ 21 | 22 | model = pickle.load(open(model_file, 'rb')) # load model 23 | pred = pd.DataFrame() 24 | for key, model_value in model.items(): 25 | X = data_df.columns.tolist() 26 | pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X]) # preprocessed data 27 | y_pred = model_value.predict(test_df).ravel() 28 | print(y_pred.shape) 29 | pred[feature_space_str + '+' + key] = y_pred 30 | return pred 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 36 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI_paths_cat12.8.csv' 37 | parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 38 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 39 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 40 | default='../masks/brainmask_12.8.nii') 41 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 42 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 43 | parser.add_argument("--model_file", type=str, help="Trained model to be used to predict", 44 | default='../trained_models/4sites.S4_R4_pca.gauss.models') 45 | # For testing 46 | # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI_paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 47 | 48 | args = parser.parse_args() 49 | features_path = Path(args.features_path) 50 | subject_filepaths = args.subject_filepaths 51 | output_path = Path(args.output_path) 52 | output_prefix = args.output_prefix 53 | smooth_fwhm = args.smooth_fwhm 54 | resample_size = args.resample_size 55 | mask_file = args.mask_file 56 | model_file = args.model_file 57 | 58 | print('\nBrain-age trained model used: ', model_file) 59 | print('Subjects filepaths (test data): ', subject_filepaths) 60 | print('Directory to features path: ', features_path) 61 | print('Results directory: ', output_path) 62 | print('Results filename prefix: ', output_prefix) 63 | print('GM mask used: ', mask_file) 64 | 65 | # get feature space name from the model file entered and 66 | # create feature space name using the input values (smoothing, resampling) 67 | # match them: they should be same 68 | 69 | # get feature space name from the model file entered in argument 70 | pipeline_name1 = model_file.split('/')[-1] 71 | feature_space = pipeline_name1.split('.')[1] 72 | model_name = pipeline_name1.split('.')[2] 73 | pipeline_name = feature_space + '.' 
+ model_name 74 | 75 | # create feature space name using the input values (smoothing, resampling) 76 | pca_string = re.findall(r"pca", feature_space) 77 | if len(pca_string) == 1: 78 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca' 79 | else: 80 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) 81 | 82 | # match them: they should be same 83 | assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})" 84 | 85 | print('Feature space: ', feature_space) 86 | print('Model name: ', model_name) 87 | 88 | # Create directories, create features if they don't exists 89 | output_path.mkdir(exist_ok=True, parents=True) 90 | features_path.mkdir(exist_ok=True, parents=True) 91 | features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 92 | features_fullfile = os.path.join(features_path, features_filename) 93 | print('\nfilename for features created: ', features_fullfile) 94 | 95 | if os.path.isfile(features_fullfile): # check if features file exists 96 | print('\n----File exists') 97 | data_df = pickle.load(open(features_fullfile, 'rb')) 98 | print('Features loaded') 99 | else: 100 | print('\n-----Extracting features') 101 | # create features 102 | data_df = calculate_voxelwise_features(subject_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 103 | # save features 104 | pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4) 105 | data_df.to_csv(features_fullfile + '.csv', index=False) 106 | print('Feature extraction done and saved') 107 | 108 | # get predictions and save 109 | try: 110 | predictions_df = model_pred(data_df, model_file, feature_space_str) 111 | # save predictions 112 | predictions_filename = str(output_prefix) + '.' 
+ pipeline_name + '.prediction.csv' 113 | predictions_fullfile = os.path.join(output_path, predictions_filename) 114 | print('\nfilename for predictions created: ', predictions_fullfile) 115 | predictions_df.to_csv(predictions_fullfile, index=False) 116 | print(predictions_df) 117 | 118 | except FileNotFoundError: 119 | print(f'{model_file} is not present') 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /codes/cross_site_bias_correction.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.linear_model import LinearRegression 6 | from brainage import read_data, performance_metric 7 | from sklearn.model_selection import RepeatedStratifiedKFold 8 | 9 | 10 | if __name__ == '__main__': 11 | # Read arguments from submit file 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 14 | parser.add_argument("--features_file", type=str, help="Features file path") 15 | parser.add_argument("--model_file", type=str, help="Path to saved model ", default='../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss') # path to scores-CV models file 16 | 17 | # read arguments 18 | args = parser.parse_args() 19 | demographics_file = args.demographics_file 20 | features_file = args.features_file 21 | model_file = args.model_file 22 | model_name = model_file.split('.')[-1] 23 | 24 | # python3 cross_site_bias_correction.py \ 25 | # --demographics_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.subject_list_cat12.8.csv \ 26 | # --features_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.S4_R4 \ 27 | # --model_file ../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss 28 | 29 | scores_path = model_file + '.scores' # contains CV models 30 | cv_prediction_savepath = model_file + '.predictions.csv' # save CV predictions 31 | bias_params_savepath = model_file + '.bias_params' # save BC parameters 32 | 33 | print('\nfeatures used:', features_file) 34 | print('\model_file:', model_file) 35 | print('\nscores_path:', scores_path) 36 | print('\ncv_prediction_savepath:', cv_prediction_savepath) 37 | print('\nbias_params_savepath:', bias_params_savepath) 38 | print('\nmodel used:', model_name) 39 | 40 | # Load the data which was used for training 41 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 42 | 43 | # Fixed variables, set random seed, create classes for age 44 | rand_seed, n_splits, n_repeats = 200, 5, 5 # fixed during training models 45 | qc = pd.cut(data_df['age'].tolist(), bins=5, precision=1) # create bins for train data only 46 | print('age_bins', qc.categories, 'age_codes', qc.codes) 47 | data_df['bins'] = qc.codes # add bin/classes as a column in train df 48 | 49 | # Load scores which contains CV models 50 | scores = pickle.load(open(scores_path, 'rb')) 51 | 52 | # get the exact train and test splits of CV as used during training 53 | test_idx_all = list() 54 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=rand_seed).split(data_df, data_df.bins) 55 | for train_idx, test_idx in cv: 56 | test_idx_all.append(test_idx) 57 | 58 | # Get CV predictions for each split and repeat 59 | predictions_df = pd.DataFrame() 60 | predictions_df_all = pd.DataFrame() 61 | cv_split = range(0, 25, 5) # [0, 5, 10, 15, 20, 25] get 
predictions and arrange them in diff. columns for diff. repeats 62 | 63 | for each_split in cv_split: # for each split (25 in total) 64 | print('each_split', each_split) 65 | predictions_df = pd.DataFrame() 66 | for ind in range(each_split, each_split + n_splits): # run from (0,5), (5,10), (10,15), (15,20), (20,25) 67 | print('Split number', ind) 68 | temp_df = pd.DataFrame() 69 | model_cv = scores[model_name]['estimator'][ind] # pick CV estimator 70 | test_idx = test_idx_all[ind] # pick test indices 71 | 72 | # get predictions for test data 73 | test_df = data_df.iloc[test_idx, :] # take test data from one split 74 | y_true = test_df[y] 75 | y_pred = model_cv.predict(test_df[X]).ravel() 76 | mae, mse, corr = performance_metric(y_true, y_pred) 77 | print(f' test true age size: {y_true.shape}, predicted age sixe: {y_pred.shape}') 78 | print(f'MAE: {mae}, MSE: {mse}, CoRR: {corr}') 79 | 80 | if predictions_df.empty: 81 | predictions_df['test_index'] = pd.Series(test_idx) 82 | predictions_df['predictions_' + str(each_split)] = pd.Series(y_pred) 83 | else: 84 | temp_df['test_index'] = pd.Series(test_idx) 85 | temp_df['predictions_' + str(each_split)] = pd.Series(y_pred) 86 | 87 | predictions_df = pd.concat([predictions_df, temp_df], axis=0) # append for all the splits of one repeat 88 | 89 | predictions_df.sort_values(by=['test_index'], inplace=True) 90 | 91 | if predictions_df_all.empty: 92 | predictions_df_all = predictions_df 93 | else: 94 | predictions_df_all = predictions_df_all.merge(predictions_df, on=['test_index'], how="left") # merge for all the repeats 95 | 96 | print('predictions_df_all', predictions_df_all) 97 | predictions_df_all = predictions_df_all.reset_index(drop=True) 98 | predictions_df_all = pd.concat([data_df[['site', 'subject', 'age', 'gender']], predictions_df_all], axis=1) # add subject info 99 | predictions_df_all.to_csv(cv_prediction_savepath) 100 | 101 | # Calculate bias correction parameters (m and c) from cv predictions for each column 102 | results_pred = pd.DataFrame() 103 | filter_col = [col for col in predictions_df_all if col.startswith('predictions')] 104 | print('filter_col', filter_col) 105 | 106 | model_intercept, model_coef = [], [] 107 | model_bias_params = {'c':0, 'm': 1} 108 | 109 | for column in filter_col: # for 5 repeats 110 | X_lin = 'age' 111 | y_lin = column 112 | train_x = predictions_df_all.loc[:, X_lin].to_numpy().reshape(-1, 1) # true age 113 | train_y = predictions_df_all.loc[:, y_lin].to_numpy().reshape(-1, 1) # predicted age 114 | lin_reg = LinearRegression().fit(train_x, train_y) 115 | 116 | print(f'Intercept: {lin_reg.intercept_}, slope: {lin_reg.coef_}') 117 | model_intercept.append(lin_reg.intercept_) 118 | model_coef.append(lin_reg.coef_) 119 | 120 | # use this m and c for bias correction on test data later 121 | model_bias_params['m'] = np.mean(model_coef) 122 | model_bias_params['c'] = np.mean(model_intercept) 123 | print('average slope', model_bias_params['m']) 124 | print('average intercept', model_bias_params['c']) 125 | pickle.dump(model_bias_params, open(bias_params_savepath, 'wb')) 126 | print('ALL DONE') 127 | -------------------------------------------------------------------------------- /codes/predict_age_sing.py: -------------------------------------------------------------------------------- 1 | #from read_data_mask_resampled import * 2 | from brainage import calculate_voxelwise_features 3 | from pathlib import Path 4 | import pandas as pd 5 | import argparse 6 | import pickle 7 | import os 8 | import re 9 | 10 | 11 | 
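# ---------------------------------------------------------------------------
# Editorial note (illustrative sketch, not part of the original scripts):
# cross_site_bias_correction.py above stores the averaged slope ('m') and
# intercept ('c') of predicted-vs-true age in a '.bias_params' pickle. A
# common way to apply such parameters to new predictions is the
# slope/intercept adjustment shown here; the exact formula and the file path
# in the usage line are assumptions, not confirmed by this repository.
import pickle
import numpy as np

def apply_bias_correction(y_pred, bias_params_file):
    """Bias-correct predictions using the saved slope 'm' and intercept 'c'."""
    params = pickle.load(open(bias_params_file, 'rb'))  # {'c': intercept, 'm': slope}
    return (np.asarray(y_pred) - params['c']) / params['m']

# Example usage (hypothetical path):
# corrected = apply_bias_correction(y_pred, '../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss.bias_params')
# ---------------------------------------------------------------------------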
def model_pred(test_df, model_file, feature_space_str): 12 | """This functions predicts age 13 | Args: 14 | test_df (dataframe): test data 15 | model_file (pickle file): trained model file 16 | feature_space_str (string): feature space name 17 | 18 | Returns: 19 | dataframe: predictions from the model 20 | """ 21 | 22 | model = pickle.load(open(model_file, 'rb')) # load model 23 | pred = pd.DataFrame() 24 | for key, model_value in model.items(): 25 | X = data_df.columns.tolist() 26 | pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X]) # preprocessed data 27 | y_pred = model_value.predict(test_df).ravel() 28 | print(y_pred.shape) 29 | pred[feature_space_str + '+' + key] = y_pred 30 | return pred 31 | 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--features_path", type=str, help="path to features dir") # eg '../data/ADNI' 36 | parser.add_argument("--data_dir", type=str, help="path to data dir") # 37 | parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI.paths_cat12.8.csv' 38 | parser.add_argument("--output_path", type=str, help="path to output_dir") # eg'../results/ADNI' 39 | parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI' 40 | parser.add_argument("--mask_file", type=str, help="path to GM mask nii file", 41 | default='../masks/brainmask_12.8.nii') 42 | parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4) 43 | parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4) 44 | parser.add_argument("--model_file", type=str, help="Trained model to be used to predict", 45 | default='../trained_models/4sites.S4_R4_pca.gauss.models') 46 | # For testing 47 | # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 48 | 49 | args = parser.parse_args() 50 | features_path = args.features_path 51 | data_dir = args.data_dir 52 | subject_filepaths = args.subject_filepaths 53 | output_path = args.output_path 54 | output_prefix = args.output_prefix 55 | smooth_fwhm = args.smooth_fwhm 56 | resample_size = args.resample_size 57 | mask_file = args.mask_file 58 | model_file = args.model_file 59 | 60 | print('\nBrain-age trained model used: ', model_file) 61 | print('Data directory (test data): ', data_dir) 62 | print('Subjects filepaths (test data): ', subject_filepaths) 63 | print('Directory to features path: ', features_path) 64 | print('Results directory: ', output_path) 65 | print('Results filename prefix: ', output_prefix) 66 | print('GM mask used: ', mask_file) 67 | 68 | # create full filename for the nii files of the subjects and save as csv in features_path 69 | subject_filepaths_nii = pd.read_csv(subject_filepaths, header=None) 70 | subject_filepaths_nii = data_dir + '/' +subject_filepaths_nii 71 | print(subject_filepaths_nii) 72 | subject_full_filepaths = os.path.join(features_path, 'subject_full_filepaths.csv') 73 | print(subject_full_filepaths) 74 | subject_filepaths_nii.to_csv(subject_full_filepaths, header=False, index=False) 75 | 76 | 77 | # get feature space name from the model file entered and 78 | # create feature space name using the input values (smoothing, 
resampling) 79 | # match them: they should be same 80 | 81 | # get feature space name from the model file entered in argument 82 | pipeline_name1 = model_file.split('/')[-1] 83 | feature_space = pipeline_name1.split('.')[1] 84 | model_name = pipeline_name1.split('.')[2] 85 | pipeline_name = feature_space + '.' + model_name 86 | 87 | # create feature space name using the input values (smoothing, resampling) 88 | pca_string = re.findall(r"pca", feature_space) 89 | if len(pca_string) == 1: 90 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca' 91 | else: 92 | feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) 93 | 94 | # match them: they should be same 95 | assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})" 96 | 97 | print('Feature space: ', feature_space) 98 | print('Model name: ', model_name) 99 | 100 | # Create directories, create features if they don't exists 101 | Path(output_path).mkdir(exist_ok=True, parents=True) 102 | Path(features_path).mkdir(exist_ok=True, parents=True) 103 | features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size) 104 | features_fullfile = os.path.join(features_path, features_filename) 105 | print('\nfilename for features created: ', features_fullfile) 106 | 107 | if os.path.isfile(features_fullfile): # check if features file exists 108 | print('\n----File exists') 109 | data_df = pickle.load(open(features_fullfile, 'rb')) 110 | print('Features loaded') 111 | else: 112 | print('\n-----Extracting features') 113 | # create features 114 | data_df = calculate_voxelwise_features(subject_full_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size) 115 | # save features 116 | pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4) 117 | data_df.to_csv(features_fullfile + '.csv', index=False) 118 | print('Feature extraction done and saved') 119 | 120 | # get predictions and save 121 | try: 122 | predictions_df = model_pred(data_df, model_file, feature_space_str) 123 | # save predictions 124 | predictions_filename = str(output_prefix) + '.' 
+ pipeline_name + '.prediction.csv' 125 | predictions_fullfile = os.path.join(output_path, predictions_filename) 126 | print('\nfilename for predictions created: ', predictions_fullfile) 127 | predictions_df.to_csv(predictions_fullfile, index=False) 128 | print(predictions_df) 129 | 130 | except FileNotFoundError: 131 | print(f'{model_file} is not present') 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /codes/cross_site_combine_predictions.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import argparse 3 | import os.path 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import mean_absolute_error, mean_squared_error 7 | 8 | def model_pred(test_df, X, y, model_file, workflow_name): 9 | 10 | # load the model 11 | model = pickle.load(open(model_file, 'rb')) 12 | y_true = test_df[y].reset_index(drop=True) 13 | 14 | # Initialize dataframe for saving output 15 | pred = pd.DataFrame() 16 | mae_corr = pd.DataFrame() 17 | 18 | for key, model_value in model.items(): 19 | X_preprocessed, _ = model_value.preprocess(test_df[X], y_true, until='variancethreshold')# until='zscore' 20 | # print('X_preprocessed shape after variancethreshold', X_preprocessed.shape) 21 | 22 | # predict test data 23 | y_pred = model_value.predict(test_df[X]).ravel() 24 | print('age and predicted age sizes', y_true.shape, y_pred.shape) 25 | mae = np.round(mean_absolute_error(y_true, y_pred), 3) 26 | mse = np.round(mean_squared_error(y_true, y_pred), 2) 27 | corr = np.round(np.corrcoef(y_pred, y_true)[1, 0], 2) 28 | 29 | print('MAE:', mae, 'MSE:', mse, 'CoRR:', corr) 30 | print('workflow_name:', workflow_name, key) 31 | 32 | pred[workflow_name] = y_pred # add column for predictions 33 | mae_corr = pd.concat([mae_corr, pd.DataFrame([{'mae': mae, 'mse': mse, 'corr': corr}], index=[workflow_name])], axis=0) 34 | 35 | return pred, y_true, mae_corr 36 | 37 | 38 | def read_data(features_file, demographics_file): 39 | demo_df = pd.read_csv(open(demographics_file, 'rb')) 40 | data_df = pickle.load(open(features_file, 'rb')) 41 | data_df = pd.concat([demo_df, data_df], axis=1) 42 | # data_df = data_df.drop(columns='file_path_cat12.8') 43 | data_df.rename(columns=lambda X: str(X), inplace=True) # convert numbers to strings as column names 44 | X = [col for col in data_df if col.startswith('f_')] 45 | y = 'age' 46 | age = data_df[y].round().astype(int) # round off age and convert to integer 47 | data_df[y] = age 48 | return data_df, X, y 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 54 | parser.add_argument("--features_path", type=str, help="Features file path") 55 | parser.add_argument("--model_path", type=str, help="Path to directory where within site models of particular datasets are saved") 56 | parser.add_argument("--output_prefix", type=str, help="Output prefix for predictions filename", default='pred_1000brains_all') 57 | 58 | # Parse the arguments 59 | args = parser.parse_args() 60 | demographics_file = args.demographics_file 61 | features_path = args.features_path 62 | model_path = args.model_path 63 | output_prefix = args.output_prefix 64 | 65 | # python3 cross_site_combine_predictions.py --demographics_file ../data/1000brains/1000brains.subject_list_cat12.8.csv --features_path ../data/1000brains/1000brains. --model_path ../results/ixi_camcan_enki/ixi_camcan_enki. 
--output_prefix pred_1000brains_all 66 | 67 | # demographics_file = '../data/1000brains/1000brains.subject_list_cat12.8.csv' 68 | # features_path = '../data/1000brains/1000brains.' 69 | # model_path = '../results/ixi_camcan_enki/ixi_camcan_enki.' 70 | # output_prefix = 'pred_1000brains_all' 71 | 72 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] 73 | data_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4', 'S4_R4', 'S4_R4', 'S8_R4', 'S8_R4', 74 | 'S0_R8', 'S0_R8', 'S4_R8', 'S4_R8', 'S8_R8', 'S8_R8'] 75 | filenm_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 76 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 77 | 78 | output_df = pd.DataFrame() 79 | mae_corr_df = pd.DataFrame() 80 | 81 | for idx, data_item in enumerate(filenm_list): # for each feature space 82 | for model_item in model_names: 83 | features_file = features_path + data_list[idx] # get test features 84 | model_file = model_path + data_item + '.' + model_item + '.models' # get models 85 | 86 | if os.path.exists(model_file) and os.path.exists(features_file): # if test data and trained model exists 87 | print('\n') 88 | print('test data', features_file) 89 | print('demographic file: ', demographics_file) 90 | print('model used', model_file) 91 | print("model and data exists") 92 | 93 | test_df, test_X, test_y = read_data(features_file, demographics_file) # load test data, read data and demo both 94 | y_pred1, y_true1, mae_corr1 = model_pred(test_df, test_X, test_y, model_file, 95 | str(data_item + ' + ' + model_item)) # predict test data 96 | 97 | if output_df.empty: 98 | needed_cols = test_df.columns[~test_df.columns.isin(test_X)].tolist() 99 | output_df = test_df[needed_cols].copy() 100 | 101 | output_df = pd.concat([output_df, y_pred1], axis=1) # concat for all workflows 102 | mae_corr_df = pd.concat([mae_corr_df, mae_corr1], axis=0) 103 | 104 | print('\n', 'predictions dataframe:', '\n', output_df) 105 | 106 | mae_corr_df.to_csv(model_path + output_prefix + '_temp.csv') 107 | output_df.to_csv(model_path + output_prefix + '.csv', index=False) 108 | 109 | # keep predictions from 32 selected workdlows (we trained more than 32) 110 | selected_workflows_df = ['site', 'subject', 'age', 'gender', 111 | '173 + rf', '173 + gauss', '173 + lasso', 112 | '473 + lasso', '473 + rvr_poly', 113 | '873 + gauss', '873 + elasticnet', 114 | '1273 + gauss', '1273 + rvr_poly', 115 | 'S0_R4 + lasso', 116 | 'S4_R4 + ridge', 'S4_R4 + rvr_lin', 'S4_R4 + gauss', 117 | 'S4_R4_pca + ridge', 'S4_R4_pca + rf', 'S4_R4_pca + rvr_lin', 'S4_R4_pca + gauss', 118 | 'S8_R4 + kernel_ridge', 119 | 'S8_R4_pca + rvr_lin', 'S8_R4_pca + gauss', 'S8_R4_pca + lasso', 'S8_R4_pca + rvr_poly', 120 | 'S0_R8 + rvr_poly', 'S0_R8_pca + lasso', 'S0_R8_pca + elasticnet', 'S0_R8_pca + rvr_poly', 121 | 'S4_R8 + ridge', 'S4_R8 + rvr_lin', 'S4_R8 + lasso', 122 | 'S8_R8 + ridge', 'S8_R8 + kernel_ridge', 123 | 'S8_R8_pca + elasticnet'] 124 | 125 | if 'session' in output_df.columns: 126 | selected_workflows_df.insert(4, 'session') 127 | 128 | output_df = output_df.reindex(columns=selected_workflows_df) 129 | output_df.to_csv( model_path + output_prefix + '_selected' + '.csv', index=False) 130 | 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 1. 
**Set up** 2 | 3 | ``` 4 | git clone https://github.com/juaml/brainage_estimation.git 5 | cd brainage_estimation 6 | python3 -m venv brainage_env 7 | source brainage_env/bin/activate 8 | pip install -r requirements.txt 9 | # install other packages 10 | pip install https://github.com/JamesRitchie/scikit-rvm/archive/master.zip 11 | #brew install gcc # for Mac users in case you don't have it 12 | pip install glmnet 13 | ``` 14 | 15 | After the setup, the following scripts can be run from the `codes` directory. 16 | 17 | 2. **Get predictions** 18 | 19 | We provide pretrained models that can be used to obtain predictions on new samples. 20 | 21 | ``` 22 | python3 predict_age.py \ 23 | --features_path path_to_features_dir \ 24 | --subject_filepaths path_to_txt_file \ 25 | --output_path path_to_output_dir \ 26 | --output_prefix PREFIX \ 27 | --mask_file ../masks/brainmask_12.8.nii \ 28 | --smooth_fwhm 4 \ 29 | --resample_size 4 \ 30 | --model_file ../trained_models/4sites.S4_R4_pca.gauss.models 31 | ``` 32 | 33 | The arguments are: 34 | - `--features_path` should point to a directory where calculated features are stored as a `pickle` file. 35 | - `--subject_filepaths` should point to a text file containing the path to each subject's CAT12.8 `mwp1` file, one path per line. 36 | - `--output_path` points to a directory where the predictions will be saved. 37 | - `--output_prefix` prefix for the output files. 38 | - `--mask_file` points to the GM mask to be used (defaults to `../masks/brainmask_12.8.nii`) 39 | - `--smooth_fwhm` smoothing kernel size to be used (defaults to `4`) 40 | - `--resample_size` isotropic voxel size to resample to (defaults to `4`) 41 | - `--model_file` should point to an already trained model (defaults to `../trained_models/4sites.S4_R4_pca.gauss.models`) 42 | 43 | This will calculate features with 4mm smoothing and 4mm resampling (`S4_R4`) for all subjects in the file provided via `--subject_filepaths`. 44 | The predictions will be performed using the S4_R4_pca+gauss model. 45 | `PCA` is applied only when the chosen model was trained with PCA features (indicated by `_pca` in the model name). 46 | Note that if the features are already available in the `--features_path` they will not be recalculated. 47 | 48 | 3. **Calculate features: voxel-wise and parcel-wise features** 49 | 50 | It is possible to calculate features from a list of CAT12.8 files. 51 | 52 | Voxel-wise features 53 | ``` 54 | python3 calculate_features_voxelwise.py \ 55 | --features_path ../data/ADNI/ \ 56 | --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv \ 57 | --output_prefix ADNI \ 58 | --mask_file ../masks/brainmask_12.8.nii \ 59 | --smooth_fwhm 4 \ 60 | --resample_size 8 61 | ``` 62 | 63 | Parcel-wise features 64 | ``` 65 | python3 calculate_features_parcelwise.py \ 66 | --features_path ../data/ADNI/ \ 67 | --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv \ 68 | --output_prefix ADNI \ 69 | --mask_file ../masks/BSF_173.nii \ 70 | --num_parcels 173 71 | ``` 72 | 73 | 4. **Within-site: Train models** 74 | 75 | ``` 76 | python3 within_site_train.py \ 77 | --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv \ 78 | --features_file ../data/ixi/ixi.173 \ 79 | --output_path ../results/ixi \ 80 | --output_prefix ixi.173 \ 81 | --models rvr_lin \ 82 | --pca_status 0 83 | ``` 84 | 85 | The arguments are: 86 | - `--demographics_file` should point to a `csv` file with four columns `{'subject', 'site', 'age', 'gender'}` (an illustrative example is shown below). 87 | - `--features_file` should point to a `pickle` file with features.
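For illustration, a minimal demographics `csv` for the arguments above could look like the following; the subject IDs and values are hypothetical, and the exact coding of `gender` may differ in your data:

```
subject,site,age,gender
sub-0001,ixi,34.5,F
sub-0002,ixi,61.2,M
sub-0003,ixi,47.8,F
```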
88 | - `--output_path` points to a directory where the models, scores and results will be saved. 89 | - `--output_prefix` prefix for the output files; it is used to create three files: `.models`, `.scores`, and `.results`. 90 | - `--models` one or more models to train; multiple models can be provided as a comma-separated list. 91 | - `--pca_status` either 0 (no PCA) or 1 (for PCA retaining 100% variance). 92 | 93 | This will run outer 5-fold and inner 5x5-fold cross-validation. 94 | 95 | In case you are using `HTCondor`, you can also use the provided submit file. 96 | 97 | `condor_submit within_site_ixi.submit` 98 | 99 | 100 | 5. **Within-site: Read results from saved models** 101 | 102 | `python3 within_site_read_results.py --data_nm ../results/ixi/ixi.` 103 | 104 | 105 | 6. **Within-site: Get predictions from 128 workflows** 106 | 107 | ``` 108 | python3 within_site_combine_predictions.py \ 109 | --demographics_file ../data/ixi/ixi.subject_list_cat12.8.csv \ 110 | --features_path ../data/ixi/ixi. \ 111 | --model_path ../results/ixi/ixi. \ 112 | --output_prefix all_models_pred 113 | ``` 114 | 115 | 7. **Within-site: Bias correction** 116 | 117 | ``` 118 | python3 within_site_bias_correction.py \ 119 | --input_predictions_file ../results/ixi/ixi.all_models_pred.csv \ 120 | --BC_predictions_file ../results/ixi/ixi.all_models_pred_BC.csv 121 | ``` 122 | 123 | 124 | 8. **Cross-site: Train and test** 125 | 126 | First train a model with three sites. 127 | ``` 128 | python3 cross_site_train.py \ 129 | --demographics_file ../data/ixi_camcan_enki/ixi_camcan_enki_subject_list_cat12.8.csv \ 130 | --features_file ../data/ixi_camcan_enki/ixi_camcan_enki.173 \ 131 | --output_path ../results/ixi_camcan_enki \ 132 | --output_prefix ixi_camcan_enki.173 \ 133 | --models rvr_lin \ 134 | --pca_status 0 135 | ``` 136 | 137 | Now we can make predictions on the hold-out site using all models available in the `--model_path`. 138 | ``` 139 | python3 cross_site_combine_predictions.py \ 140 | --demographics_file ../data/1000brains/1000brains.subject_list_cat12.8.csv \ 141 | --features_path ../data/1000brains/1000brains. \ 142 | --model_path ../results/ixi_camcan_enki/ixi_camcan_enki. \ 143 | --output_prefix pred_1000brains_all 144 | 145 | ``` 146 | 147 | 9. **Cross-site: Read results from saved models** 148 | 149 | Create cross-validation scores from cross-site predictions. 150 | 151 | `python3 cross_site_read_results.py --data_nm ../results/ixi_camcan_enki/ixi_camcan_enki.` 152 | 153 | 154 | 10. **Cross-site: Bias correction** 155 | 156 | Using the CV predictions from the training data: 157 | 158 | ``` 159 | python3 cross_site_bias_correction.py \ 160 | --demographics_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.subject_list_cat12.8.csv \ 161 | --features_file ../data/ixi_camcan_enki_1000brains/ixi_camcan_enki_1000brains.S4_R4 \ 162 | --model_file ../results/ixi_camcan_enki_1000brains/4sites.S4_R4_pca_cv.gauss 163 | ``` 164 | 165 | Using the control subjects from the testing data: 166 | 167 | This script trains a bias-correction model using the predictions and age of the control (`CN`) group and then applies it to the full sample. It needs a `demographics_file` that contains `age` and `Research Group` columns, where the `Research Group` column includes the `CN` category. `predictions_file` should contain a column of predictions named by `predictions_column_name`.
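A minimal sketch of that logic is shown below, using the example file and column names from the command that follows. It assumes the standard slope/intercept correction (`corrected = (predicted - c) / m`) and that the rows of the demographics and predictions files are aligned; the actual implementation in `cross_site_bias_correction_using_CN.py` may differ.

```
import pandas as pd
from sklearn.linear_model import LinearRegression

demo = pd.read_csv('../data/ADNI/ADNI.subject_list_cat12.8.csv')            # must contain 'age' and 'Research Group'
preds = pd.read_csv('../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv')  # must contain the predictions column

cn = (demo['Research Group'] == 'CN').to_numpy()                            # fit only on control subjects
reg = LinearRegression().fit(demo.loc[cn, ['age']], preds.loc[cn, 'S4_R4_pca+gauss'])
m, c = reg.coef_[0], reg.intercept_

# apply the CN-derived correction to the full sample
preds['S4_R4_pca+gauss_BC'] = (preds['S4_R4_pca+gauss'] - c) / m
```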
The bias corrected predictions will be saved in the same location as `predictions_file` with a prefix defined by `output_prefix`. 168 | 169 | ``` 170 | python3 cross_site_bias_correction_using_CN.py \ 171 | --demographics_file ../data/ADNI/ADNI.subject_list_cat12.8.csv \ 172 | --predictions_file ../results/ADNI/ADNI.S4_R4_pca.gauss.prediction.csv \ 173 | --predictions_column_name S4_R4_pca+gauss \ 174 | --output_prefix _BC 175 | ``` 176 | -------------------------------------------------------------------------------- /codes/cross_site_read_results.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os.path 3 | import argparse 4 | import pandas as pd 5 | 6 | # all possible inputs 7 | ## cross site (3 sites) 8 | # data_nm = '..results/camcan_enki_1000brains/camcan_enki_1000brains_' 9 | # data_nm = '..results/ixi_enki_1000brains/ixi_enki_1000brains_' 10 | # data_nm = '..results/ixi_camcan_enki/ixi_camcan_enki_' 11 | # data_nm = '..results/ixi_camcan_1000brains/ixi_camcan_1000brains_' 12 | ## cross-site (4 sites) 13 | # data_nm = '..results/ixi_camcan_enki_1000brains/4sites_' 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--data_nm", type=str, help="Output path for one dataset") 18 | 19 | args = parser.parse_args() 20 | data_nm = args.data_nm 21 | 22 | # Filename to save results 23 | cv_file_ext = 'cv_scores.csv' 24 | cv_file_ext_selected = 'cv_scores_selected.csv' 25 | 26 | # Complete results filepaths 27 | cv_filename = data_nm + cv_file_ext 28 | cv_filename_selected = data_nm + cv_file_ext_selected 29 | 30 | 31 | # all model names 32 | model_names = ['lin_reg', 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly'] #'xgb' 33 | model_names_new = ['LiR', 'RR', 'RFR', 'RVRlin', 'KRR', 'GPR', 'LR', 'ENR', 'RVRpoly'] # 'XGB' 34 | 35 | # all feature spaces names 36 | data_list = ['173', '473', '873','1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 37 | 'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca'] 38 | data_list_new = ['173', '473', '873','1273', 'S0_R4', 'S0_R4 + PCA', 'S4_R4', 'S4_R4 + PCA', 'S8_R4', 'S8_R4 + PCA', 39 | 'S0_R8', 'S0_R8 + PCA', 'S4_R8', 'S4_R8 + PCA', 'S8_R8', 'S8_R8 + PCA'] 40 | 41 | # check which scores file is missing 42 | missing_outs = [] 43 | for data_item in data_list: 44 | for model_item in model_names: 45 | scores_item = data_nm + data_item + '.' + model_item + '.scores' # create the complete path to scores file 46 | if os.path.isfile(scores_item): 47 | print('yes') 48 | else: 49 | missing_outs.append(scores_item) 50 | print('Missing files:\n', missing_outs) 51 | 52 | # get the saved cv scores 53 | df = pd.DataFrame() 54 | df_cv = pd.DataFrame() 55 | for data_item in data_list: 56 | for model_item in model_names: 57 | scores_item = data_nm + data_item + '.' 
+ model_item + '.scores' # create the complete path to scores file 58 | if os.path.isfile(scores_item): 59 | res = pickle.load(open(scores_item, 'rb')) 60 | df = pd.DataFrame() 61 | mae_all, mse_all, corr_all, corr_delta_all, key_all = list(), list(), list(), list(), list() 62 | for key, value in res.items(): 63 | mae = round(value['test_neg_mean_absolute_error'].mean() * -1, 3) 64 | mse = round(value['test_neg_mean_squared_error'].mean() * -1, 3) 65 | corr = round(value['test_r2'].mean(), 3) 66 | mae_all.append(mae) 67 | mse_all.append(mse) 68 | corr_all.append(corr) 69 | key_all.append(key) 70 | 71 | df['model'] = key_all 72 | df['data'] = len(mae_all) * [data_item] 73 | df['cv_mae'] = mae_all 74 | df['cv_mse'] = mse_all 75 | df['cv_corr'] = corr_all 76 | # print(df) 77 | df_cv = pd.concat([df_cv, df], axis=0) 78 | 79 | df_cv = df_cv.reset_index(drop=True) 80 | df_cv['workflow_name'] = df_cv['data'] + ' + ' + df_cv['model'] 81 | df_cv['data'] = df_cv['data'].replace(data_list, data_list_new) 82 | df_cv['model'] = df_cv['model'].replace(model_names, model_names_new) 83 | df_cv['workflow_name_updated'] = df_cv['data'] + ' + ' + df_cv['model'] 84 | df_cv.reset_index(drop=True, inplace=True) 85 | 86 | # selected 32 workflows (since we have more then 32 selected workflows) 87 | selected_workflows_df = pd.DataFrame([ 88 | '173 + GPR', 89 | '473 + LR', 90 | '473 + RVRpoly', 91 | '1273 + GPR', 92 | 'S4_R4 + RR', 93 | 'S4_R4 + GPR', 94 | 'S4_R4 + PCA + RFR', 95 | 'S4_R4 + PCA + RVRlin', 96 | 'S8_R4 + PCA + RVRlin', 97 | 'S8_R4 + PCA + GPR', 98 | 'S0_R8 + PCA + ENR', 99 | 'S0_R8 + PCA + RVRpoly', 100 | 'S4_R8 + RR', 101 | 'S8_R8 + RR', 102 | 'S8_R8 + KRR', 103 | 'S8_R8 + PCA + ENR', 104 | 'S4_R4 + PCA + GPR', 105 | 'S4_R4 + RVRlin', 106 | 'S4_R4 + PCA + RR', 107 | 'S4_R8 + RVRlin', 108 | 'S8_R4 + KRR', 109 | 'S0_R4 + LR', 110 | 'S8_R4 + PCA + RVRpoly', 111 | 'S0_R8 + RVRpoly', 112 | 'S4_R8 + LR', 113 | '873 + GPR', 114 | 'S8_R4 + PCA + LR', 115 | '1273 + RVRpoly', 116 | '873 + ENR', 117 | '173 + LR', 118 | 'S0_R8 + PCA + LR', 119 | '173 + RFR'], columns=['workflow_name_updated']) 120 | 121 | df_final = df_cv.merge(selected_workflows_df, how='inner', on=['workflow_name_updated']) 122 | 123 | # save the csv files 124 | print('\n cv results file:', cv_filename) 125 | print(df_cv) 126 | print('\n selected results file:', cv_filename_selected) 127 | print(df_final) 128 | df_cv.to_csv(cv_filename, index=False) 129 | df_final.to_csv(cv_filename_selected, index=False) 130 | 131 | 132 | # # check model parameters 133 | print('\n Model Parameters') 134 | error_models = list() 135 | for data_item in data_list: 136 | for model_item in model_names: 137 | model_item = data_nm + data_item + '.' 
+ model_item + '.models' # get models 138 | # print('\n','model filename', model_item) 139 | if os.path.isfile(model_item): 140 | print('\n', 'model filename', model_item) 141 | res = pickle.load(open(model_item, 'rb')) 142 | # print(res) 143 | for key, value in res.items(): 144 | print(key) 145 | 146 | if key == 'gauss': 147 | model = res['gauss']['gauss'] 148 | # print(model.get_params()) 149 | print(model.kernel_.get_params()) 150 | 151 | elif key == 'kernel_ridge': 152 | model = res['kernel_ridge']['kernelridge'] 153 | print(model) 154 | # print(model.get_params()) 155 | 156 | elif key == 'rvr_lin': 157 | model = res['rvr_lin']['rvr'] 158 | print(model) 159 | # print(model.get_params()) 160 | 161 | elif key == 'rvr_poly': 162 | model = res['rvr_poly']['rvr'] 163 | print(model) 164 | # print(model.get_params()) 165 | 166 | elif key == 'rf': 167 | model = res['rf']['rf'] 168 | print(model) 169 | # print(model.get_params()) 170 | 171 | else: 172 | model = res[key]['elasticnet'] 173 | # print(model.get_params()) 174 | print(model.lambda_best_) 175 | 176 | else: 177 | error_models.append(model_item) 178 | -------------------------------------------------------------------------------- /brainage/calculate_features.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import nilearn 3 | from nilearn import image 4 | import numpy as np 5 | import pandas as pd 6 | import nibabel as nib 7 | import nibabel.processing as npr 8 | 9 | def subsample_img(img, f): 10 | """Reduce resample_to_img features of a 3D array by a given factor f.""" 11 | 12 | data = img.get_fdata() 13 | mask = np.zeros(img.shape) 14 | mask[::f, ::f, ::f] = 1 15 | data = data * mask 16 | return nib.Nifti1Image(data, img.affine, img.header) 17 | 18 | def binarize_3d(img, threshold): 19 | """binarize 3D spatial image""" 20 | return nib.Nifti1Image( 21 | np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header 22 | ) 23 | 24 | def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size): 25 | """Calculate voxelwise features for the subjects 26 | 27 | Args: 28 | phenotype_file (csv or txt): A csv or text file with path to subject images 29 | mask_file (nii): The GM mask file to be used to extract features 30 | smooth_fwhm (int): Smooth images by applying a Gaussian filter by given FWHM (mm) 31 | resample_size (int): Resample image to given voxel size 32 | 33 | Returns: 34 | data_resampled (dataframe): pandas dataframe of features (N subjects by M features) 35 | """ 36 | 37 | phenotype = pd.read_csv(phenotype_file, header=None) 38 | 39 | # don't need this anymore 40 | # filename, file_extension = os.path.splitext(phenotype_file) 41 | # if file_extension == ".txt": 42 | # phenotype = pd.read_csv(phenotype_file, header=None) 43 | # elif file_extension == ".csv": 44 | # phenotype = pd.read_csv(phenotype_file, sep=",", header=None) 45 | # else: 46 | # raise ValueError("Wrong file. 
Please imput either a csv or text file") 47 | 48 | print(phenotype.shape) 49 | print(phenotype.head()) 50 | 51 | # phenotype = phenotype.iloc[0:15] 52 | 53 | data_resampled = np.array([]) # array to save resampled features from subjects mri 54 | count = 0 55 | for index, row in phenotype.iterrows(): # iterate over each row 56 | sub_file = row.values[0] 57 | 58 | if os.path.exists(sub_file): 59 | print(f"\n-----Processing subject number {count}------") 60 | sub_img = nib.load(sub_file) # load subject image 61 | mask_img = nib.load(mask_file) # load mask image 62 | print("Subject and mask image loaded") 63 | print("sub affine original \n", sub_img.affine, sub_img.shape) 64 | print("mask affine original \n", mask_img.affine, mask_img.shape) 65 | 66 | print("Perform smoothing") 67 | sub_img = image.smooth_img( 68 | sub_img, smooth_fwhm 69 | ) # smooth the image with 4 mm FWHM 70 | 71 | print("Perform resampling") 72 | # trying to match Gaser 73 | mask_img_rs = npr.resample_to_output( 74 | mask_img, [resample_size] * len(mask_img.shape), order=1 75 | ) # resample mask 76 | print( 77 | "mask affine after resampling\n", 78 | mask_img_rs.affine, 79 | mask_img_rs.shape, 80 | ) 81 | 82 | sub_img_rs = image.resample_to_img( 83 | sub_img, mask_img_rs, interpolation="linear" 84 | ) # resample subject 85 | print( 86 | "sub affine after resampling\n", 87 | sub_img_rs.affine, 88 | sub_img_rs.shape, 89 | ) 90 | 91 | binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5) # binarize the mask 92 | mask_rs = binary_mask_img_rs.get_fdata().astype(bool) 93 | 94 | sub_data_rs = sub_img_rs.get_fdata()[ 95 | mask_rs 96 | ] # extract voxel using the binarized mask 97 | sub_data_rs = sub_data_rs.reshape(1, -1) 98 | 99 | if data_resampled.size == 0: 100 | data_resampled = sub_data_rs 101 | else: 102 | data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0) 103 | count = count + 1 104 | print(data_resampled.shape) 105 | 106 | print("\n *** Feature extraction done ***") 107 | 108 | # renaming the columns and convering to dataframe 109 | data_resampled = pd.DataFrame(data_resampled) 110 | data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True) 111 | print('Feature names:', data_resampled.columns) 112 | 113 | print(f"The size of the feature space is {data_resampled.shape}") 114 | 115 | return data_resampled 116 | 117 | 118 | 119 | def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels): 120 | """Calculate parcelwise features for the subjects 121 | 122 | Args: 123 | phenotype_file (csv or text): A csv or text file with path to subject images 124 | mask_dir (_type_): The GM mask file to be used to extract features 125 | num_parcels (_type_): Number of parcels 126 | 127 | Returns: 128 | data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels) 129 | """ 130 | 131 | phenotype = pd.read_csv(phenotype_file, header=None) 132 | 133 | # filename, file_extension = os.path.splitext(phenotype_file) 134 | 135 | # if file_extension == '.txt': 136 | # phenotype = pd.read_csv(phenotype_file, header=None) 137 | # elif file_extension == '.csv': 138 | # phenotype = pd.read_csv(phenotype_file, sep=',', header=None) 139 | # else: 140 | # raise ValueError("Wrong file. 
Please imput either a csv or text file") 141 | 142 | print(phenotype.shape) 143 | print(phenotype.head()) 144 | # phenotype = phenotype.iloc[0:15] 145 | 146 | data_parcels = [] #np.array([]) # array to save resampled features from subjects mri 147 | count = 0 148 | 149 | for index, row in phenotype.iterrows(): # iterate over each row 150 | sub_file = row.values[0] 151 | 152 | if os.path.exists(sub_file): 153 | print(f'\nProcessing subject number {count}') 154 | sub_img = nib.load(sub_file) # load subject image 155 | mask_img = nib.load(mask_dir) # load mask image 156 | print ('Subject and mask image loaded') 157 | print(sub_file, sub_img.affine, mask_img.affine) 158 | 159 | sub_data = sub_img.get_fdata() 160 | sub_data[sub_data == 0] = np.nan # replace zeros with Nan 161 | sub_data_parcels = [] 162 | 163 | if not np.array_equal(sub_img.affine, mask_img.affine): 164 | mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear') 165 | else: 166 | print("Subject and mask have same affine") 167 | 168 | for num in range(1, int(num_parcels) + 1): 169 | itemindex = np.where(mask_img.get_fdata() == num) # get indices from the mask for a parcel 170 | sub_mat = sub_data[itemindex] 171 | 172 | if np.all(np.isnan(sub_mat)): 173 | sub_agg = 0 174 | else: 175 | sub_agg = np.nanmean(sub_mat) # mean the data from the indices to get GM volume 176 | sub_data_parcels.append(sub_agg) 177 | 178 | data_parcels.append(sub_data_parcels) 179 | print(len(data_parcels)) 180 | count = count + 1 181 | 182 | print('\n *** Feature extraction done ***') 183 | data_parcels = pd.DataFrame(data_parcels) 184 | data_parcels.rename(columns=lambda X :'f_' + str(X), inplace=True) 185 | print(data_parcels.columns) 186 | 187 | print('final dataframe shape', data_parcels.shape) 188 | return data_parcels 189 | -------------------------------------------------------------------------------- /codes/within_site_ixi.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | 10 | # Execution 11 | initial_dir = . 
12 | executable = $(initial_dir)/run_in_venv.sh 13 | transfer_executable = False 14 | 15 | # Logs 16 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 17 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 18 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 19 | 20 | log = $(initial_dir)/../logs/$(result_prefix).$(model).log 21 | output = $(initial_dir)/../logs/$(result_prefix).$(model).out 22 | error = $(initial_dir)/../logs/$(result_prefix).$(model).err 23 | 24 | # --models: 'ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb' 25 | # --pca_status: 0 or 1 26 | 27 | # enki (change data_name and subject_filepaths_csv to run for different dataset) 28 | data_name = ixi 29 | subject_filepaths_csv = ixi.subject_list_cat12.8.csv 30 | 31 | arguments = test_package_env python3 within_site_train.py --demographics_file ../data/$(data_name)/$(subject_filepaths_csv) --features_file ../data/$(data_name)/$(feature_name) --output_path ../results/$(data_name) --output_prefix $(result_prefix) --models $(model) --pca_status $(pca) 32 | 33 | ########## 173 parcels 34 | feature_name = $(data_name).173 35 | result_prefix = $(data_name).173 36 | 37 | pca = 0 38 | model = ridge 39 | queue 40 | model = rf 41 | queue 42 | model = rvr_lin 43 | queue 44 | model = kernel_ridge 45 | queue 46 | model = gauss 47 | queue 48 | model = lasso 49 | queue 50 | model = elasticnet 51 | queue 52 | model = rvr_poly 53 | queue 54 | #model = xgb 55 | #queue 56 | 57 | 58 | 59 | ########## 473 parcels 60 | feature_name = $(data_name).473 61 | result_prefix = $(data_name).473 62 | pca = 0 63 | model = ridge 64 | queue 65 | model = rf 66 | queue 67 | model = rvr_lin 68 | queue 69 | model = kernel_ridge 70 | queue 71 | model = gauss 72 | queue 73 | model = lasso 74 | queue 75 | model = elasticnet 76 | queue 77 | model = rvr_poly 78 | queue 79 | #model = xgb 80 | #queue 81 | 82 | 83 | 84 | ########## 873 parcels 85 | feature_name = $(data_name).873 86 | result_prefix = $(data_name).873 87 | pca = 0 88 | model = ridge 89 | queue 90 | model = rf 91 | queue 92 | model = rvr_lin 93 | queue 94 | model = kernel_ridge 95 | queue 96 | model = gauss 97 | queue 98 | model = lasso 99 | queue 100 | model = elasticnet 101 | queue 102 | model = rvr_poly 103 | queue 104 | #model = xgb 105 | #queue 106 | 107 | 108 | 109 | ########## 1273 parcels 110 | feature_name = $(data_name).1273 111 | result_prefix = $(data_name).1273 112 | pca = 0 113 | model = ridge 114 | queue 115 | model = rf 116 | queue 117 | model = rvr_lin 118 | queue 119 | model = kernel_ridge 120 | queue 121 | model = gauss 122 | queue 123 | model = lasso 124 | queue 125 | model = elasticnet 126 | queue 127 | model = rvr_poly 128 | queue 129 | #model = xgb 130 | #queue 131 | 132 | 133 | 134 | ########## S0_R4 135 | feature_name = $(data_name).S0_R4 136 | result_prefix = $(data_name).S0_R4 137 | pca = 0 138 | model = ridge 139 | queue 140 | model = rf 141 | queue 142 | model = rvr_lin 143 | queue 144 | model = kernel_ridge 145 | queue 146 | model = gauss 147 | queue 148 | model = lasso 149 | queue 150 | model = elasticnet 151 | queue 152 | model = rvr_poly 153 | queue 154 | #model = xgb 155 | #queue 156 | 157 | 158 | 159 | ########## S0_R8 160 | feature_name = $(data_name).S0_R8 161 | result_prefix = $(data_name).S0_R8 162 | pca = 0 163 | model = ridge 164 | queue 165 | model = rf 166 | queue 167 | model = rvr_lin 168 | queue 169 | model = kernel_ridge 170 | queue 171 | model = gauss 172 | queue 173 | model = lasso 174 | 
queue 175 | model = elasticnet 176 | queue 177 | model = rvr_poly 178 | queue 179 | #model = xgb 180 | #queue 181 | 182 | 183 | 184 | ########## S4_R4 185 | feature_name = $(data_name).S4_R4 186 | result_prefix = $(data_name).S4_R4 187 | pca = 0 188 | model = ridge 189 | queue 190 | model = rf 191 | queue 192 | model = rvr_lin 193 | queue 194 | model = kernel_ridge 195 | queue 196 | model = gauss 197 | queue 198 | model = lasso 199 | queue 200 | model = elasticnet 201 | queue 202 | model = rvr_poly 203 | queue 204 | #model = xgb 205 | #queue 206 | 207 | 208 | 209 | ########## S4_R8 210 | feature_name = $(data_name).S4_R8 211 | result_prefix = $(data_name).S4_R8 212 | pca = 0 213 | model = ridge 214 | queue 215 | model = rf 216 | queue 217 | model = rvr_lin 218 | queue 219 | model = kernel_ridge 220 | queue 221 | model = gauss 222 | queue 223 | model = lasso 224 | queue 225 | model = elasticnet 226 | queue 227 | model = rvr_poly 228 | queue 229 | #model = xgb 230 | #queue 231 | 232 | 233 | ########## S8_R4 234 | feature_name = $(data_name).S8_R4 235 | result_prefix = $(data_name).S8_R4 236 | pca = 0 237 | model = ridge 238 | queue 239 | model = rf 240 | queue 241 | model = rvr_lin 242 | queue 243 | model = kernel_ridge 244 | queue 245 | model = gauss 246 | queue 247 | model = lasso 248 | queue 249 | model = elasticnet 250 | queue 251 | model = rvr_poly 252 | queue 253 | #model = xgb 254 | #queue 255 | 256 | 257 | 258 | 259 | ########## S8_R8 260 | feature_name = $(data_name).S8_R8 261 | result_prefix = $(data_name).S8_R8 262 | pca = 0 263 | model = ridge 264 | queue 265 | model = rf 266 | queue 267 | model = rvr_lin 268 | queue 269 | model = kernel_ridge 270 | queue 271 | model = gauss 272 | queue 273 | model = lasso 274 | queue 275 | model = elasticnet 276 | queue 277 | model = rvr_poly 278 | queue 279 | #model = xgb 280 | #queue 281 | 282 | ##################### PCA 283 | ########## S0_R4_pca 284 | feature_name = $(data_name).S0_R4 285 | result_prefix = $(data_name).S0_R4_pca 286 | pca = 1 287 | model = ridge 288 | queue 289 | model = rf 290 | queue 291 | model = rvr_lin 292 | queue 293 | model = kernel_ridge 294 | queue 295 | model = gauss 296 | queue 297 | model = lasso 298 | queue 299 | model = elasticnet 300 | queue 301 | model = rvr_poly 302 | queue 303 | #model = xgb 304 | #queue 305 | 306 | 307 | 308 | ########## S0_R8_pca 309 | feature_name = $(data_name).S0_R8 310 | result_prefix = $(data_name).S0_R8_pca 311 | pca = 1 312 | model = ridge 313 | queue 314 | model = rf 315 | queue 316 | model = rvr_lin 317 | queue 318 | model = kernel_ridge 319 | queue 320 | model = gauss 321 | queue 322 | model = lasso 323 | queue 324 | model = elasticnet 325 | queue 326 | model = rvr_poly 327 | queue 328 | #model = xgb 329 | #queue 330 | 331 | 332 | 333 | ########## S4_R4_pca 334 | feature_name = $(data_name).S4_R4 335 | result_prefix = $(data_name).S4_R4_pca 336 | pca = 1 337 | model = ridge 338 | queue 339 | model = rf 340 | queue 341 | model = rvr_lin 342 | queue 343 | model = kernel_ridge 344 | queue 345 | model = gauss 346 | queue 347 | model = lasso 348 | queue 349 | model = elasticnet 350 | queue 351 | model = rvr_poly 352 | queue 353 | #model = xgb 354 | #queue 355 | 356 | 357 | 358 | ########## S4_R8_pca 359 | feature_name = $(data_name).S4_R8 360 | result_prefix = $(data_name).S4_R8_pca 361 | pca = 1 362 | model = ridge 363 | queue 364 | model = rf 365 | queue 366 | model = rvr_lin 367 | queue 368 | model = kernel_ridge 369 | queue 370 | model = gauss 371 | queue 372 | model = lasso 373 
| queue 374 | model = elasticnet 375 | queue 376 | model = rvr_poly 377 | queue 378 | #model = xgb 379 | #queue 380 | 381 | ########## S8_R4_pca 382 | feature_name = $(data_name).S8_R4 383 | result_prefix = $(data_name).S8_R4_pca 384 | pca = 1 385 | model = ridge 386 | queue 387 | model = rf 388 | queue 389 | model = rvr_lin 390 | queue 391 | model = kernel_ridge 392 | queue 393 | model = gauss 394 | queue 395 | model = lasso 396 | queue 397 | model = elasticnet 398 | queue 399 | model = rvr_poly 400 | queue 401 | #model = xgb 402 | #queue 403 | 404 | 405 | 406 | 407 | ########## S8_R8 408 | feature_name = $(data_name).S8_R8 409 | result_prefix = $(data_name).S8_R8_pca 410 | pca = 1 411 | model = ridge 412 | queue 413 | model = rf 414 | queue 415 | model = rvr_lin 416 | queue 417 | model = kernel_ridge 418 | queue 419 | model = gauss 420 | queue 421 | model = lasso 422 | queue 423 | model = elasticnet 424 | queue 425 | model = rvr_poly 426 | queue 427 | #model = xgb 428 | #queue -------------------------------------------------------------------------------- /codes/calculate_features2.submit: -------------------------------------------------------------------------------- 1 | # The environment 2 | universe = vanilla 3 | getenv = True 4 | 5 | # resources 6 | request_cpus = 1 7 | request_memory = 5G 8 | 9 | # Execution 10 | initial_dir = . 11 | executable = $(initial_dir)/run_in_venv.sh 12 | 13 | # Job 14 | #log = $(initial_dir)/../logs/$(Cluster).$(Process).log 15 | #output = $(initial_dir)/../logs/$(Cluster).$(Process).out 16 | #error = $(initial_dir)/../logs/$(Cluster).$(Process).err 17 | 18 | log = $(initial_dir)/../logs/$(data_name).$(Process).log 19 | output = $(initial_dir)/../logs/$(data_name).$(Process).out 20 | error = $(initial_dir)/../logs/$(data_name).$(Process).err 21 | 22 | # 1000brains (change data_name and subject_filepaths_csv to run for different dataset) 23 | data_name = 1000brains 24 | subject_filepaths_csv = 1000brains.paths_cat12.8.csv 25 | 26 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 27 | queue 28 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 29 | queue 30 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 31 | queue 32 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 33 | queue 34 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 35 | queue 36 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path 
../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 37 | queue 38 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 39 | queue 40 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 41 | queue 42 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 43 | queue 44 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 45 | queue 46 | 47 | # camcan (change data_name and subject_filepaths_csv to run for different dataset) 48 | data_name = camcan 49 | subject_filepaths_csv = camcan.paths_cat12.8.csv 50 | 51 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 52 | queue 53 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 54 | queue 55 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 56 | queue 57 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 58 | queue 59 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 60 | queue 61 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 62 | queue 63 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 64 | queue 65 | arguments = test_package_env python3 
calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 66 | queue 67 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 68 | queue 69 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 70 | queue 71 | 72 | 73 | # enki (change data_name and subject_filepaths_csv to run for different dataset) 74 | data_name = enki 75 | subject_filepaths_csv = enki.paths_cat12.8.csv 76 | 77 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 4 78 | queue 79 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 0 --resample_size 8 80 | queue 81 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 4 82 | queue 83 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 4 --resample_size 8 84 | queue 85 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 4 86 | queue 87 | arguments = test_package_env python3 calculate_features_voxelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/brainmask_12.8.nii --smooth_fwhm 8 --resample_size 8 88 | queue 89 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_173.nii --num_parcels 173 90 | queue 91 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_473.nii --num_parcels 473 92 | queue 93 | arguments = test_package_env python3 calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_873.nii --num_parcels 873 94 | queue 95 | arguments = test_package_env python3 
calculate_features_parcelwise.py --features_path ../data/$(data_name)/ --subject_filepaths ../data/$(data_name)/$(subject_filepaths_csv) --output_prefix $(data_name) --mask_file ../masks/BSF_1273.nii --num_parcels 1273 96 | queue 97 | 98 | 99 | -------------------------------------------------------------------------------- /codes/cross_site_train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import argparse 4 | import pandas as pd 5 | from pathlib import Path 6 | 7 | from brainage import read_data, XGBoostAdapted 8 | 9 | import xgboost as xgb 10 | from skrvm import RVR 11 | from glmnet import ElasticNet 12 | import sklearn.gaussian_process as gp 13 | from sklearn.kernel_ridge import KernelRidge 14 | from sklearn.decomposition import PCA 15 | from sklearn.feature_selection import VarianceThreshold 16 | from sklearn.model_selection import RepeatedStratifiedKFold 17 | 18 | from julearn import run_cross_validation 19 | from julearn.utils import configure_logging 20 | from julearn.transformers import register_transformer 21 | 22 | start_time = time.time() 23 | 24 | def none_or_str(value): 25 | if value == 'None': 26 | return None 27 | return value 28 | 29 | if __name__ == '__main__': 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 32 | parser.add_argument("--features_file", type=str, help="Features file path") 33 | parser.add_argument("--output_path", type=str, help="Path to output directory") 34 | parser.add_argument("--output_prefix", type=str, help="Output prefix (used {dataname}.{featurename}") 35 | parser.add_argument("--models", type=str, nargs='?', const=1, default="ridge", 36 | help="models to use (comma seperated no space): ridge,rf,rvr_linear") 37 | parser.add_argument("--pca_status", type=int, default=0, 38 | help="0: no pca, 1: yes pca") 39 | parser.add_argument("--confounds", type=none_or_str, help="confounds", default=None) 40 | parser.add_argument("--n_jobs", type=int, default=1, help="Number of parallel jobs to run") 41 | 42 | configure_logging(level='INFO') 43 | 44 | # Parse the arguments 45 | args = parser.parse_args() 46 | demographics_file = args.demographics_file 47 | features_file = args.features_file 48 | output_path = Path(args.output_path) 49 | output_prefix = args.output_prefix 50 | model_required = [x.strip() for x in args.models.split(',')] # converts string into list 51 | confounds = args.confounds 52 | pca_status = bool(args.pca_status) 53 | n_jobs = args.n_jobs 54 | output_path.mkdir(exist_ok=True, parents=True) # check and create output directory 55 | 56 | # initialize random seed and create test indices 57 | rand_seed = 200 58 | n_repeats = 5 # for inner CV 59 | n_splits = 5 # how many train and test splits (both for other and inner) 60 | 61 | print('\nDemographics file: ', demographics_file) 62 | print('Features file: ', features_file) 63 | print('Ouput path : ', output_path) 64 | print('Ouput prefix: ', output_prefix) 65 | print('Model:', model_required, type(model_required)) 66 | print('PCA status : ', pca_status) 67 | print('Random seed : ', rand_seed) 68 | print('Num of splits for kfolds : ', n_splits, '\n') 69 | print('confounds:', confounds, type(confounds)) 70 | print('Num of parallel jobs initiated: ', n_jobs, '\n') 71 | 72 | # read the features, demographics and define X and y 73 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 74 | 75 | # register 
VarianceThreshold as a transformer 76 | register_transformer('variancethreshold', VarianceThreshold, returned_features='unknown', apply_to='all_features') 77 | var_threshold = 1e-5 78 | 79 | # Initialize variables, set random seed, create classes for age 80 | scores_cv, models, results = {}, {}, {} 81 | qc = pd.cut(data_df['age'].tolist(), bins=5, precision=1) # create bins for train data only 82 | print('age_bins', qc.categories, 'age_codes', qc.codes) 83 | data_df['bins'] = qc.codes # add bin/classes as a column in train df 84 | 85 | # Define all models and model parameters 86 | rvr_linear = RVR() 87 | rvr_poly = RVR() 88 | kernel_ridge = KernelRidge() # kernelridge 89 | lasso = ElasticNet(alpha=1, standardize=False) 90 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 91 | ridge = ElasticNet(alpha=0, standardize=False) 92 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 93 | pca = PCA(n_components=None) # max as many components as sample size 94 | 95 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb'] 96 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 97 | 98 | model_para_list = [{'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 99 | 'elasticnet__n_jobs': n_jobs}, 100 | 101 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 102 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 'rf__n_jobs':n_jobs, 103 | 'rf__random_state': rand_seed}, 104 | 105 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 106 | 'rvr__random_state': rand_seed}, 107 | 108 | {'variancethreshold__threshold': var_threshold, 109 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 110 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5, 111 | 'search_params': {'n_jobs': n_jobs}}, 112 | 113 | {'variancethreshold__threshold': var_threshold, 114 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100, 115 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 116 | 117 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 118 | 'elasticnet__n_jobs': n_jobs}, 119 | 120 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed, 121 | 'elasticnet__n_jobs': n_jobs}, 122 | 123 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 124 | 'rvr__random_state': rand_seed}, 125 | 126 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 127 | 'xgboostadapted__max_depth': [1, 2, 3, 6, 8], 'xgboostadapted__n_estimators': 100, 128 | 'xgboostadapted__reg_alpha': [0.0001, 0.01, 0.1, 1, 10], 129 | 'xgboostadapted__reg_lambda': [0.0001, 0.01, 0.1, 1, 10, 20], 130 | 'xgboostadapted__random_seed': rand_seed, 'search_params': {'n_jobs': n_jobs}}] 131 | 132 | # Define processing for X (features) 133 | if confounds is None: 134 | if pca_status: 135 | preprocess_X = ['variancethreshold', 'zscore', pca] 136 | else: 137 | preprocess_X = ['variancethreshold', 'zscore'] 138 | else: 139 | if pca_status: 140 | preprocess_X = ['variancethreshold', 'zscore', 'remove_confound', pca] 141 | else: 142 | preprocess_X = ['variancethreshold', 'zscore', 'remove_confound'] 143 | print('Preprocessing includes:', preprocess_X) 144 | 145 | # Get the model, its parameters, pca status 
and train
146 | for ind in range(0, len(model_required)):  # run only for the required models, not all
147 | print('model required index and name:', ind, model_required[ind])
148 | i = model_names.index(model_required[ind])  # find index of the required model in model_names and use this index i to access its params
149 | assert model_required[ind] == model_names[i]  # sanity check
150 | print('model picked from the list', model_names[i], model_list[i], '\n')
151 | 
152 | # initialize dictionaries to save scores and models here so that every model is saved separately
153 | scores_cv, models = {}, {}
154 | 
155 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=rand_seed).split(data_df, data_df.bins)
156 | 
157 | scores, model = run_cross_validation(X=X, y=y, data=data_df, preprocess_X=preprocess_X, confounds=confounds,
158 | problem_type='regression', model=model_list[i], cv=cv,
159 | return_estimator='all', model_params=model_para_list[i], seed=rand_seed,
160 | scoring=
161 | ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'], n_jobs=n_jobs)  # run_cross_validation adapted to accept n_jobs
162 | 
163 | scores_cv[model_names[i]] = scores
164 | 
165 | if model_names[i] == 'kernel_ridge' or model_names[i] == 'xgb':
166 | models[model_names[i]] = model.best_estimator_
167 | print('best model', model.best_estimator_)
168 | print('best para', model.best_params_)
169 | else:
170 | models[model_names[i]] = model
171 | print('best model', model)
172 | 
173 | print('Output file name')
174 | print(output_path / f'{output_prefix}.{model_names[i]}.models')
175 | pickle.dump(models, open(output_path / f'{output_prefix}.{model_names[i]}.models', "wb"))
176 | pickle.dump(scores_cv, open(output_path / f'{output_prefix}.{model_names[i]}.scores', "wb"))
177 | 
178 | print('ALL DONE')
179 | print("--- %s seconds ---" % (time.time() - start_time))
180 | print("--- %s minutes ---" % ((time.time() - start_time)/60))
181 | print("--- %s hours ---" % ((time.time() - start_time)/3600))
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
-------------------------------------------------------------------------------- /codes/within_site_read_results.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import os.path
3 | import argparse
4 | import pandas as pd
5 | import numpy as np
6 | 
7 | # all possible inputs
8 | ## within site
9 | # data_nm = '../results/ixi/ixi_'
10 | # data_nm = '../results/enki/enki_'
11 | # data_nm = '../results/camcan/camcan_'
12 | # data_nm = '../results/1000brains/1000brains_'
13 | 
14 | if __name__ == '__main__':
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--data_nm", type=str, help="Output path and filename prefix for one dataset")
17 | 
18 | args = parser.parse_args()
19 | data_nm = args.data_nm
20 | 
21 | # Filename extensions for saving results
22 | cv_file_ext = 'cv_scores.csv'
23 | test_file_ext = 'test_scores.csv'
24 | combined_file_ext = 'cv_test_scores.csv'
25 | 
26 | # Complete results filepaths
27 | cv_filename = data_nm + cv_file_ext
28 | test_filename = data_nm + test_file_ext
29 | combined_filename = data_nm + combined_file_ext
30 | 
31 | # all model names
32 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly']  # 'xgb'
33 | model_names_new = ['RR', 'RFR', 'RVRlin', 'KRR', 'GPR', 'LR', 'ENR', 'RVRpoly']  # 'XGB'
34 | 
35 | # all feature space names
36 | data_list = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4_pca', 'S4_R4', 'S4_R4_pca', 'S8_R4', 'S8_R4_pca', 37 | 
'S0_R8', 'S0_R8_pca', 'S4_R8', 'S4_R8_pca', 'S8_R8', 'S8_R8_pca']
38 | data_list_new = ['173', '473', '873', '1273', 'S0_R4', 'S0_R4 + PCA', 'S4_R4', 'S4_R4 + PCA', 'S8_R4', 'S8_R4 + PCA',
39 | 'S0_R8', 'S0_R8 + PCA', 'S4_R8', 'S4_R8 + PCA', 'S8_R8', 'S8_R8 + PCA']
40 | 
41 | # check which scores files are missing
42 | missing_outs = []
43 | for data_item in data_list:
44 | for model_item in model_names:
45 | scores_item = data_nm + data_item + '.' + model_item + '.scores'  # create the complete path to the scores file
46 | if os.path.isfile(scores_item):
47 | print('yes')
48 | else:
49 | missing_outs.append(scores_item)
50 | print('Missing files:\n', missing_outs)
51 | 
52 | # get the saved cv scores
53 | df = pd.DataFrame()
54 | df_cv = pd.DataFrame()
55 | for data_item in data_list:
56 | for model_item in model_names:
57 | scores_item = data_nm + data_item + '.' + model_item + '.scores'  # create the complete path to the scores file
58 | if os.path.isfile(scores_item):
59 | print(scores_item)
60 | res = pickle.load(open(scores_item, 'rb'))
61 | df = pd.DataFrame()
62 | for key1, value1 in res.items():
63 | print('key1', key1)
64 | mae_all, mse_all, corr_all, corr_delta_all, key_all = list(), list(), list(), list(), list()
65 | for key, value in value1.items():
66 | mae = round(value['test_neg_mean_absolute_error'].mean() * -1, 3)
67 | mse = round(value['test_neg_mean_squared_error'].mean() * -1, 3)
68 | corr = round(value['test_r2'].mean(), 3)  # note: this is the CV R2 score, stored in the '_corr' columns
69 | mae_all.append(mae)
70 | mse_all.append(mse)
71 | corr_all.append(corr)
72 | key_all.append(key)
73 | 
74 | df['model'] = key_all
75 | df['data'] = len(mae_all) * [data_item]
76 | df[key1 + '_mae'] = mae_all
77 | df[key1 + '_mse'] = mse_all
78 | df[key1 + '_corr'] = corr_all
79 | # print(df)
80 | df_cv = pd.concat([df_cv, df], axis=0)
81 | 
82 | df_cv.reset_index(drop=True, inplace=True)
83 | 
84 | xx_mae = df_cv.loc[:, df_cv.columns.str.endswith('_mae')].values  # to take the average over repeats of mae
85 | xx_mse = df_cv.loc[:, df_cv.columns.str.endswith('_mse')].values  # to take the average over repeats of mse
86 | xx_corr = df_cv.loc[:, df_cv.columns.str.endswith('_corr')].values  # to take the average over repeats of corr
87 | 
88 | df_cv['mean_cv_mae'] = np.mean(xx_mae, axis=1).round(3)
89 | df_cv['mean_cv_mse'] = np.mean(xx_mse, axis=1).round(3)
90 | df_cv['mean_cv_corr'] = np.mean(xx_corr, axis=1).round(3)
91 | 
92 | df_cv['workflow_name'] = df_cv['data'] + ' + ' + df_cv['model']
93 | df_cv['data'] = df_cv['data'].replace(data_list, data_list_new)
94 | df_cv['model'] = df_cv['model'].replace(model_names, model_names_new)
95 | 
96 | 
97 | # get the saved test scores
98 | df = pd.DataFrame()
99 | df_test = pd.DataFrame()
100 | 
101 | for data_item in data_list:
102 | for model_item in model_names:
103 | scores_item = data_nm + data_item + '.' 
+ model_item + '.results'  # create the complete path to the results file
104 | if os.path.isfile(scores_item):
105 | print(scores_item)
106 | res = pickle.load(open(scores_item, 'rb'))
107 | df = pd.DataFrame()
108 | for key1, value1 in res.items():
109 | print('key1', key1)
110 | mae_all, mse_all, corr_all, key_all = list(), list(), list(), list()
111 | for key, value in value1.items():
112 | mae = value['mae']
113 | mse = value['mse']
114 | corr = value['corr']
115 | mae_all.append(mae)
116 | mse_all.append(mse)
117 | corr_all.append(corr)
118 | key_all.append(key)
119 | df['model'] = key_all
120 | df['data'] = len(mae_all) * [data_item]
121 | df[key1 + '_mae'] = mae_all
122 | df[key1 + '_mse'] = mse_all
123 | df[key1 + '_corr'] = corr_all
124 | # print(df)
125 | df_test = pd.concat([df_test, df], axis=0)
126 | 
127 | df_test.reset_index(drop=True, inplace=True)
128 | 
129 | xx_mae = df_test.loc[:, df_test.columns.str.endswith('_mae')].values  # to take the average over repeats of mae
130 | xx_mse = df_test.loc[:, df_test.columns.str.endswith('_mse')].values  # to take the average over repeats of mse
131 | xx_corr = df_test.loc[:, df_test.columns.str.endswith('_corr')].values  # to take the average over repeats of corr
132 | 
133 | df_test['mean_test_mae'] = np.mean(xx_mae, axis=1).round(3)
134 | df_test['mean_test_mse'] = np.mean(xx_mse, axis=1).round(3)
135 | df_test['mean_test_corr'] = np.mean(xx_corr, axis=1).round(3)
136 | 
137 | df_test['workflow_name'] = df_test['data'] + ' + ' + df_test['model']
138 | df_test['data'] = df_test['data'].replace(data_list, data_list_new)
139 | df_test['model'] = df_test['model'].replace(model_names, model_names_new)
140 | 
141 | df_combined1 = df_cv[['model', 'data', 'mean_cv_mae', 'mean_cv_mse', 'mean_cv_corr', 'workflow_name']].copy()
142 | df_combined2 = df_test[['model', 'data', 'mean_test_mae', 'mean_test_mse', 'mean_test_corr', 'workflow_name']].copy()
143 | df_combined = pd.merge(df_combined1, df_combined2, how='left', on=['model', 'data', 'workflow_name'])
144 | df_combined['workflow_name_updated'] = df_combined['data'] + ' + ' + df_combined['model']
145 | df_combined.reset_index(drop=True, inplace=True)
146 | 
147 | # save the csv files
148 | print('\n cv results file:', cv_filename)
149 | print(df_cv)
150 | print('\n test results file:', test_filename)
151 | print(df_test)
152 | print('\n combined results file:', combined_filename)
153 | print(df_combined)
154 | 
155 | df_cv.to_csv(cv_filename, index=False)
156 | df_test.to_csv(test_filename, index=False)
157 | df_combined.to_csv(combined_filename, index=False)
158 | 
159 | 
160 | # check model parameters
161 | print('\n Model Parameters')
162 | error_models = list()
163 | for data_item in data_list:
164 | for model_item in model_names:
165 | model_item = data_nm + data_item + '.' 
+ model_item + '.models' # get models 166 | 167 | if os.path.isfile(model_item): 168 | print('\n', 'model filename', model_item) 169 | 170 | res = pickle.load(open(model_item, 'rb')) 171 | # print(res) 172 | 173 | for key1, value1 in res.items(): 174 | for key2, value2 in value1.items(): 175 | print(key1, key2) 176 | 177 | if key2 == 'linreg': 178 | print(res[key1]['linreg']['linreg'].intercept_, res[key1]['linreg']['linreg'].coef_) 179 | 180 | elif key2 == 'gauss': 181 | model = res[key1]['gauss']['gauss'] 182 | # print(model.get_params()) 183 | print(model.kernel_.get_params()) 184 | 185 | elif key2 == 'kernel_ridge': 186 | model = res[key1]['kernel_ridge']['kernelridge'] 187 | print(model) 188 | 189 | elif key2 == 'rvr_lin': 190 | model = res[key1]['rvr_lin']['rvr'] 191 | print(model) 192 | 193 | elif key2 == 'rvr_poly': 194 | model = res[key1]['rvr_poly']['rvr'] 195 | print(model) 196 | 197 | elif key2 == 'rf': 198 | model = res[key1]['rf']['rf'] 199 | print(model) 200 | 201 | elif key2 == 'xgb': 202 | model = res[key1]['xgb']['xgboostadapted'] 203 | print(model) 204 | 205 | else: # for lasso, ridge, elasticnet 206 | model = res[key1][key2]['elasticnet'] 207 | print(model.lambda_best_) 208 | 209 | else: 210 | error_models.append(model_item) 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /codes/within_site_train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import pickle 4 | import argparse 5 | import numpy as np 6 | import pandas as pd 7 | from pathlib import Path 8 | 9 | from brainage import stratified_splits, read_data, XGBoostAdapted, performance_metric 10 | 11 | import xgboost as xgb 12 | from skrvm import RVR 13 | from glmnet import ElasticNet 14 | import sklearn.gaussian_process as gp 15 | from sklearn.kernel_ridge import KernelRidge 16 | from sklearn.decomposition import PCA 17 | from sklearn.feature_selection import VarianceThreshold 18 | from sklearn.model_selection import RepeatedStratifiedKFold 19 | 20 | from julearn import run_cross_validation 21 | from julearn.utils import configure_logging 22 | from julearn.transformers import register_transformer 23 | 24 | start_time = time.time() 25 | 26 | if __name__ == '__main__': 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--demographics_file", type=str, help="Demographics file path") 29 | parser.add_argument("--features_file", type=str, help="Features file path") 30 | parser.add_argument("--output_path", type=str, help="Path to output directory") 31 | parser.add_argument("--output_prefix", type=str, help="Output prefix (used {dataname}.{featurename}") 32 | parser.add_argument("--models", type=str, nargs='?', const=1, default="ridge", 33 | help="models to use (comma seperated no space): ridge,rf,rvr_linear") 34 | parser.add_argument("--pca_status", type=int, default=0, 35 | help="0: no pca, 1: yes pca") 36 | 37 | configure_logging(level='INFO') 38 | 39 | # Parse the arguments 40 | args = parser.parse_args() 41 | demographics_file = args.demographics_file 42 | features_file = args.features_file 43 | output_path = Path(args.output_path) 44 | output_prefix = args.output_prefix 45 | model_required = [x.strip() for x in args.models.split(',')] # converts string into list 46 | pca_status = bool(args.pca_status) 47 | output_path.mkdir(exist_ok=True, parents=True) # check and create output directory 48 | 49 | # initialize random seed and create test indices 50 | rand_seed = 200 51 | 
n_repeats = 5 # for inner CV 52 | num_splits = 5 # how many train and test splits (both for other and inner) 53 | 54 | print('\nDemographics file: ', demographics_file) 55 | print('Features file: ', features_file) 56 | print('Ouput path : ', output_path) 57 | print('Ouput prefix: ', output_prefix) 58 | print('Model : ', model_required) 59 | print('PCA status : ', pca_status) 60 | print('Random seed : ', rand_seed) 61 | print('Num of splits for kfolds : ', num_splits, '\n') 62 | 63 | # read the features, demographics and define X and y 64 | data_df, X, y = read_data(features_file=features_file, demographics_file=demographics_file) 65 | 66 | # register VarianceThreshold as a transformer 67 | register_transformer('variancethreshold', VarianceThreshold, returned_features='unknown', apply_to='all_features') 68 | var_threshold = 1e-5 69 | 70 | # Create stratified splits for outer CV 71 | num_bins = math.floor(len(data_df)/num_splits) # num of bins to be created = num of labels created 72 | test_indices = stratified_splits(bins_on=data_df.index, num_bins=num_bins, data=data_df, num_splits=num_splits, 73 | shuffle=False, random_state=None) # creates dictionary of test indices 74 | 75 | # Define all models and model parameters 76 | rvr_linear = RVR() 77 | rvr_poly = RVR() 78 | kernel_ridge = KernelRidge() 79 | lasso = ElasticNet(alpha=1, standardize=False) 80 | elasticnet = ElasticNet(alpha=0.5, standardize=False) 81 | ridge = ElasticNet(alpha=0, standardize=False) 82 | xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2) 83 | pca = PCA(n_components=None) # max as many components as sample size 84 | 85 | model_names = ['ridge', 'rf', 'rvr_lin', 'kernel_ridge', 'gauss', 'lasso', 'elasticnet', 'rvr_poly', 'xgb'] 86 | model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb] 87 | model_para_list = [{'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 88 | 89 | {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse', 90 | 'rf__max_features': 0.33, 'rf__min_samples_leaf': 5, 91 | 'rf__random_state': rand_seed}, 92 | 93 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear', 94 | 'rvr__random_state': rand_seed}, 95 | 96 | {'variancethreshold__threshold': var_threshold, 97 | 'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0], 98 | 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5}, 99 | 100 | {'variancethreshold__threshold': var_threshold, 101 | 'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100, 102 | 'gauss__normalize_y': True, 'gauss__random_state': rand_seed}, 103 | 104 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 105 | 106 | {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed}, 107 | 108 | {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1, 109 | 'rvr__random_state': rand_seed}, 110 | 111 | {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1, 112 | 'xgboostadapted__max_depth': [6, 8, 10, 12], 'xgboostadapted__n_estimators': 100, 113 | 'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2], 114 | 'xgboostadapted__random_seed': rand_seed, 'cv': 5}] # 'search_params':{'n_jobs': 5} 115 | 116 | # Define processing for X (features) 117 | if pca_status: 118 | preprocess_X = ['variancethreshold', 'zscore', pca] 119 
| else: 120 | preprocess_X = ['variancethreshold', 'zscore'] 121 | print('Preprocessing includes:', preprocess_X) 122 | 123 | # Get the model, its parameters, pca status and train 124 | for ind in range(0, len(model_required)): 125 | print('model required:', model_required[ind]) 126 | i = model_names.index(model_required[ind]) 127 | assert model_required[ind] == model_names[i] # sanity check 128 | print('model picked from the list', model_names[i], model_list[i], '\n') 129 | 130 | # initialize dictionaries to save scores, models and results here to save every model separately 131 | scores_cv = {k: {} for k in test_indices.keys()} 132 | models = {k: {} for k in test_indices.keys()} 133 | results = {k: {} for k in test_indices.keys()} 134 | 135 | for repeat_key in test_indices.keys(): 136 | all_idx = np.array(range(0, len(data_df))) 137 | print('\n \n--Repeat', repeat_key) 138 | test_idx = test_indices[repeat_key] # get test indices 139 | train_idx = np.delete(all_idx, test_idx) # get train indices 140 | train_df, test_df = data_df.loc[train_idx,:], data_df.loc[test_idx,:] # get test and train dataframes 141 | train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True) 142 | print('train size:', train_df.shape, 'test size:', test_df.shape) 143 | qc = pd.cut(train_df[y].tolist(), bins=5) # create bins for only train set using age, use this for stratification 144 | # print('age_bins', qc.categories, 'age_codes', qc.codes) 145 | 146 | cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=n_repeats, random_state=rand_seed).split(train_df, qc.codes) 147 | 148 | scores, model = run_cross_validation(X=X, y=y, data=train_df, preprocess_X=preprocess_X, 149 | problem_type='regression', model=model_list[i], cv=cv, 150 | return_estimator='final', model_params=model_para_list[i], seed=rand_seed, 151 | scoring= 152 | ['neg_mean_absolute_error', 'neg_mean_squared_error','r2']) 153 | 154 | scores_cv[repeat_key][model_names[i]] = scores 155 | 156 | if model_names[i] == 'kernel_ridge' or model_names[i] == 'xgb': 157 | models[repeat_key][model_names[i]] = model.best_estimator_ 158 | print('best model', model.best_estimator_) 159 | print('best para', model.best_params_) 160 | else: 161 | models[repeat_key][model_names[i]] = model 162 | print('best model', model) 163 | 164 | # Predict on test split 165 | y_true = test_df[y] 166 | y_pred = model.predict(test_df[X]).ravel() 167 | y_delta = y_true - y_pred 168 | print(y_true.shape, y_pred.shape) 169 | 170 | mae, mse, corr = performance_metric(y_true, y_pred) 171 | print('MAE:', mae, 'MSE:', mse, 'CoRR', corr) 172 | results[repeat_key][model_names[i]] = {'predictions': y_pred, 'true': y_true, 'test_idx': test_idx, 173 | 'delta': y_delta, 'mae': mae, 'mse': mse, 'corr': corr} 174 | 175 | print('Output file name') 176 | print(output_path / f'{output_prefix}.{model_names[i]}.models') 177 | pickle.dump(results, open(output_path / f'{output_prefix}.{model_names[i]}.results', "wb")) 178 | pickle.dump(scores_cv, open(output_path / f'{output_prefix}.{model_names[i]}.scores', "wb")) 179 | pickle.dump(models, open(output_path / f'{output_prefix}.{model_names[i]}.models', "wb")) 180 | 181 | print('ALL DONE') 182 | print("--- %s seconds ---" % (time.time() - start_time)) 183 | print("--- %s minutes ---" % ((time.time() - start_time)/60)) 184 | print("--- %s hours ---" % ((time.time() - start_time)/3600)) 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 
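For orientation, here is a minimal, hypothetical sketch of how the pickles written by within_site_train.py above could be inspected after a run. The prefix used below is a placeholder; real files follow the {output_prefix}.{model}.results / .scores / .models pattern from the script, and the repeat keys are whatever stratified_splits produced.

# Hypothetical post-hoc inspection of within_site_train.py outputs (paths are placeholders)
import pickle

prefix = '../results/ixi/ixi_S4_R4'  # placeholder for the output_path / output_prefix of a run

with open(f'{prefix}.ridge.results', 'rb') as f:
    results = pickle.load(f)  # dict: repeat key -> model name -> predictions and metrics

for repeat_key, per_model in results.items():
    for model_name, res in per_model.items():
        # 'mae', 'mse' and 'corr' come from performance_metric() on the held-out split
        print(repeat_key, model_name, 'MAE:', res['mae'], 'MSE:', res['mse'], 'corr:', res['corr'])

with open(f'{prefix}.ridge.models', 'rb') as f:
    models = pickle.load(f)  # dict: repeat key -> model name -> fitted julearn pipeline
# a stored pipeline can predict on new rows that have the same feature columns, e.g.
# models[some_repeat_key]['ridge'].predict(new_df[X])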
-------------------------------------------------------------------------------- /codes/cat_standalone_batch-HiFi1mm.m: -------------------------------------------------------------------------------- 1 | % Batch file for CAT12 segmentation, mean GM ROI extraction to csv, 2 | % TIV & tissue volume extraction for SPM12 standalone installation 3 | % 4 | % ** eats GZIP for real now ** 5 | % 6 | % ----> CREATE INDIVIDUAL OUTPUT FOLDER BEFORE <---- 7 | % 8 | % input: 9 | % <*T1w.nii.gz> 10 | %_______________________________________________________________________ 11 | % $Id: cat_standalone_batch.m r1871 12 | %----------------------------------------------------------------------- 13 | 14 | % Used CAT12.8 r1871 15 | 16 | % INPUT FILE 17 | matlabbatch{1}.spm.tools.cat.estwrite.data(1) = ''; 18 | matlabbatch{1}.spm.tools.cat.estwrite.data_wmh = {''}; 19 | matlabbatch{1}.spm.tools.cat.estwrite.nproc = 0; 20 | matlabbatch{1}.spm.tools.cat.estwrite.useprior = ''; 21 | % Remove comments if you would like to change TPM by using additional arguments in cat_standalone.sh 22 | % or change this field manually by editing the "'}; 26 | 27 | % Affine regularisation (SPM12 default = mni) - '';'mni';'eastern';'subj';'none';'rigid' 28 | matlabbatch{1}.spm.tools.cat.estwrite.opts.affreg = 'mni'; 29 | 30 | % Strength of the bias correction that controls the biasreg and biasfwhm parameter (CAT only!) 31 | % 0 - use SPM parameter; eps - ultralight, 0.25 - light, 0.5 - medium, 0.75 - strong, and 1 - heavy corrections 32 | % job.opts.biasreg = min( 10 , max( 0 , 10^-(job.opts.biasstr*2 + 2) )); 33 | % job.opts.biasfwhm = min( inf , max( 30 , 30 + 60*job.opts.biasstr )); 34 | matlabbatch{1}.spm.tools.cat.estwrite.opts.biasstr = 0.5; 35 | %Overview of parameters: accstr: 0.50 0.75 1.00 samp: 3.00 2.00 1.00 (in mm) tol: 1e-4 1e-8 1e-16SPM default is samp 36 | matlabbatch{1}.spm.tools.cat.estwrite.opts.accstr = 0.8; 37 | % Use center-of-mass to roughly correct for differences in the position between image and template. This will internally correct the origin. 
38 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.setCOM = 1; 39 | % Affine PreProcessing (APP) with rough bias correction and brain extraction for special anatomies (nonhuman/neonates) 40 | % 0 - none; 1070 - default; [1 - light; 2 - full; 1144 - update of 1070, 5 - animal (no affreg)] 41 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.APP = 1070; 42 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.affmod = 0; 43 | % Strength of the noise correction: 0 to 1; 0 - no filter, -Inf - auto, 1 - full, 2 - ISARNLM (else SANLM), default -Inf 44 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.NCstr = -Inf; 45 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.spm_kamap = 0; 46 | % Strength of the local adaption: 0 to 1; default 0.5 47 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.LASstr = 0.5; 48 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.LASmyostr = 0; 49 | % Strength of skull-stripping: 0 - SPM approach; eps to 1 - gcut; 2 - new APRG approach; -1 - no skull-stripping (already skull-stripped); default = 2 50 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.gcutstr = 2; 51 | % Strength of the cleanup process: 0 to 1; default 0.5 52 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.cleanupstr = 0.5; 53 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.BVCstr = 0.5; 54 | % Correction of WM hyperintensities: 0 - no correction, 1 - only for Dartel/Shooting 55 | % 2 - also correct segmentation (to WM), 3 - handle as separate class; default 1 56 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.WMHC = 2; 57 | % Stroke lesion correction (SLC): 0 - no correction, 1 - handling of manual lesion that have to be set to zero! 58 | % 2 - automatic lesion detection (in development) 59 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.SLC = 0; 60 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.mrf = 1; 61 | % % resolution handling: 'native','fixed','best', 'optimal' 62 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.segmentation.restypes.optimal = [1 0.1]; 63 | % Remove comments and edit entry if you would like to change the Dartel/Shooting approach 64 | % Otherwise the default value from cat_defaults.m is used. 
65 | % entry for choosing shooting approach 66 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.shootingtpm = {fullfile('/', 'templates_1mm', 'Template_0_GS1mm.nii')}; 67 | %matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.shootingtpm = {fullfile(spm('dir'),'toolbox','cat12','templates_MNI152NLin2009cAsym','Template_1_GS.nii')}; 68 | % entry for choosing dartel approach 69 | %matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.dartel.darteltpm = {fullfile(spm('dir'),'toolbox','cat12','templates_MNI152NLin2009cAsym','Template_1_Dartel.nii')}; 70 | 71 | % Strength of Shooting registration: 0 - Dartel, eps (fast), 0.5 (default) to 1 (accurate) optimized Shooting, 4 - default Shooting; default 0.5 72 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.regmethod.shooting.regstr = 1; 73 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.vox = 1; 74 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.registration.bb = 45; 75 | 76 | % surface and thickness creation: 0 - no (default), 1 - lh+rh, 2 - lh+rh+cerebellum, 77 | % 3 - lh, 4 - rh, 5 - lh+rh (fast, no registration, only for quick quality check and not for analysis), 78 | % 6 - lh+rh+cerebellum (fast, no registration, only for quick quality check and not for analysis) 79 | % 9 - thickness only (for ROI analysis, experimental!) 80 | % +10 to estimate WM and CSF width/depth/thickness (experimental!) 81 | matlabbatch{1}.spm.tools.cat.estwrite.output.surface = 1; 82 | matlabbatch{1}.spm.tools.cat.estwrite.output.surf_measures = 1; 83 | % surface options 84 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.pbtres = 0.5; 85 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.pbtmethod = 'pbt2x'; 86 | % surface recontruction pipeline & self-intersection correction: 0/1 - CS1 without/with/with-optimized SIC; 20/21/22 - CS2 without/with/with-optimized SIC; 87 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.SRP = 22; 88 | % optimize surface sampling: 0 - PBT res. (slow); 1 - optimal res. (default); 2 - internal res.; 3 - SPM init; 4 - MATLAB init; 5 - SPM full; 89 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.reduce_mesh = 1; 90 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.vdist = 2; 91 | % % reduce myelination effects (experimental, not yet working properly!) 
92 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.pbtlas = 0; 93 | % % distance method for estimating thickness: 1 - Tfs: Freesurfer method using mean(Tnear1,Tnear2) (default in 12.7+); 0 - Tlink: linked distance (used before 12.7) 94 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.thick_measure = 1; 95 | % % upper limit for Tfs thickness measure similar to Freesurfer (only valid if cat.extopts.thick_measure is set to "1" 96 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.thick_limit = 5; 97 | 98 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.scale_cortex = 0.7; 99 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.add_parahipp = 0.1; 100 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.surface.close_parahipp = 1; 101 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.experimental = 0; 102 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.new_release = 0; 103 | % set this to 1 for skipping preprocessing if already processed data exist 104 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.lazy = 0; 105 | % catch errors: 0 - stop with error (default); 1 - catch preprocessing errors (requires MATLAB 2008 or higher); 106 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.ignoreErrors = 1; 107 | % verbose output: 1 - default; 2 - details; 3 - write debugging files 108 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.verb = 2; 109 | % display and print out pdf-file of results: 0 - off, 1 - volume only, 2 - volume and surface (default) 110 | matlabbatch{1}.spm.tools.cat.estwrite.extopts.admin.print = 2; 111 | matlabbatch{1}.spm.tools.cat.estwrite.output.BIDS.BIDSno = 1; 112 | 113 | % define here volume atlases 114 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.neuromorphometrics = 1; 115 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.lpba40 = 1; 116 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.cobra = 1; 117 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.hammers = 1; 118 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.ibsr = 1; 119 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.aal3 = 1; 120 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.mori = 1; 121 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.anatomy3 = 1; 122 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.julichbrain = 1; 123 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_100Parcels_17Networks_order = 1; 124 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_200Parcels_17Networks_order = 1; 125 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_400Parcels_17Networks_order = 1; 126 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.Schaefer2018_600Parcels_17Networks_order = 1; 127 | matlabbatch{1}.spm.tools.cat.estwrite.output.ROImenu.atlases.ownatlas = {''}; 128 | 129 | % % { name fileid GUIlevel use } - in development 130 | % matlabbatch{1}.spm.tools.cat.estwrite.extopts.satlas = { ... 131 | % 'Desikan' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_a2009s.freesurfer.annot') 1 1; 132 | % 'Destrieux' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_DK40.freesurfer.annot') 1 1; 133 | % 'HCP' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.aparc_HCP_MMP1.freesurfer.annot') 1 1; 134 | % ... Schaefer atlases ... 
135 | % 'Schaefer2018_100P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_100Parcels_17Networks_order.annot') 1 1; 136 | % 'Schaefer2018_200P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_200Parcels_17Networks_order.annot') 1 1; 137 | % 'Schaefer2018_400P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_400Parcels_17Networks_order.annot') 1 1; 138 | % 'Schaefer2018_600P_17N' fullfile(spm('dir'),'toolbox','cat12','atlases_surfaces','lh.Schaefer2018_600Parcels_17Networks_order.annot') 1 1; 139 | % }; 140 | 141 | 142 | % Writing options (see cat_defaults for the description of parameters) 143 | % native 0/1 (none/yes) 144 | % warped 0/1 (none/yes) 145 | % mod 0/1/2/3 (none/affine+nonlinear/nonlinear only/both) 146 | % dartel 0/1/2/3 (none/rigid/affine/both) 147 | 148 | % GM/WM/CSF/WMH 149 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.native = 0; 150 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.warped = 0; 151 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.mod = 3; 152 | matlabbatch{1}.spm.tools.cat.estwrite.output.GM.dartel = 0; 153 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.native = 0; 154 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.warped = 0; 155 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.mod = 0; 156 | matlabbatch{1}.spm.tools.cat.estwrite.output.WM.dartel = 0; 157 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.native = 0; 158 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.warped = 0; 159 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.mod = 0; 160 | matlabbatch{1}.spm.tools.cat.estwrite.output.CSF.dartel = 0; 161 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.native = 0; 162 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.warped = 0; 163 | matlabbatch{1}.spm.tools.cat.estwrite.output.ct.dartel = 0; 164 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.native = 0; 165 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.warped = 0; 166 | matlabbatch{1}.spm.tools.cat.estwrite.output.pp.dartel = 0; 167 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.native = 0; 168 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.warped = 0; 169 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.mod = 0; 170 | matlabbatch{1}.spm.tools.cat.estwrite.output.WMH.dartel = 0; 171 | 172 | % stroke lesion tissue maps (only for opt.extopts.SLC>0) - in development 173 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.native = 0; 174 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.warped = 0; 175 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.mod = 0; 176 | matlabbatch{1}.spm.tools.cat.estwrite.output.SL.dartel = 0; 177 | 178 | % Tissue classes 4-6 to create own TPMs 179 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.native = 0; 180 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.warped = 0; 181 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.mod = 0; 182 | matlabbatch{1}.spm.tools.cat.estwrite.output.TPMC.dartel = 0; 183 | 184 | % atlas maps (for evaluation) 185 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.native = 1; 186 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.warped = 0; 187 | matlabbatch{1}.spm.tools.cat.estwrite.output.atlas.dartel = 0; 188 | 189 | % label 190 | % background=0, CSF=1, GM=2, WM=3, WMH=4 (if opt.extopts.WMHC==3), SL=1.5 (if opt.extopts.SLC>0)matlabbatch{1}.spm.tools.cat.estwrite.output.label.native = 0; 191 | matlabbatch{1}.spm.tools.cat.estwrite.output.label.native = 1; 192 | 
matlabbatch{1}.spm.tools.cat.estwrite.output.label.warped = 1; 193 | matlabbatch{1}.spm.tools.cat.estwrite.output.label.dartel = 0; 194 | 195 | % bias and noise corrected, global intensity normalized 196 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.native = 0; 197 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.warped = 0; 198 | matlabbatch{1}.spm.tools.cat.estwrite.output.bias.dartel = 0; 199 | 200 | % bias and noise corrected, (locally - if LAS>0) intensity normalized 201 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.native = 0; 202 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.warped = 0; 203 | matlabbatch{1}.spm.tools.cat.estwrite.output.las.dartel = 0; 204 | 205 | % jacobian determinant 0/1 (none/yes) 206 | matlabbatch{1}.spm.tools.cat.estwrite.output.jacobianwarped = 1; 207 | 208 | % deformations, order is [forward inverse] 209 | matlabbatch{1}.spm.tools.cat.estwrite.output.warps = [1 1]; 210 | 211 | % deformation matrices (affine and rigid) 212 | matlabbatch{1}.spm.tools.cat.estwrite.output.rmat = 1; 213 | --------------------------------------------------------------------------------
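The CAT12 batch above writes, among other outputs, modulated warped grey-matter segments, which are presumably the inputs listed in the *.paths_cat12.8.csv files consumed by the calculate_features_* scripts earlier in the repository. As a rough, hypothetical illustration only (this is not the repository's calculate_features_voxelwise.py; the function, defaults and file names below are assumptions), one such GM image could be smoothed, resampled and masked into a voxelwise feature vector with nilearn roughly like this:

# Hypothetical sketch: one CAT12 GM segment -> voxelwise feature vector
# (mirrors the --smooth_fwhm / --resample_size / --mask_file options used in the submit files)
import numpy as np
from nilearn import image, masking

def gm_to_features(gm_file, mask_file, smooth_fwhm=4, resample_size=8):
    img = image.smooth_img(gm_file, fwhm=smooth_fwhm)  # Gaussian smoothing in mm
    img = image.resample_img(img, target_affine=np.eye(3) * resample_size)  # resample to isotropic voxels
    mask = image.resample_to_img(mask_file, img, interpolation='nearest')  # bring the mask onto the same grid
    mask = image.math_img('img > 0.5', img=mask)  # re-binarize the mask after resampling
    return masking.apply_mask(img, mask)  # 1D array of GM values inside the mask

# Example call (placeholder filenames):
# features = gm_to_features('mwp1sub-01_T1w.nii', '../masks/brainmask_12.8.nii',
#                           smooth_fwhm=8, resample_size=8)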