├── .gitignore ├── LICENSE ├── README.md ├── checkpoints └── .gitignore ├── data ├── processed │ └── .gitignore └── raw │ └── .gitignore ├── environment.yaml ├── main.py ├── multiprocess_bench.py ├── notebooks ├── .gitignore └── downstream_results_exploration.ipynb ├── outputs └── .gitignore ├── pretraining_resources ├── configs │ ├── phase1 │ │ ├── phase1_base.yaml │ │ └── phase1_large.yaml │ └── phase2 │ │ ├── _phase2_bio_base.yaml │ │ ├── _phase2_bio_large.yaml │ │ ├── phase2_base.yaml │ │ └── phase2_large.yaml └── convert_dataset_to_mds.py ├── results ├── .gitignore └── aggregate_results.py ├── scripts └── run_parallel.sh └── src ├── dataloader └── dataloader.py ├── metrics └── metrics.py └── tasks ├── mask_classification.py ├── sequence_classification.py └── token_classification.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/README.md -------------------------------------------------------------------------------- /checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/processed/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /data/raw/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/environment.yaml -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/main.py -------------------------------------------------------------------------------- /multiprocess_bench.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/multiprocess_bench.py -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | results.ipynb -------------------------------------------------------------------------------- /notebooks/downstream_results_exploration.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/notebooks/downstream_results_exploration.ipynb -------------------------------------------------------------------------------- /outputs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /pretraining_resources/configs/phase1/phase1_base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase1/phase1_base.yaml -------------------------------------------------------------------------------- /pretraining_resources/configs/phase1/phase1_large.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase1/phase1_large.yaml -------------------------------------------------------------------------------- /pretraining_resources/configs/phase2/_phase2_bio_base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase2/_phase2_bio_base.yaml -------------------------------------------------------------------------------- /pretraining_resources/configs/phase2/_phase2_bio_large.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase2/_phase2_bio_large.yaml -------------------------------------------------------------------------------- /pretraining_resources/configs/phase2/phase2_base.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase2/phase2_base.yaml -------------------------------------------------------------------------------- /pretraining_resources/configs/phase2/phase2_large.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/configs/phase2/phase2_large.yaml -------------------------------------------------------------------------------- /pretraining_resources/convert_dataset_to_mds.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/pretraining_resources/convert_dataset_to_mds.py -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !aggregate_results.py -------------------------------------------------------------------------------- /results/aggregate_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/results/aggregate_results.py -------------------------------------------------------------------------------- /scripts/run_parallel.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/scripts/run_parallel.sh -------------------------------------------------------------------------------- /src/dataloader/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/src/dataloader/dataloader.py -------------------------------------------------------------------------------- /src/metrics/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/src/metrics/metrics.py -------------------------------------------------------------------------------- /src/tasks/mask_classification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/src/tasks/mask_classification.py -------------------------------------------------------------------------------- /src/tasks/sequence_classification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/src/tasks/sequence_classification.py -------------------------------------------------------------------------------- /src/tasks/token_classification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindvalllab/BioClinical-ModernBERT/HEAD/src/tasks/token_classification.py --------------------------------------------------------------------------------